revision-id: 648cf7176cc95f697abd8b94e860c74768680298 (mariadb-galera-10.0.34-6-g648cf7176cc) parent(s): 7b115181987fb88b97ef6d3d88bb16bdbc281e40 1ecd68d867ced1d00ebffdcedbf6bc97493f5067 author: Jan Lindström committer: Jan Lindström timestamp: 2018-05-07 13:49:14 +0300 message: Merge remote-tracking branch 'origin/5.5-galera' into 10.0-galera .gitignore | 1 + extra/yassl/src/handshake.cpp | 10 + include/heap.h | 1 + include/my_valgrind.h | 2 + include/mysql_com.h | 2 +- include/sql_common.h | 2 +- mysql-test/mysql-test-run.pl | 2 +- mysql-test/r/connect_debug.result | 5 + mysql-test/r/ctype_ucs.result | 31 +++ mysql-test/r/ctype_utf8mb4.result | 23 ++ mysql-test/r/func_misc.result | 11 + mysql-test/r/join_outer.result | 18 +- mysql-test/r/join_outer_jcl6.result | 18 +- mysql-test/r/mysqld--help.result | 4 +- mysql-test/r/parser.result | 7 + mysql-test/r/ps_qc_innodb.result | 23 ++ mysql-test/r/subselect4.result | 35 ++- mysql-test/r/subselect_mat.result | 15 + mysql-test/r/view.result | 305 ++++++++++++-------- mysql-test/suite/galera/disabled.def | 2 + mysql-test/suite/galera/r/MW-416.result | 114 ++++++++ mysql-test/suite/galera/r/galera_defaults.result | 2 +- .../suite/galera/r/galera_var_dirty_reads.result | 2 - mysql-test/suite/galera/t/MW-416.test | 134 +++++++++ .../suite/galera/t/galera_concurrent_ctas.test | 8 +- .../suite/galera/t/galera_var_dirty_reads.test | 9 +- .../suite/innodb/r/innodb-replace-debug.result | 5 +- .../suite/innodb/t/innodb-replace-debug.test | 5 +- mysql-test/suite/maria/dynamic.result | 4 + mysql-test/suite/maria/dynamic.test | 7 + .../suite/parts/r/partition_alter_maria.result | 9 + .../suite/parts/t/partition_alter_maria.test | 7 + mysql-test/suite/plugins/t/server_audit.test | 2 + mysql-test/suite/wsrep/r/variables.result | 7 + mysql-test/suite/wsrep/t/variables.test | 14 + mysql-test/t/connect_debug.test | 12 + mysql-test/t/ctype_ucs.test | 22 ++ mysql-test/t/ctype_utf8mb4.test | 19 ++ mysql-test/t/func_misc.test | 12 + 
mysql-test/t/join_outer.test | 18 +- mysql-test/t/parser.test | 9 + mysql-test/t/ps_qc_innodb.test | 35 +++ mysql-test/t/subselect4.test | 31 +++ mysql-test/t/subselect_mat.test | 13 + mysql-test/t/view.test | 308 +++++++++++++-------- mysys/lf_hash.c | 9 +- mysys/mf_iocache.c | 2 +- mysys/my_addr_resolve.c | 2 +- mysys/my_symlink.c | 2 +- policy/selinux/mariadb-server.fc | 2 +- policy/selinux/mariadb-server.te | 2 +- scripts/CMakeLists.txt | 16 ++ scripts/wsrep_sst_xtrabackup-v2.sh | 2 +- sql-common/client.c | 12 +- sql/event_data_objects.cc | 21 +- sql/event_db_repository.cc | 5 +- sql/events.cc | 15 + sql/handler.cc | 6 + sql/item_cmpfunc.h | 5 + sql/item_func.h | 4 +- sql/item_strfunc.h | 2 + sql/item_subselect.cc | 2 +- sql/log.cc | 8 +- sql/log_event.cc | 69 ++++- sql/log_event_old.cc | 3 +- sql/mysqld.cc | 5 +- sql/mysqld.h | 3 +- sql/opt_subselect.cc | 9 +- sql/slave.cc | 2 +- sql/sp.cc | 6 +- sql/sql_acl.cc | 3 +- sql/sql_admin.cc | 2 +- sql/sql_base.h | 2 + sql/sql_cache.cc | 1 + sql/sql_class.cc | 7 +- sql/sql_class.h | 1 + sql/sql_insert.cc | 29 ++ sql/sql_parse.cc | 14 +- sql/sql_partition.cc | 2 +- sql/sql_plugin.cc | 33 ++- sql/sql_prepare.cc | 6 +- sql/sql_priv.h | 4 +- sql/sql_table.cc | 2 +- sql/sql_trigger.cc | 6 + sql/sql_truncate.cc | 2 +- sql/sql_update.cc | 2 + sql/sql_view.cc | 5 + sql/sql_yacc.yy | 5 + sql/sys_vars.cc | 6 +- sql/table.cc | 16 +- sql/table.h | 8 +- sql/wsrep_hton.cc | 38 ++- sql/wsrep_mysqld.cc | 65 ++--- sql/wsrep_mysqld.h | 13 + sql/wsrep_priv.h | 2 +- sql/wsrep_sst.cc | 1 - sql/wsrep_thd.cc | 2 +- sql/wsrep_utils.cc | 1 - storage/heap/_check.c | 2 +- storage/heap/ha_heap.cc | 11 +- storage/heap/hp_create.c | 8 +- storage/heap/hp_delete.c | 2 +- storage/heap/hp_rrnd.c | 4 +- storage/heap/hp_rsame.c | 2 +- storage/heap/hp_scan.c | 2 +- storage/heap/hp_write.c | 4 +- storage/innobase/handler/ha_innodb.cc | 73 ++--- storage/innobase/os/os0file.cc | 10 +- storage/maria/ma_control_file.c | 2 +- storage/maria/ma_dynrec.c | 10 +- 
storage/maria/ma_loghandler.c | 6 +- storage/maria/ma_open.c | 8 +- storage/myisam/mi_open.c | 8 +- storage/xtradb/handler/ha_innodb.cc | 20 +- storage/xtradb/log/log0online.cc | 3 - storage/xtradb/os/os0file.cc | 12 +- support-files/mysql.server.sh | 2 - support-files/wsrep.cnf.sh | 3 + 118 files changed, 1536 insertions(+), 473 deletions(-) diff --cc mysql-test/r/ctype_ucs.result index 6520694a804,1c9e31d3a06..59d88414cab --- a/mysql-test/r/ctype_ucs.result +++ b/mysql-test/r/ctype_ucs.result @@@ -4590,1014 -4397,36 +4590,1045 @@@ Field Type Null Key Default Extr c1 mediumtext YES NULL DROP TABLE t1; # + # MDEV-15624 Changing the default character set to utf8mb4 changes query evaluation in a very surprising way + # + SET NAMES utf8; + CREATE TABLE t1 (id INT); + INSERT INTO t1 VALUES (1),(2),(3); + SELECT COUNT(DISTINCT c) FROM (SELECT id, REPLACE(uuid_short(), '0', CAST('o' AS CHAR CHARACTER SET ucs2)) AS c FROM t1) AS d1; + COUNT(DISTINCT c) + 3 + SELECT DISTINCT REPLACE(uuid_short(), '0', CAST('o' AS CHAR CHARACTER SET ucs2)) AS c FROM t1; + c + xxxxxxxxxxxxxxxxx + xxxxxxxxxxxxxxxxx + xxxxxxxxxxxxxxxxx + SELECT COUNT(DISTINCT c) FROM (SELECT id, INSERT(uuid_short(), 1, 1, CAST('0' AS CHAR CHARACTER SET ucs2)) AS c FROM t1) AS d1; + COUNT(DISTINCT c) + 3 + SELECT DISTINCT INSERT(uuid_short(), 1, 1, CAST('0' AS CHAR CHARACTER SET ucs2)) AS c FROM t1; + c + xxxxxxxxxxxxxxxxx + xxxxxxxxxxxxxxxxx + xxxxxxxxxxxxxxxxx + SELECT COUNT(DISTINCT c) FROM (SELECT id, CONCAT(uuid_short(), CAST('0' AS CHAR CHARACTER SET ucs2)) AS c FROM t1) AS d1; + COUNT(DISTINCT c) + 3 + SELECT DISTINCT CONCAT(uuid_short(), CAST('0' AS CHAR CHARACTER SET ucs2)) AS c FROM t1; + c + xxxxxxxxxxxxxxxxx + xxxxxxxxxxxxxxxxx + xxxxxxxxxxxxxxxxx + DROP TABLE t1; + # # End of 5.5 tests # +# +# Start of 5.6 tests +# +# +# WL#3664 WEIGHT_STRING +# +set collation_connection=ucs2_general_ci; +select @@collation_connection; +@@collation_connection +ucs2_general_ci +CREATE TABLE t1 AS SELECT 'a' AS a; 
+SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` varchar(1) CHARACTER SET ucs2 NOT NULL DEFAULT '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +CREATE TABLE t2 AS SELECT WEIGHT_STRING(a) AS ws FROM t1; +SHOW CREATE TABLE t2; +Table Create Table +t2 CREATE TABLE `t2` ( + `ws` varbinary(2) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +SELECT HEX(WEIGHT_STRING(a)) FROM t1; +HEX(WEIGHT_STRING(a)) +0041 +SELECT HEX(ws) FROM t2; +HEX(ws) +0041 +DROP TABLE t2; +DROP TABLE t1; +CREATE TABLE t1 AS SELECT REPEAT('a',5) AS a; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` varchar(5) CHARACTER SET ucs2 NOT NULL DEFAULT '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +CREATE TABLE t2 AS SELECT WEIGHT_STRING(a) AS ws FROM t1; +SHOW CREATE TABLE t2; +Table Create Table +t2 CREATE TABLE `t2` ( + `ws` varbinary(10) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +SELECT HEX(WEIGHT_STRING(a)) FROM t1; +HEX(WEIGHT_STRING(a)) +00410041004100410041 +SELECT HEX(ws) FROM t2; +HEX(ws) +00410041004100410041 +DROP TABLE t2; +CREATE TABLE t2 AS SELECT WEIGHT_STRING(a AS CHAR(3)) AS ws FROM t1; +SHOW CREATE TABLE t2; +Table Create Table +t2 CREATE TABLE `t2` ( + `ws` varbinary(6) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +SELECT HEX(WEIGHT_STRING(a AS CHAR(3))) FROM t1; +HEX(WEIGHT_STRING(a AS CHAR(3))) +004100410041 +SELECT HEX(ws) FROM t2; +HEX(ws) +004100410041 +DROP TABLE t2; +CREATE TABLE t2 AS SELECT WEIGHT_STRING(a AS CHAR(10)) AS ws FROM t1; +SHOW CREATE TABLE t2; +Table Create Table +t2 CREATE TABLE `t2` ( + `ws` varbinary(20) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +SELECT HEX(WEIGHT_STRING(a AS CHAR(10))) FROM t1; +HEX(WEIGHT_STRING(a AS CHAR(10))) +0041004100410041004100200020002000200020 +SELECT HEX(ws) FROM t2; +HEX(ws) +0041004100410041004100200020002000200020 +DROP TABLE t2; +DROP TABLE t1; +select hex(weight_string('a')); +hex(weight_string('a')) +0041 +select hex(weight_string('A')); 
+hex(weight_string('A')) +0041 +select hex(weight_string('abc')); +hex(weight_string('abc')) +004100420043 +select hex(weight_string('abc' as char(2))); +hex(weight_string('abc' as char(2))) +00410042 +select hex(weight_string('abc' as char(3))); +hex(weight_string('abc' as char(3))) +004100420043 +select hex(weight_string('abc' as char(5))); +hex(weight_string('abc' as char(5))) +00410042004300200020 +select hex(weight_string('abc', 1, 2, 0xC0)); +hex(weight_string('abc', 1, 2, 0xC0)) +00 +select hex(weight_string('abc', 2, 2, 0xC0)); +hex(weight_string('abc', 2, 2, 0xC0)) +0041 +select hex(weight_string('abc', 3, 2, 0xC0)); +hex(weight_string('abc', 3, 2, 0xC0)) +004100 +select hex(weight_string('abc', 4, 2, 0xC0)); +hex(weight_string('abc', 4, 2, 0xC0)) +00410042 +select hex(weight_string('abc', 5, 2, 0xC0)); +hex(weight_string('abc', 5, 2, 0xC0)) +0041004200 +select hex(weight_string('abc',25, 2, 0xC0)); +hex(weight_string('abc',25, 2, 0xC0)) +00410042002000200020002000200020002000200020002000 +select hex(weight_string('abc', 1, 3, 0xC0)); +hex(weight_string('abc', 1, 3, 0xC0)) +00 +select hex(weight_string('abc', 2, 3, 0xC0)); +hex(weight_string('abc', 2, 3, 0xC0)) +0041 +select hex(weight_string('abc', 3, 3, 0xC0)); +hex(weight_string('abc', 3, 3, 0xC0)) +004100 +select hex(weight_string('abc', 4, 3, 0xC0)); +hex(weight_string('abc', 4, 3, 0xC0)) +00410042 +select hex(weight_string('abc', 5, 3, 0xC0)); +hex(weight_string('abc', 5, 3, 0xC0)) +0041004200 +select hex(weight_string('abc',25, 3, 0xC0)); +hex(weight_string('abc',25, 3, 0xC0)) +00410042004300200020002000200020002000200020002000 +select hex(weight_string('abc', 1, 4, 0xC0)); +hex(weight_string('abc', 1, 4, 0xC0)) +00 +select hex(weight_string('abc', 2, 4, 0xC0)); +hex(weight_string('abc', 2, 4, 0xC0)) +0041 +select hex(weight_string('abc', 3, 4, 0xC0)); +hex(weight_string('abc', 3, 4, 0xC0)) +004100 +select hex(weight_string('abc', 4, 4, 0xC0)); +hex(weight_string('abc', 4, 4, 0xC0)) +00410042 
+select hex(weight_string('abc', 5, 4, 0xC0)); +hex(weight_string('abc', 5, 4, 0xC0)) +0041004200 +select hex(weight_string('abc',25, 4, 0xC0)); +hex(weight_string('abc',25, 4, 0xC0)) +00410042004300200020002000200020002000200020002000 +select @@collation_connection; +@@collation_connection +ucs2_general_ci +select hex(weight_string(cast(_latin1 0x80 as char))); +hex(weight_string(cast(_latin1 0x80 as char))) +20AC +select hex(weight_string(cast(_latin1 0x808080 as char))); +hex(weight_string(cast(_latin1 0x808080 as char))) +20AC20AC20AC +select hex(weight_string(cast(_latin1 0x808080 as char) as char(2))); +hex(weight_string(cast(_latin1 0x808080 as char) as char(2))) +20AC20AC +select hex(weight_string(cast(_latin1 0x808080 as char) as char(3))); +hex(weight_string(cast(_latin1 0x808080 as char) as char(3))) +20AC20AC20AC +select hex(weight_string(cast(_latin1 0x808080 as char) as char(5))); +hex(weight_string(cast(_latin1 0x808080 as char) as char(5))) +20AC20AC20AC00200020 +select hex(weight_string(cast(_latin1 0x808080 as char), 1, 2, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 1, 2, 0xC0)) +20 +select hex(weight_string(cast(_latin1 0x808080 as char), 2, 2, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 2, 2, 0xC0)) +20AC +select hex(weight_string(cast(_latin1 0x808080 as char), 3, 2, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 3, 2, 0xC0)) +20AC20 +select hex(weight_string(cast(_latin1 0x808080 as char), 4, 2, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 4, 2, 0xC0)) +20AC20AC +select hex(weight_string(cast(_latin1 0x808080 as char), 5, 2, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 5, 2, 0xC0)) +20AC20AC00 +select hex(weight_string(cast(_latin1 0x808080 as char),25, 2, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char),25, 2, 0xC0)) +20AC20AC002000200020002000200020002000200020002000 +select hex(weight_string(cast(_latin1 0x808080 as char), 1, 3, 0xC0)); 
+hex(weight_string(cast(_latin1 0x808080 as char), 1, 3, 0xC0)) +20 +select hex(weight_string(cast(_latin1 0x808080 as char), 2, 3, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 2, 3, 0xC0)) +20AC +select hex(weight_string(cast(_latin1 0x808080 as char), 3, 3, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 3, 3, 0xC0)) +20AC20 +select hex(weight_string(cast(_latin1 0x808080 as char), 4, 3, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 4, 3, 0xC0)) +20AC20AC +select hex(weight_string(cast(_latin1 0x808080 as char), 5, 3, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 5, 3, 0xC0)) +20AC20AC20 +select hex(weight_string(cast(_latin1 0x808080 as char),25, 3, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char),25, 3, 0xC0)) +20AC20AC20AC00200020002000200020002000200020002000 +select hex(weight_string(cast(_latin1 0x808080 as char), 1, 4, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 1, 4, 0xC0)) +20 +select hex(weight_string(cast(_latin1 0x808080 as char), 2, 4, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 2, 4, 0xC0)) +20AC +select hex(weight_string(cast(_latin1 0x808080 as char), 3, 4, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 3, 4, 0xC0)) +20AC20 +select hex(weight_string(cast(_latin1 0x808080 as char), 4, 4, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 4, 4, 0xC0)) +20AC20AC +select hex(weight_string(cast(_latin1 0x808080 as char), 5, 4, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 5, 4, 0xC0)) +20AC20AC20 +select hex(weight_string(cast(_latin1 0x808080 as char),25, 4, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char),25, 4, 0xC0)) +20AC20AC20AC00200020002000200020002000200020002000 +select @@collation_connection; +@@collation_connection +ucs2_general_ci +select hex(weight_string('a' LEVEL 1)); +hex(weight_string('a' LEVEL 1)) +0041 +select hex(weight_string('A' LEVEL 1)); +hex(weight_string('A' LEVEL 1)) +0041 +select 
hex(weight_string('abc' LEVEL 1)); +hex(weight_string('abc' LEVEL 1)) +004100420043 +select hex(weight_string('abc' as char(2) LEVEL 1)); +hex(weight_string('abc' as char(2) LEVEL 1)) +00410042 +select hex(weight_string('abc' as char(3) LEVEL 1)); +hex(weight_string('abc' as char(3) LEVEL 1)) +004100420043 +select hex(weight_string('abc' as char(5) LEVEL 1)); +hex(weight_string('abc' as char(5) LEVEL 1)) +00410042004300200020 +select hex(weight_string('abc' as char(5) LEVEL 1 REVERSE)); +hex(weight_string('abc' as char(5) LEVEL 1 REVERSE)) +20002000430042004100 +select hex(weight_string('abc' as char(5) LEVEL 1 DESC)); +hex(weight_string('abc' as char(5) LEVEL 1 DESC)) +FFBEFFBDFFBCFFDFFFDF +select hex(weight_string('abc' as char(5) LEVEL 1 DESC REVERSE)); +hex(weight_string('abc' as char(5) LEVEL 1 DESC REVERSE)) +DFFFDFFFBCFFBDFFBEFF +set collation_connection=ucs2_bin; +select @@collation_connection; +@@collation_connection +ucs2_bin +CREATE TABLE t1 AS SELECT 'a' AS a; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` varchar(1) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +CREATE TABLE t2 AS SELECT WEIGHT_STRING(a) AS ws FROM t1; +SHOW CREATE TABLE t2; +Table Create Table +t2 CREATE TABLE `t2` ( + `ws` varbinary(2) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +SELECT HEX(WEIGHT_STRING(a)) FROM t1; +HEX(WEIGHT_STRING(a)) +0061 +SELECT HEX(ws) FROM t2; +HEX(ws) +0061 +DROP TABLE t2; +DROP TABLE t1; +CREATE TABLE t1 AS SELECT REPEAT('a',5) AS a; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` varchar(5) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +CREATE TABLE t2 AS SELECT WEIGHT_STRING(a) AS ws FROM t1; +SHOW CREATE TABLE t2; +Table Create Table +t2 CREATE TABLE `t2` ( + `ws` varbinary(10) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +SELECT HEX(WEIGHT_STRING(a)) FROM t1; +HEX(WEIGHT_STRING(a)) 
+00610061006100610061 +SELECT HEX(ws) FROM t2; +HEX(ws) +00610061006100610061 +DROP TABLE t2; +CREATE TABLE t2 AS SELECT WEIGHT_STRING(a AS CHAR(3)) AS ws FROM t1; +SHOW CREATE TABLE t2; +Table Create Table +t2 CREATE TABLE `t2` ( + `ws` varbinary(6) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +SELECT HEX(WEIGHT_STRING(a AS CHAR(3))) FROM t1; +HEX(WEIGHT_STRING(a AS CHAR(3))) +006100610061 +SELECT HEX(ws) FROM t2; +HEX(ws) +006100610061 +DROP TABLE t2; +CREATE TABLE t2 AS SELECT WEIGHT_STRING(a AS CHAR(10)) AS ws FROM t1; +SHOW CREATE TABLE t2; +Table Create Table +t2 CREATE TABLE `t2` ( + `ws` varbinary(20) DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +SELECT HEX(WEIGHT_STRING(a AS CHAR(10))) FROM t1; +HEX(WEIGHT_STRING(a AS CHAR(10))) +0061006100610061006100200020002000200020 +SELECT HEX(ws) FROM t2; +HEX(ws) +0061006100610061006100200020002000200020 +DROP TABLE t2; +DROP TABLE t1; +select hex(weight_string('a')); +hex(weight_string('a')) +0061 +select hex(weight_string('A')); +hex(weight_string('A')) +0041 +select hex(weight_string('abc')); +hex(weight_string('abc')) +006100620063 +select hex(weight_string('abc' as char(2))); +hex(weight_string('abc' as char(2))) +00610062 +select hex(weight_string('abc' as char(3))); +hex(weight_string('abc' as char(3))) +006100620063 +select hex(weight_string('abc' as char(5))); +hex(weight_string('abc' as char(5))) +00610062006300200020 +select hex(weight_string('abc', 1, 2, 0xC0)); +hex(weight_string('abc', 1, 2, 0xC0)) +00 +select hex(weight_string('abc', 2, 2, 0xC0)); +hex(weight_string('abc', 2, 2, 0xC0)) +0061 +select hex(weight_string('abc', 3, 2, 0xC0)); +hex(weight_string('abc', 3, 2, 0xC0)) +006100 +select hex(weight_string('abc', 4, 2, 0xC0)); +hex(weight_string('abc', 4, 2, 0xC0)) +00610062 +select hex(weight_string('abc', 5, 2, 0xC0)); +hex(weight_string('abc', 5, 2, 0xC0)) +0061006200 +select hex(weight_string('abc',25, 2, 0xC0)); +hex(weight_string('abc',25, 2, 0xC0)) 
+00610062002000200020002000200020002000200020002000 +select hex(weight_string('abc', 1, 3, 0xC0)); +hex(weight_string('abc', 1, 3, 0xC0)) +00 +select hex(weight_string('abc', 2, 3, 0xC0)); +hex(weight_string('abc', 2, 3, 0xC0)) +0061 +select hex(weight_string('abc', 3, 3, 0xC0)); +hex(weight_string('abc', 3, 3, 0xC0)) +006100 +select hex(weight_string('abc', 4, 3, 0xC0)); +hex(weight_string('abc', 4, 3, 0xC0)) +00610062 +select hex(weight_string('abc', 5, 3, 0xC0)); +hex(weight_string('abc', 5, 3, 0xC0)) +0061006200 +select hex(weight_string('abc',25, 3, 0xC0)); +hex(weight_string('abc',25, 3, 0xC0)) +00610062006300200020002000200020002000200020002000 +select hex(weight_string('abc', 1, 4, 0xC0)); +hex(weight_string('abc', 1, 4, 0xC0)) +00 +select hex(weight_string('abc', 2, 4, 0xC0)); +hex(weight_string('abc', 2, 4, 0xC0)) +0061 +select hex(weight_string('abc', 3, 4, 0xC0)); +hex(weight_string('abc', 3, 4, 0xC0)) +006100 +select hex(weight_string('abc', 4, 4, 0xC0)); +hex(weight_string('abc', 4, 4, 0xC0)) +00610062 +select hex(weight_string('abc', 5, 4, 0xC0)); +hex(weight_string('abc', 5, 4, 0xC0)) +0061006200 +select hex(weight_string('abc',25, 4, 0xC0)); +hex(weight_string('abc',25, 4, 0xC0)) +00610062006300200020002000200020002000200020002000 +select @@collation_connection; +@@collation_connection +ucs2_bin +select hex(weight_string(cast(_latin1 0x80 as char))); +hex(weight_string(cast(_latin1 0x80 as char))) +20AC +select hex(weight_string(cast(_latin1 0x808080 as char))); +hex(weight_string(cast(_latin1 0x808080 as char))) +20AC20AC20AC +select hex(weight_string(cast(_latin1 0x808080 as char) as char(2))); +hex(weight_string(cast(_latin1 0x808080 as char) as char(2))) +20AC20AC +select hex(weight_string(cast(_latin1 0x808080 as char) as char(3))); +hex(weight_string(cast(_latin1 0x808080 as char) as char(3))) +20AC20AC20AC +select hex(weight_string(cast(_latin1 0x808080 as char) as char(5))); +hex(weight_string(cast(_latin1 0x808080 as char) as char(5))) 
+20AC20AC20AC00200020 +select hex(weight_string(cast(_latin1 0x808080 as char), 1, 2, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 1, 2, 0xC0)) +20 +select hex(weight_string(cast(_latin1 0x808080 as char), 2, 2, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 2, 2, 0xC0)) +20AC +select hex(weight_string(cast(_latin1 0x808080 as char), 3, 2, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 3, 2, 0xC0)) +20AC20 +select hex(weight_string(cast(_latin1 0x808080 as char), 4, 2, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 4, 2, 0xC0)) +20AC20AC +select hex(weight_string(cast(_latin1 0x808080 as char), 5, 2, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 5, 2, 0xC0)) +20AC20AC00 +select hex(weight_string(cast(_latin1 0x808080 as char),25, 2, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char),25, 2, 0xC0)) +20AC20AC002000200020002000200020002000200020002000 +select hex(weight_string(cast(_latin1 0x808080 as char), 1, 3, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 1, 3, 0xC0)) +20 +select hex(weight_string(cast(_latin1 0x808080 as char), 2, 3, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 2, 3, 0xC0)) +20AC +select hex(weight_string(cast(_latin1 0x808080 as char), 3, 3, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 3, 3, 0xC0)) +20AC20 +select hex(weight_string(cast(_latin1 0x808080 as char), 4, 3, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 4, 3, 0xC0)) +20AC20AC +select hex(weight_string(cast(_latin1 0x808080 as char), 5, 3, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 5, 3, 0xC0)) +20AC20AC20 +select hex(weight_string(cast(_latin1 0x808080 as char),25, 3, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char),25, 3, 0xC0)) +20AC20AC20AC00200020002000200020002000200020002000 +select hex(weight_string(cast(_latin1 0x808080 as char), 1, 4, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 1, 4, 0xC0)) +20 +select 
hex(weight_string(cast(_latin1 0x808080 as char), 2, 4, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 2, 4, 0xC0)) +20AC +select hex(weight_string(cast(_latin1 0x808080 as char), 3, 4, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 3, 4, 0xC0)) +20AC20 +select hex(weight_string(cast(_latin1 0x808080 as char), 4, 4, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 4, 4, 0xC0)) +20AC20AC +select hex(weight_string(cast(_latin1 0x808080 as char), 5, 4, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char), 5, 4, 0xC0)) +20AC20AC20 +select hex(weight_string(cast(_latin1 0x808080 as char),25, 4, 0xC0)); +hex(weight_string(cast(_latin1 0x808080 as char),25, 4, 0xC0)) +20AC20AC20AC00200020002000200020002000200020002000 +select @@collation_connection; +@@collation_connection +ucs2_bin +select hex(weight_string('a' LEVEL 1)); +hex(weight_string('a' LEVEL 1)) +0061 +select hex(weight_string('A' LEVEL 1)); +hex(weight_string('A' LEVEL 1)) +0041 +select hex(weight_string('abc' LEVEL 1)); +hex(weight_string('abc' LEVEL 1)) +006100620063 +select hex(weight_string('abc' as char(2) LEVEL 1)); +hex(weight_string('abc' as char(2) LEVEL 1)) +00610062 +select hex(weight_string('abc' as char(3) LEVEL 1)); +hex(weight_string('abc' as char(3) LEVEL 1)) +006100620063 +select hex(weight_string('abc' as char(5) LEVEL 1)); +hex(weight_string('abc' as char(5) LEVEL 1)) +00610062006300200020 +select hex(weight_string('abc' as char(5) LEVEL 1 REVERSE)); +hex(weight_string('abc' as char(5) LEVEL 1 REVERSE)) +20002000630062006100 +select hex(weight_string('abc' as char(5) LEVEL 1 DESC)); +hex(weight_string('abc' as char(5) LEVEL 1 DESC)) +FF9EFF9DFF9CFFDFFFDF +select hex(weight_string('abc' as char(5) LEVEL 1 DESC REVERSE)); +hex(weight_string('abc' as char(5) LEVEL 1 DESC REVERSE)) +DFFFDFFF9CFF9DFF9EFF +# +# Bug #36418 Character sets: crash if char(256 using utf32) +# +select hex(char(0x01 using ucs2)); +hex(char(0x01 using ucs2)) +0001 +select 
hex(char(0x0102 using ucs2)); +hex(char(0x0102 using ucs2)) +0102 +select hex(char(0x010203 using ucs2)); +hex(char(0x010203 using ucs2)) +00010203 +select hex(char(0x01020304 using ucs2)); +hex(char(0x01020304 using ucs2)) +01020304 +# +# Bug#10094 Displays wrong error message for UNIQUE key index on CHAR(255) Unicode datatype +# +CREATE TABLE t1 (f1 CHAR(255) unicode); +INSERT INTO t1 values ('abc'),('bcd'),('abc'); +ALTER TABLE t1 ADD UNIQUE Index_1 (f1); +ERROR 23000: Duplicate entry 'abc' for key 'Index_1' +DROP TABLE t1; +# +# Test how character set works with date/time +# +SET collation_connection=ucs2_general_ci; +# +# Bug#32390 Character sets: casting utf32 to/from date doesn't work +# +CREATE TABLE t1 AS SELECT repeat('a',20) AS s1 LIMIT 0; +SET time_zone=_latin1'+03:00'; +SET timestamp=1216359724; +INSERT INTO t1 VALUES (current_date); +INSERT INTO t1 VALUES (current_time); +INSERT INTO t1 VALUES (current_timestamp); +SELECT s1, hex(s1) FROM t1; +s1 hex(s1) +2008-07-18 0032003000300038002D00300037002D00310038 +08:42:04 00300038003A00340032003A00300034 +2008-07-18 08:42:04 0032003000300038002D00300037002D00310038002000300038003A00340032003A00300034 +DROP TABLE t1; +SET timestamp=0; +SET time_zone=default; +# +# MDEV-5298 Illegal mix of collations on timestamp +# +SELECT CHARSET('2013-11-15 00:41:28' - INTERVAL 7 DAY); +CHARSET('2013-11-15 00:41:28' - INTERVAL 7 DAY) +ucs2 +SELECT COERCIBILITY('2013-11-15 00:41:28' - INTERVAL 7 DAY); +COERCIBILITY('2013-11-15 00:41:28' - INTERVAL 7 DAY) +4 +SELECT CHARSET(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY); +CHARSET(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY) +binary +SELECT COERCIBILITY(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY); +COERCIBILITY(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY) +5 +SELECT CHARSET(CONCAT('2013-11-15 00:41:28' - INTERVAL 7 DAY)); +CHARSET(CONCAT('2013-11-15 00:41:28' - INTERVAL 7 DAY)) +ucs2 +SELECT COERCIBILITY(CONCAT('2013-11-15 00:41:28' - INTERVAL 7 DAY)); 
+COERCIBILITY(CONCAT('2013-11-15 00:41:28' - INTERVAL 7 DAY)) +4 +SELECT CHARSET(CONCAT(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY)); +CHARSET(CONCAT(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY)) +ucs2 +SELECT COERCIBILITY(CONCAT(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY)); +COERCIBILITY(CONCAT(TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY)) +4 +SELECT CHARSET(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY)); +CHARSET(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY)) +ucs2 +SELECT COERCIBILITY(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY)); +COERCIBILITY(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY)) +4 +SELECT HEX(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY)); +HEX(CONCAT('','2001-01-08 00:00:00' - INTERVAL 7 DAY)) +0032003000300031002D00300031002D00300031002000300030003A00300030003A00300030 +SELECT CHARSET(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)); +CHARSET(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)) +ucs2 +SELECT COERCIBILITY(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)); +COERCIBILITY(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)) +4 +SELECT HEX(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)); +HEX(CONCAT('',TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)) +0032003000300031002D00300031002D00300031002000300030003A00300030003A00300030 +CREATE TABLE t1 AS SELECT REPEAT('a', 64) AS a LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` varchar(64) CHARACTER SET ucs2 NOT NULL DEFAULT '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 VALUES (''); +SELECT CHARSET(CONCAT(a,'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1; +CHARSET(CONCAT(a,'2001-01-08 00:00:00' - INTERVAL 7 DAY)) +ucs2 +SELECT COERCIBILITY(CONCAT(a,'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1; +COERCIBILITY(CONCAT(a,'2001-01-08 00:00:00' - INTERVAL 7 DAY)) +2 +SELECT HEX(CONCAT(a,'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1; +HEX(CONCAT(a,'2001-01-08 
00:00:00' - INTERVAL 7 DAY)) +0032003000300031002D00300031002D00300031002000300030003A00300030003A00300030 +SELECT CHARSET(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1; +CHARSET(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)) +ucs2 +SELECT COERCIBILITY(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1; +COERCIBILITY(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)) +2 +SELECT HEX(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)) FROM t1; +HEX(CONCAT(a,TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY)) +0032003000300031002D00300031002D00300031002000300030003A00300030003A00300030 +DROP TABLE t1; +CREATE TABLE t1 (t TIMESTAMP NOT NULL); +INSERT INTO t1 VALUES ('2001-01-01 00:00:00'); +SELECT * FROM t1 WHERE t < '2013-11-15 00:41:28' - INTERVAL 7 DAY; +t +2001-01-01 00:00:00 +SELECT * FROM t1 WHERE t = '2001-01-08 00:00:00' - INTERVAL 7 DAY; +t +2001-01-01 00:00:00 +SELECT * FROM t1 WHERE t < CONCAT('2013-11-15 00:41:28',LEFT(RAND(),0)) - INTERVAL 7 DAY; +t +2001-01-01 00:00:00 +SELECT * FROM t1 WHERE t = CONCAT('2001-01-08 00:00:00',LEFT(RAND(),0)) - INTERVAL 7 DAY; +t +2001-01-01 00:00:00 +SELECT * FROM t1 WHERE t < TIMESTAMP'2013-11-15 00:41:28' - INTERVAL 7 DAY; +t +2001-01-01 00:00:00 +SELECT * FROM t1 WHERE t = TIMESTAMP'2001-01-08 00:00:00' - INTERVAL 7 DAY; +t +2001-01-01 00:00:00 +DROP TABLE t1; +SET NAMES latin1; +# +# WL#4013 Unicode german2 collation +# +SET collation_connection=ucs2_german2_ci; +"BEGIN ctype_german.inc" +drop table if exists t1; +create table t1 as select repeat(' ', 64) as s1; +select collation(s1) from t1; +collation(s1) +ucs2_german2_ci +delete from t1; +INSERT INTO t1 VALUES ('ud'),('uf'); +INSERT INTO t1 VALUES ('od'),('of'); +INSERT INTO t1 VALUES ('e'); +INSERT INTO t1 VALUES ('ad'),('af'); +insert into t1 values ('a'),('ae'),(_latin1 0xE4); +insert into t1 values ('o'),('oe'),(_latin1 0xF6); +insert into t1 values ('s'),('ss'),(_latin1 0xDF); +insert into t1 values 
('u'),('ue'),(_latin1 0xFC); +INSERT INTO t1 VALUES (_latin1 0xE6), (_latin1 0xC6); +INSERT INTO t1 VALUES (_latin1 0x9C), (_latin1 0x8C); +select s1, hex(s1) from t1 order by s1, binary s1; +s1 hex(s1) +a 0061 +ad 00610064 +ae 00610065 +� 00C6 +� 00E4 +� 00E6 +af 00610066 +e 0065 +o 006F +od 006F0064 +oe 006F0065 +� 00F6 +� 0152 +� 0153 +of 006F0066 +s 0073 +ss 00730073 +� 00DF +u 0075 +ud 00750064 +ue 00750065 +� 00FC +uf 00750066 +select group_concat(s1 order by binary s1) from t1 group by s1; +group_concat(s1 order by binary s1) +a +ad +ae,�,�,� +af +e +o +od +oe,�,�,� +of +s +ss,� +u +ud +ue,� +uf +SELECT s1, hex(s1), hex(weight_string(s1)) FROM t1 ORDER BY s1, BINARY(s1); +s1 hex(s1) hex(weight_string(s1)) +a 0061 0E33 +ad 00610064 0E330E6D +ae 00610065 0E330E8B +� 00C6 0E330E8B +� 00E4 0E330E8B +� 00E6 0E330E8B +af 00610066 0E330EB9 +e 0065 0E8B +o 006F 0F82 +od 006F0064 0F820E6D +oe 006F0065 0F820E8B +� 00F6 0F820E8B +� 0152 0F820E8B +� 0153 0F820E8B +of 006F0066 0F820EB9 +s 0073 0FEA +ss 00730073 0FEA0FEA +� 00DF 0FEA0FEA +u 0075 101F +ud 00750064 101F0E6D +ue 00750065 101F0E8B +� 00FC 101F0E8B +uf 00750066 101F0EB9 +SELECT s1, hex(s1) FROM t1 WHERE s1='ae' ORDER BY s1, BINARY(s1); +s1 hex(s1) +ae 00610065 +� 00C6 +� 00E4 +� 00E6 +drop table t1; +CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a, 1 AS b LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `a` varchar(1) CHARACTER SET ucs2 COLLATE ucs2_german2_ci NOT NULL DEFAULT '', + `b` int(1) NOT NULL DEFAULT '0' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 VALUES ('s',0),(_latin1 0xDF,1); +SELECT * FROM t1 ORDER BY a, b; +a b +s 0 +� 1 +SELECT * FROM t1 ORDER BY a DESC, b; +a b +� 1 +s 0 +SELECT * FROM t1 ORDER BY CONCAT(a), b; +a b +s 0 +� 1 +SELECT * FROM t1 ORDER BY CONCAT(a) DESC, b; +a b +� 1 +s 0 +DROP TABLE t1; +"END ctype_german.inc" +# +# Bug#59145 valgrind warnings for uninitialized values in my_strtoll10_mb2 +# +SET NAMES latin1; +SELECT CONVERT(CHAR(NULL USING 
ucs2), UNSIGNED); +CONVERT(CHAR(NULL USING ucs2), UNSIGNED) +0 +Warnings: +Warning 1292 Truncated incorrect INTEGER value: '' +DO IFNULL(CHAR(NULL USING ucs2), ''); +DO CAST(CONVERT('' USING ucs2) AS UNSIGNED); +Warnings: +Warning 1292 Truncated incorrect INTEGER value: '' +# +# Test error message for conversion using different charset +# +CREATE TABLE t1 (a DECIMAL(2,0)); +SET sql_mode='strict_all_tables'; +INSERT INTO t1 VALUES (CONVERT('9e99999999' USING ucs2)); +ERROR 22007: Incorrect decimal value: '9e99999999' for column 'a' at row 1 +SET sql_mode=DEFAULT; +INSERT INTO t1 VALUES (CONVERT('aaa' USING ucs2)); +Warnings: +Warning 1366 Incorrect decimal value: 'aaa' for column 'a' at row 1 +DROP TABLE t1; +# +# End of 5.6 tests +# +# +# Start of 10.0 tests +# +SET NAMES latin1, collation_connection=ucs2_bin; +# +# MDEV-7149 Constant condition propagation erroneously applied for LIKE +# +CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `c1` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 VALUES ('a'),('a '); +SELECT * FROM t1 WHERE CONCAT(c1)='a'; +c1 +a +a +SELECT * FROM t1 WHERE CONCAT(c1) LIKE 'a '; +c1 +a +SELECT * FROM t1 WHERE CONCAT(c1)='a' AND CONCAT(c1) LIKE 'a '; +c1 +a +EXPLAIN EXTENDED SELECT * FROM t1 WHERE CONCAT(c1)='a' AND CONCAT(c1) LIKE 'a '; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 2 100.00 Using where +Warnings: +Note 1003 select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where ((concat(`test`.`t1`.`c1`) = 'a') and (concat(`test`.`t1`.`c1`) like 'a ')) +DROP TABLE t1; +CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `c1` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 
VALUES ('a'),('a '); +SELECT * FROM t1 WHERE 'a'=CONCAT(c1); +c1 +a +a +SELECT * FROM t1 WHERE 'a ' LIKE CONCAT(c1); +c1 +a +SELECT * FROM t1 WHERE 'a'=CONCAT(c1) AND 'a ' LIKE CONCAT(c1); +c1 +a +EXPLAIN EXTENDED SELECT * FROM t1 WHERE 'a'=CONCAT(c1) AND 'a ' LIKE CONCAT(c1); +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 2 100.00 Using where +Warnings: +Note 1003 select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('a' = concat(`test`.`t1`.`c1`)) and ('a ' like concat(`test`.`t1`.`c1`))) +DROP TABLE t1; +CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `c1` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 VALUES ('%'),('% '); +SELECT * FROM t1 WHERE '% '=CONCAT(c1); +c1 +% +% +SELECT * FROM t1 WHERE 'a' LIKE CONCAT(c1); +c1 +% +SELECT * FROM t1 WHERE '% '=CONCAT(c1) AND 'a' LIKE CONCAT(c1); +c1 +% +EXPLAIN EXTENDED SELECT * FROM t1 WHERE '% '=CONCAT(c1) AND 'a' LIKE CONCAT(c1); +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 2 100.00 Using where +Warnings: +Note 1003 select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('% ' = concat(`test`.`t1`.`c1`)) and ('a' like concat(`test`.`t1`.`c1`))) +DROP TABLE t1; +CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `c1` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_bin NOT NULL DEFAULT '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 VALUES ('%'),('% '); +SELECT * FROM t1 WHERE '%'=CONCAT(c1); +c1 +% +% +SELECT * FROM t1 WHERE 'a' LIKE CONCAT(c1); +c1 +% +SELECT * FROM t1 WHERE '%'=CONCAT(c1) AND 'a' LIKE CONCAT(c1); +c1 +% +EXPLAIN EXTENDED SELECT * FROM t1 WHERE '%'=CONCAT(c1) AND 'a' LIKE CONCAT(c1); +id select_type table type possible_keys key 
key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 2 100.00 Using where +Warnings: +Note 1003 select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('%' = concat(`test`.`t1`.`c1`)) and ('a' like concat(`test`.`t1`.`c1`))) +DROP TABLE t1; +SET NAMES latin1, collation_connection=ucs2_general_ci; +# +# MDEV-7149 Constant condition propagation erroneously applied for LIKE +# +CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `c1` varchar(10) CHARACTER SET ucs2 NOT NULL DEFAULT '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 VALUES ('a'),('a '); +SELECT * FROM t1 WHERE CONCAT(c1)='a'; +c1 +a +a +SELECT * FROM t1 WHERE CONCAT(c1) LIKE 'a '; +c1 +a +SELECT * FROM t1 WHERE CONCAT(c1)='a' AND CONCAT(c1) LIKE 'a '; +c1 +a +EXPLAIN EXTENDED SELECT * FROM t1 WHERE CONCAT(c1)='a' AND CONCAT(c1) LIKE 'a '; +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 2 100.00 Using where +Warnings: +Note 1003 select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where ((concat(`test`.`t1`.`c1`) = 'a') and (concat(`test`.`t1`.`c1`) like 'a ')) +DROP TABLE t1; +CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `c1` varchar(10) CHARACTER SET ucs2 NOT NULL DEFAULT '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 VALUES ('a'),('a '); +SELECT * FROM t1 WHERE 'a'=CONCAT(c1); +c1 +a +a +SELECT * FROM t1 WHERE 'a ' LIKE CONCAT(c1); +c1 +a +SELECT * FROM t1 WHERE 'a'=CONCAT(c1) AND 'a ' LIKE CONCAT(c1); +c1 +a +EXPLAIN EXTENDED SELECT * FROM t1 WHERE 'a'=CONCAT(c1) AND 'a ' LIKE CONCAT(c1); +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 2 100.00 Using where +Warnings: +Note 1003 select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('a' = concat(`test`.`t1`.`c1`)) and ('a ' like 
concat(`test`.`t1`.`c1`))) +DROP TABLE t1; +CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `c1` varchar(10) CHARACTER SET ucs2 NOT NULL DEFAULT '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 VALUES ('%'),('% '); +SELECT * FROM t1 WHERE '% '=CONCAT(c1); +c1 +% +% +SELECT * FROM t1 WHERE 'a' LIKE CONCAT(c1); +c1 +% +SELECT * FROM t1 WHERE '% '=CONCAT(c1) AND 'a' LIKE CONCAT(c1); +c1 +% +EXPLAIN EXTENDED SELECT * FROM t1 WHERE '% '=CONCAT(c1) AND 'a' LIKE CONCAT(c1); +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 2 100.00 Using where +Warnings: +Note 1003 select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('% ' = concat(`test`.`t1`.`c1`)) and ('a' like concat(`test`.`t1`.`c1`))) +DROP TABLE t1; +CREATE TABLE t1 AS SELECT REPEAT('a', 10) AS c1 LIMIT 0; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `c1` varchar(10) CHARACTER SET ucs2 NOT NULL DEFAULT '' +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +INSERT INTO t1 VALUES ('%'),('% '); +SELECT * FROM t1 WHERE '%'=CONCAT(c1); +c1 +% +% +SELECT * FROM t1 WHERE 'a' LIKE CONCAT(c1); +c1 +% +SELECT * FROM t1 WHERE '%'=CONCAT(c1) AND 'a' LIKE CONCAT(c1); +c1 +% +EXPLAIN EXTENDED SELECT * FROM t1 WHERE '%'=CONCAT(c1) AND 'a' LIKE CONCAT(c1); +id select_type table type possible_keys key key_len ref rows filtered Extra +1 SIMPLE t1 ALL NULL NULL NULL NULL 2 100.00 Using where +Warnings: +Note 1003 select `test`.`t1`.`c1` AS `c1` from `test`.`t1` where (('%' = concat(`test`.`t1`.`c1`)) and ('a' like concat(`test`.`t1`.`c1`))) +DROP TABLE t1; +SET NAMES latin1; +# +# MDEV-6661 PI() does not work well in UCS2/UTF16/UTF32 context +# +SELECT CONCAT(CONVERT('pi=' USING ucs2),PI()) AS PI; +PI +pi=3.141593 +# +# MDEV-6695 Bad column name for UCS2 string literals +# +SET NAMES utf8, character_set_connection=ucs2; +SELECT 'a','aa'; +a aa +a aa +# +# MDEV-10306 
Wrong results with combination of CONCAT, SUBSTR and CONVERT in subquery +# +SET NAMES utf8, character_set_connection=ucs2; +SET @save_optimizer_switch=@@optimizer_switch; +SET optimizer_switch=_utf8'derived_merge=on'; +CREATE TABLE t1 (t VARCHAR(10) CHARSET latin1); +INSERT INTO t1 VALUES('abcdefghi'); +SET NAMES utf8, character_set_connection=ucs2; +SELECT CONCAT(t2,'-',t2) c2 FROM (SELECT HEX(t) t2 FROM t1) sub; +c2 +616263646566676869-616263646566676869 +SELECT CONCAT(t2,'-',t2) c2 FROM (SELECT TO_BASE64(t) t2 FROM t1) sub; +c2 +YWJjZGVmZ2hp-YWJjZGVmZ2hp +DROP TABLE t1; +SET optimizer_switch=@save_optimizer_switch; +# +# End of 10.0 tests +# diff --cc mysql-test/r/func_misc.result index d54a70cab45,66e3cfd4ff4..ea3f57c6204 --- a/mysql-test/r/func_misc.result +++ b/mysql-test/r/func_misc.result @@@ -571,6 -571,20 +571,17 @@@ AND 57813X540X1723 = 'Test' N AVG 0 NULL drop table t1; + # + # MDEV-15630 uuid() function evaluates at wrong time in query + # + CREATE TABLE t1 (id INT); + INSERT INTO t1 VALUES (1),(2),(3); + SELECT COUNT(1), UUID() as uid FROM t1 GROUP BY uid; + COUNT(1) uid + 1 xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + 1 xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + 1 xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + DROP TABLE t1; -# -# End of 5.5 tests -# SELECT NAME_CONST('a', -(1 OR 2)) OR 1; ERROR HY000: Incorrect arguments to NAME_CONST SELECT NAME_CONST('a', -(1 AND 2)) OR 1; diff --cc mysql-test/r/view.result index 5a51ea85f55,7fc3c48c3a0..4e3146052e9 --- a/mysql-test/r/view.result +++ b/mysql-test/r/view.result @@@ -5644,6 -5535,203 +5536,203 @@@ View Create View character_set_client c v1 CREATE ALGORITHM=UNDEFINED DEFINER=`root`@`localhost` SQL SECURITY DEFINER VIEW `v1` AS select group_concat(`t1`.`str` separator '\\') AS `GROUP_CONCAT(str SEPARATOR '\\')` from `t1` latin1 latin1_swedish_ci drop view v1; drop table t1; + CREATE TABLE IF NOT EXISTS t0 (f0 INT); + CREATE TABLE IF NOT EXISTS t1 (f1 INT); + CREATE TABLE IF NOT EXISTS t2 (f2 INT); + CREATE TABLE IF 
NOT EXISTS t3 (f3 INT); + CREATE TABLE IF NOT EXISTS t4 (f4 INT); + CREATE TABLE IF NOT EXISTS t5 (f5 INT); + CREATE TABLE IF NOT EXISTS t6 (f6 INT); + CREATE TABLE IF NOT EXISTS t7 (f7 INT); + CREATE TABLE IF NOT EXISTS t8 (f8 INT); + CREATE TABLE IF NOT EXISTS t9 (f9 INT); + CREATE TABLE IF NOT EXISTS t10 (f10 INT); + CREATE TABLE IF NOT EXISTS t11 (f11 INT); + CREATE TABLE IF NOT EXISTS t12 (f12 INT); + CREATE TABLE IF NOT EXISTS t13 (f13 INT); + CREATE TABLE IF NOT EXISTS t14 (f14 INT); + CREATE TABLE IF NOT EXISTS t15 (f15 INT); + CREATE TABLE IF NOT EXISTS t16 (f16 INT); + CREATE TABLE IF NOT EXISTS t17 (f17 INT); + CREATE TABLE IF NOT EXISTS t18 (f18 INT); + CREATE TABLE IF NOT EXISTS t19 (f19 INT); + CREATE TABLE IF NOT EXISTS t20 (f20 INT); + CREATE TABLE IF NOT EXISTS t21 (f21 INT); + CREATE TABLE IF NOT EXISTS t22 (f22 INT); + CREATE TABLE IF NOT EXISTS t23 (f23 INT); + CREATE TABLE IF NOT EXISTS t24 (f24 INT); + CREATE TABLE IF NOT EXISTS t25 (f25 INT); + CREATE TABLE IF NOT EXISTS t26 (f26 INT); + CREATE TABLE IF NOT EXISTS t27 (f27 INT); + CREATE TABLE IF NOT EXISTS t28 (f28 INT); + CREATE TABLE IF NOT EXISTS t29 (f29 INT); + CREATE TABLE IF NOT EXISTS t30 (f30 INT); + CREATE TABLE IF NOT EXISTS t31 (f31 INT); + CREATE TABLE IF NOT EXISTS t32 (f32 INT); + CREATE TABLE IF NOT EXISTS t33 (f33 INT); + CREATE TABLE IF NOT EXISTS t34 (f34 INT); + CREATE TABLE IF NOT EXISTS t35 (f35 INT); + CREATE TABLE IF NOT EXISTS t36 (f36 INT); + CREATE TABLE IF NOT EXISTS t37 (f37 INT); + CREATE TABLE IF NOT EXISTS t38 (f38 INT); + CREATE TABLE IF NOT EXISTS t39 (f39 INT); + CREATE TABLE IF NOT EXISTS t40 (f40 INT); + CREATE TABLE IF NOT EXISTS t41 (f41 INT); + CREATE TABLE IF NOT EXISTS t42 (f42 INT); + CREATE TABLE IF NOT EXISTS t43 (f43 INT); + CREATE TABLE IF NOT EXISTS t44 (f44 INT); + CREATE TABLE IF NOT EXISTS t45 (f45 INT); + CREATE TABLE IF NOT EXISTS t46 (f46 INT); + CREATE TABLE IF NOT EXISTS t47 (f47 INT); + CREATE TABLE IF NOT EXISTS t48 (f48 INT); + 
CREATE TABLE IF NOT EXISTS t49 (f49 INT); + CREATE TABLE IF NOT EXISTS t50 (f50 INT); + CREATE TABLE IF NOT EXISTS t51 (f51 INT); + CREATE TABLE IF NOT EXISTS t52 (f52 INT); + CREATE TABLE IF NOT EXISTS t53 (f53 INT); + CREATE TABLE IF NOT EXISTS t54 (f54 INT); + CREATE TABLE IF NOT EXISTS t55 (f55 INT); + CREATE TABLE IF NOT EXISTS t56 (f56 INT); + CREATE TABLE IF NOT EXISTS t57 (f57 INT); + CREATE TABLE IF NOT EXISTS t58 (f58 INT); + CREATE TABLE IF NOT EXISTS t59 (f59 INT); + CREATE TABLE IF NOT EXISTS t60 (f60 INT); + CREATE OR REPLACE VIEW v60 AS SELECT * FROM t60; + EXPLAIN + SELECT t0.* + FROM t0 + JOIN t1 + ON t1.f1 = t0.f0 + LEFT JOIN t2 + ON t0.f0 = t2.f2 + LEFT JOIN t3 + ON t0.f0 = t3.f3 + LEFT JOIN t4 + ON t0.f0 = t4.f4 + LEFT JOIN t5 + ON t4.f4 = t5.f5 + LEFT JOIN t6 + ON t0.f0 = t6.f6 + LEFT JOIN t7 + ON t0.f0 = t7.f7 + LEFT JOIN t8 + ON t0.f0 = t8.f8 + LEFT JOIN t9 + ON t0.f0 = t9.f9 + LEFT JOIN t10 + ON t0.f0 = t10.f10 + LEFT JOIN t11 + ON t0.f0 = t11.f11 + LEFT JOIN t12 + ON t0.f0 = t12.f12 + LEFT JOIN t13 + ON t0.f0 = t13.f13 + LEFT JOIN t14 + ON t0.f0 = t14.f14 + LEFT JOIN t15 + ON t0.f0 = t15.f15 + LEFT JOIN t16 + ON t0.f0 = t16.f16 + LEFT JOIN t17 + ON t0.f0 = t17.f17 + LEFT JOIN t18 + ON t0.f0 = t18.f18 + LEFT JOIN t19 + ON t18.f18 = t19.f19 + LEFT JOIN t20 + ON t20.f20 = t19.f19 + LEFT JOIN t21 + ON t20.f20 = t21.f21 + LEFT JOIN t22 + ON t19.f19 = t22.f22 + LEFT JOIN t23 + ON t23.f23 = t0.f0 + LEFT JOIN t24 + ON t24.f24 = t23.f23 + LEFT JOIN t25 + ON t0.f0 = t25.f25 + LEFT JOIN t26 + ON t26.f26 = t0.f0 + LEFT JOIN t27 + ON t27.f27 = t0.f0 + LEFT JOIN t28 + ON t0.f0 = t28.f28 + LEFT JOIN t29 + ON t0.f0 = t29.f29 + LEFT JOIN t30 + ON t30.f30 = t0.f0 + LEFT JOIN t31 + ON t0.f0 = t31.f31 + LEFT JOIN t32 + ON t32.f32 = t31.f31 + LEFT JOIN t33 + ON t33.f33 = t0.f0 + LEFT JOIN t34 + ON t33.f33 = t34.f34 + LEFT JOIN t35 + ON t33.f33 = t35.f35 + LEFT JOIN t36 + ON t36.f36 = t0.f0 + LEFT JOIN t37 + ON t32.f32 = t37.f37 + LEFT JOIN t38 + ON t31.f31 = 
t38.f38 + LEFT JOIN t39 + ON t39.f39 = t0.f0 + LEFT JOIN t40 + ON t40.f40 = t39.f39 + LEFT JOIN t41 + ON t41.f41 = t0.f0 + LEFT JOIN t42 + ON t42.f42 = t41.f41 + LEFT JOIN t43 + ON t43.f43 = t41.f41 + LEFT JOIN t44 + ON t44.f44 = t0.f0 + LEFT JOIN t45 + ON t45.f45 = t0.f0 + LEFT JOIN t46 + ON t46.f46 = t0.f0 + LEFT JOIN t47 + ON t47.f47 = t0.f0 + LEFT JOIN t48 + ON t48.f48 = t0.f0 + LEFT JOIN t49 + ON t0.f0 = t49.f49 + LEFT JOIN t50 + ON t0.f0 = t50.f50 + LEFT JOIN t51 + ON t0.f0 = t51.f51 + LEFT JOIN t52 + ON t52.f52 = t0.f0 + LEFT JOIN t53 + ON t53.f53 = t0.f0 + LEFT JOIN t54 + ON t54.f54 = t0.f0 + LEFT JOIN t55 + ON t55.f55 = t0.f0 + LEFT JOIN t56 + ON t56.f56 = t0.f0 + LEFT JOIN t57 + ON t57.f57 = t0.f0 + LEFT JOIN t58 + ON t58.f58 = t57.f57 + LEFT JOIN t59 + ON t36.f36 = t59.f59 + LEFT JOIN v60 + ON t36.f36 = v60.f60 + ; + id select_type table type possible_keys key key_len ref rows Extra -1 SIMPLE NULL NULL NULL NULL NULL NULL NULL Impossible WHERE noticed after reading const tables ++1 PRIMARY NULL NULL NULL NULL NULL NULL NULL Impossible WHERE noticed after reading const tables + 2 SUBQUERY NULL NULL NULL NULL NULL NULL NULL no matching row in const table + drop table t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, + t10, t11, t12, t13, t14, t15, t16, t17, t18, + t19, t20, t21, t22, t23, t24, t25, t26, t27, + t28, t29, t30, t31, t32, t33, t34, t35, t36, + t37, t38, t39, t40, t41, t42, t43, t44, t45, + t46, t47, t48, t49, t50, t51, t52, t53, t54, + t55, t56, t57, t58, t59,t60; + drop view v60; # ----------------------------------------------------------------- # -- End of 5.5 tests. 
# ----------------------------------------------------------------- diff --cc mysql-test/suite/galera/disabled.def index f9909914089,ad966ebab0d..78c3565b99d --- a/mysql-test/suite/galera/disabled.def +++ b/mysql-test/suite/galera/disabled.def @@@ -1,49 -1,4 +1,51 @@@ -galera_var_dirty_reads : MDEV-12539 -query_cache : MDEV-12539 -MW-421 : MDEV-12539 -galera_concurrent_ctas : MDEV-12539 +############################################################################## +# +# List the test cases that are to be disabled temporarily. +# +# Separate the test case name and the comment with ':'. +# +# <testcasename> : MDEV-<xxxx> <comment> +# +# Do not use any TAB characters for whitespace. +# +############################################################################## ++ +MW-336 : MDEV-13549 Galera test failures +galera_gra_log : MDEV-13549 Galera test failures +galera_flush_local : MDEV-13549 Galera test failures +galera_flush : MDEV-13549 Galera test failures +MW-329 : MDEV-13549 Galera test failures +galera_account_management : MariaDB 10.0 does not support ALTER USER +galera_binlog_row_image : MariaDB 10.0 does not support binlog_row_image +galera_binlog_rows_query_log_events: MariaDB does not support binlog_rows_query_log_events +GAL-419 : MDEV-13549 Galera test failures +galera_toi_ddl_fk_insert : MDEV-13549 Galera test failures +galera_var_notify_cmd : MDEV-13549 Galera test failures +galera_var_slave_threads : MDEV-13549 Galera test failures +mysql-wsrep#90 : MDEV-13549 Galera test failures +galera_as_master_gtid : Requires MySQL GTID +galera_as_master_gtid_change_master : Requires MySQL GTID +galera_as_slave_replication_bundle : MDEV-13549 Galera test failures +galera_as_slave_preordered : wsrep-preordered feature not merged to MariaDB +galera_gcs_fragment : MDEV-13549 Galera test failures +galera_gcache_recover : MDEV-13549 Galera test failures +galera_gcache_recover_full_gcache : MDEV-13549 Galera test failures +galera_gcache_recover_manytrx : MDEV-13549 
Galera test failures +galera_ist_mysqldump : MDEV-13549 Galera test failures +mysql-wsrep#31 : MDEV-13549 Galera test failures +galera_migrate : MariaDB 10.0 does not support START SLAVE USER +galera_concurrent_ctas : MDEV-13549 Galera test failures +galera_bf_abort_for_update : MDEV-13549 Galera test failures +galera_wsrep_desync_wsrep_on : MDEV-13549 Galera test failures +galera_ssl_upgrade : MDEV-13549 Galera test failures +mysql-wsrep#33 : MDEV-13549 Galera test failures +galera_var_auto_inc_control_on : MDEV-13549 Galera test failures +MW-44 : MDEV-13549 Galera test failures +galera_var_retry_autocommit : MDEV-13549 Galera test failures +pxc-421 : MDEV-13549 Galera test failures +lp1376747-2 : MDEV-13549 Galera test failures +lp1376747 : MDEV-13549 Galera test failures +galera_toi_ddl_nonconflicting : MDEV-13549 Galera test failures +galera_parallel_simple : MDEV-13549 Galera test failures +galera_admin : MDEV-13549 Galera test failures ++MW-416 : MDEV-13549 Galera test failures diff --cc mysql-test/suite/galera/r/galera_defaults.result index b242a468f72,00000000000..e7a776e9047 mode 100644,000000..100644 --- a/mysql-test/suite/galera/r/galera_defaults.result +++ b/mysql-test/suite/galera/r/galera_defaults.result @@@ -1,119 -1,0 +1,119 @@@ +SELECT COUNT(*) = 40 FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES WHERE VARIABLE_NAME LIKE 'wsrep_%'; +COUNT(*) = 40 +1 +SELECT VARIABLE_NAME, VARIABLE_VALUE +FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES +WHERE VARIABLE_NAME LIKE 'wsrep_%' +AND VARIABLE_NAME NOT IN ( +'WSREP_PROVIDER_OPTIONS', +'WSREP_SST_RECEIVE_ADDRESS', +'WSREP_NODE_ADDRESS', +'WSREP_NODE_NAME', +'WSREP_PROVIDER', +'WSREP_DATA_HOME_DIR', +'WSREP_NODE_INCOMING_ADDRESS', +'WSREP_START_POSITION' +) +ORDER BY VARIABLE_NAME; +VARIABLE_NAME VARIABLE_VALUE +WSREP_AUTO_INCREMENT_CONTROL ON +WSREP_CAUSAL_READS ON +WSREP_CERTIFY_NONPK ON +WSREP_CLUSTER_ADDRESS gcomm:// +WSREP_CLUSTER_NAME my_wsrep_cluster +WSREP_CONVERT_LOCK_TO_TRX OFF +WSREP_DBUG_OPTION +WSREP_DEBUG OFF 
+WSREP_DESYNC OFF +WSREP_DIRTY_READS OFF +WSREP_DRUPAL_282555_WORKAROUND OFF +WSREP_FORCED_BINLOG_FORMAT NONE +WSREP_LOAD_DATA_SPLITTING ON +WSREP_LOG_CONFLICTS OFF +WSREP_MAX_WS_ROWS 0 +WSREP_MAX_WS_SIZE 2147483647 +WSREP_MYSQL_REPLICATION_BUNDLE 0 +WSREP_NOTIFY_CMD +WSREP_ON ON +WSREP_OSU_METHOD TOI +WSREP_RECOVER OFF +WSREP_REPLICATE_MYISAM OFF +WSREP_RESTART_SLAVE OFF +WSREP_RETRY_AUTOCOMMIT 1 +WSREP_SLAVE_FK_CHECKS ON +WSREP_SLAVE_THREADS 1 +WSREP_SLAVE_UK_CHECKS OFF +WSREP_SST_AUTH +WSREP_SST_DONOR +WSREP_SST_DONOR_REJECTS_QUERIES OFF +WSREP_SST_METHOD rsync +WSREP_SYNC_WAIT 15 - <BASE_DIR>; <BASE_HOST>; <BASE_PORT>; cert.log_conflicts = no; debug = no; evs.auto_evict = 0; evs.causal_keepalive_period = PT1S; evs.debug_log_mask = 0x1; evs.delay_margin = PT1S; evs.delayed_keep_period = PT30S; evs.inactive_check_period = PT0.5S; evs.inactive_timeout = PT15S; evs.info_log_mask = 0; evs.install_timeout = PT7.5S; evs.join_retrans_period = PT1S; evs.keepalive_period = PT1S; evs.max_install_timeouts = 3; evs.send_window = 4; evs.stats_report_period = PT1M; evs.suspect_timeout = PT10S; evs.use_aggregate = true; evs.user_send_window = 2; evs.version = 0; evs.view_forget_timeout = P1D; <GCACHE_DIR>; gcache.keep_pages_size = 0; gcache.mem_size = 0; <GCACHE_NAME>; gcache.page_size = 128M; gcache.recover = no; gcache.size = 10M; gcomm.thread_prio = ; gcs.fc_debug = 0; gcs.fc_factor = 1.0; gcs.fc_limit = 16; gcs.fc_master_slave = no; gcs.max_packet_size = 64500; gcs.max_throttle = 0.25; <GCS_RECV_Q_HARD_LIMIT>; gcs.recv_q_soft_limit = 0.25; gcs.sync_donor = no; <G MCAST_LISTEN_ADDR>; gmcast.mcast_addr = ; gmcast.mcast_ttl = 1; gmcast.peer_timeout = PT3S; gmcast.segment = 0; gmcast.time_wait = PT5S; gmcast.version = 0; <IST_RECV_ADDR>; pc.announce_timeout = PT3S; pc.checksum = false; pc.ignore_quorum = false; pc.ignore_sb = false; pc.linger = PT20S; pc.npvo = false; pc.recovery = true; pc.version = 0; pc.wait_prim = true; pc.wait_prim_timeout = PT30S; pc.weight = 1; 
protonet.backend = asio; protonet.version = 0; repl.causal_read_timeout = PT90S; repl.commit_order = 3; repl.key_format = FLAT8; repl.max_ws_size = 2147483647; repl.proto_max = 7; socket.checksum = 2; socket.recv_buf_size = 212992; ++<BASE_DIR>; <BASE_HOST>; <BASE_PORT>; cert.log_conflicts = no; debug = no; evs.auto_evict = 0; evs.causal_keepalive_period = PT1S; evs.debug_log_mask = 0x1; evs.delay_margin = PT1S; evs.delayed_keep_period = PT30S; evs.inactive_check_period = PT0.5S; evs.inactive_timeout = PT15S; evs.info_log_mask = 0; evs.install_timeout = PT7.5S; evs.join_retrans_period = PT1S; evs.keepalive_period = PT1S; evs.max_install_timeouts = 3; evs.send_window = 4; evs.stats_report_period = PT1M; evs.suspect_timeout = PT10S; evs.use_aggregate = true; evs.user_send_window = 2; evs.version = 0; evs.view_forget_timeout = P1D; <GCACHE_DIR>; gcache.keep_pages_size = 0; gcache.mem_size = 0; <GCACHE_NAME>; gcache.page_size = 128M; gcache.recover = no; gcache.size = 10M; gcomm.thread_prio = ; gcs.fc_debug = 0; gcs.fc_factor = 1.0; gcs.fc_limit = 16; gcs.fc_master_slave = no; gcs.max_packet_size = 64500; gcs.max_throttle = 0.25; <GCS_RECV_Q_HARD_LIMIT>; gcs.recv_q_soft_limit = 0.25; gcs.sync_donor = no; <G MCAST_LISTEN_ADDR>; gmcast.mcast_addr = ; gmcast.mcast_ttl = 1; gmcast.peer_timeout = PT3S; gmcast.segment = 0; gmcast.time_wait = PT5S; gmcast.version = 0; <IST_RECV_ADDR>; pc.announce_timeout = PT3S; pc.checksum = false; pc.ignore_quorum = false; pc.ignore_sb = false; pc.linger = PT20S; pc.npvo = false; pc.recovery = true; pc.version = 0; pc.wait_prim = true; pc.wait_prim_timeout = PT30S; pc.weight = 1; protonet.backend = asio; protonet.version = 0; repl.causal_read_timeout = PT90S; repl.commit_order = 3; repl.key_format = FLAT8; repl.max_ws_size = 2147483647; repl.proto_max = 8; socket.checksum = 2; socket.recv_buf_size = 212992; +SELECT COUNT(*) FROM INFORMATION_SCHEMA.GLOBAL_STATUS +WHERE VARIABLE_NAME LIKE 'wsrep_%' +AND VARIABLE_NAME != 
'wsrep_debug_sync_waiters'; +COUNT(*) +58 +SELECT VARIABLE_NAME FROM INFORMATION_SCHEMA.GLOBAL_STATUS +WHERE VARIABLE_NAME LIKE 'wsrep_%' +AND VARIABLE_NAME != 'wsrep_debug_sync_waiters' +ORDER BY VARIABLE_NAME; +VARIABLE_NAME +WSREP_APPLY_OOOE +WSREP_APPLY_OOOL +WSREP_APPLY_WINDOW +WSREP_CAUSAL_READS +WSREP_CERT_DEPS_DISTANCE +WSREP_CERT_INDEX_SIZE +WSREP_CERT_INTERVAL +WSREP_CLUSTER_CONF_ID +WSREP_CLUSTER_SIZE +WSREP_CLUSTER_STATE_UUID +WSREP_CLUSTER_STATUS +WSREP_COMMIT_OOOE +WSREP_COMMIT_OOOL +WSREP_COMMIT_WINDOW +WSREP_CONNECTED +WSREP_DESYNC_COUNT +WSREP_EVS_DELAYED +WSREP_EVS_EVICT_LIST +WSREP_EVS_REPL_LATENCY +WSREP_EVS_STATE +WSREP_FLOW_CONTROL_PAUSED +WSREP_FLOW_CONTROL_PAUSED_NS +WSREP_FLOW_CONTROL_RECV +WSREP_FLOW_CONTROL_SENT +WSREP_GCOMM_UUID +WSREP_INCOMING_ADDRESSES +WSREP_LAST_COMMITTED +WSREP_LOCAL_BF_ABORTS +WSREP_LOCAL_CACHED_DOWNTO +WSREP_LOCAL_CERT_FAILURES +WSREP_LOCAL_COMMITS +WSREP_LOCAL_INDEX +WSREP_LOCAL_RECV_QUEUE +WSREP_LOCAL_RECV_QUEUE_AVG +WSREP_LOCAL_RECV_QUEUE_MAX +WSREP_LOCAL_RECV_QUEUE_MIN +WSREP_LOCAL_REPLAYS +WSREP_LOCAL_SEND_QUEUE +WSREP_LOCAL_SEND_QUEUE_AVG +WSREP_LOCAL_SEND_QUEUE_MAX +WSREP_LOCAL_SEND_QUEUE_MIN +WSREP_LOCAL_STATE +WSREP_LOCAL_STATE_COMMENT +WSREP_LOCAL_STATE_UUID +WSREP_PROTOCOL_VERSION +WSREP_PROVIDER_NAME +WSREP_PROVIDER_VENDOR +WSREP_PROVIDER_VERSION +WSREP_READY +WSREP_RECEIVED +WSREP_RECEIVED_BYTES +WSREP_REPLICATED +WSREP_REPLICATED_BYTES +WSREP_REPL_DATA_BYTES +WSREP_REPL_KEYS +WSREP_REPL_KEYS_BYTES +WSREP_REPL_OTHER_BYTES +WSREP_THREAD_COUNT diff --cc mysql-test/suite/galera/r/galera_var_dirty_reads.result index c469e49731d,8a3175912c7..405d86b3027 --- a/mysql-test/suite/galera/r/galera_var_dirty_reads.result +++ b/mysql-test/suite/galera/r/galera_var_dirty_reads.result @@@ -42,6 -88,8 +42,4 @@@ SELECT * FROM t1 i 1 DROP TABLE t1; - set GLOBAL auto_increment_offset = 1; - set GLOBAL auto_increment_offset = 2; -drop user user1; -drop user user2; -disconnect node_2; -disconnect node_1; # End of test 
diff --cc mysql-test/suite/galera/t/galera_var_dirty_reads.test index 152c875a946,8fd3b1d22f2..df4c033ab3d --- a/mysql-test/suite/galera/t/galera_var_dirty_reads.test +++ b/mysql-test/suite/galera/t/galera_var_dirty_reads.test @@@ -5,6 -5,14 +5,11 @@@ --source include/galera_cluster.inc --source include/have_innodb.inc ---disable_query_log + # Save original auto_increment_offset values. ---connection node_1 -let $auto_increment_offset_node_1 = `SELECT @@global.auto_increment_offset`; ---connection node_2 -let $auto_increment_offset_node_2 = `SELECT @@global.auto_increment_offset`; ---enable_query_log ++--let $node_1=node_1 ++--let $node_2=node_2 ++--source include/auto_increment_offset_save.inc + --connection node_2 --let $wsrep_cluster_address_saved = `SELECT @@global.wsrep_cluster_address` @@@ -64,10 -114,16 +69,8 @@@ USE test SELECT * FROM t1; # Cleanup DROP TABLE t1; - set GLOBAL auto_increment_offset = 1; -drop user user1; -drop user user2; ---disable_query_log -# Restore original auto_increment_offset values. 
---connection node_1 ---eval SET @@global.auto_increment_offset = $auto_increment_offset_node_1; ----connection node_2 - set GLOBAL auto_increment_offset = 2; ---eval SET @@global.auto_increment_offset = $auto_increment_offset_node_2; ---enable_query_log ++--source include/auto_increment_offset_restore.inc --source include/galera_end.inc --echo # End of test diff --cc mysql-test/suite/parts/r/partition_alter_maria.result index fd09c0bd4bb,d79bc0a41fe..7d923570cfe --- a/mysql-test/suite/parts/r/partition_alter_maria.result +++ b/mysql-test/suite/parts/r/partition_alter_maria.result @@@ -16,16 -16,16 +16,25 @@@ select * from t1 pk dt 1 2017-09-28 15:12:00 drop table t1; + create table t1 (a int) engine=Aria transactional=1 partition by hash(a) partitions 2; + show create table t1; + Table Create Table + t1 CREATE TABLE `t1` ( + `a` int(11) DEFAULT NULL + ) ENGINE=Aria DEFAULT CHARSET=latin1 TRANSACTIONAL=1 + /*!50100 PARTITION BY HASH (a) + PARTITIONS 2 */ + drop table t1; # +# MDEV-14641 Incompatible key or row definition between the MariaDB .frm file and the information in the storage engine +# +CREATE TABLE t1 (i INT) ENGINE=Aria PARTITION BY LIST(i) (PARTITION p0 VALUES IN (1), PARTITION p1 VALUES IN (2));; +ALTER TABLE t1 ROW_FORMAT=COMPRESSED; +ALTER TABLE t1 DROP PARTITION p1; +SELECT * FROM t1; +i +DROP TABLE t1; +# # MDEV-13788 Server crash when issuing bad SQL partition syntax # CREATE TABLE t1 (id int, d date) ENGINE=Aria PARTITION BY RANGE COLUMNS(d) (PARTITION p1 VALUES LESS THAN (MAXVALUE)); diff --cc mysql-test/t/func_misc.test index dc7202268d6,c21630c0c7b..4afed7d6f6e --- a/mysql-test/t/func_misc.test +++ b/mysql-test/t/func_misc.test @@@ -596,6 -596,22 +596,18 @@@ AND 57813X540X1723 = 'Test' drop table t1; + + --echo # + --echo # MDEV-15630 uuid() function evaluates at wrong time in query + --echo # + + CREATE TABLE t1 (id INT); + INSERT INTO t1 VALUES (1),(2),(3); + --replace_column 2 xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + SELECT COUNT(1), UUID() 
as uid FROM t1 GROUP BY uid; + DROP TABLE t1; + + ---echo # ---echo # End of 5.5 tests ---echo # - # # Bug#12735545 - PARSER STACK OVERFLOW WITH NAME_CONST # CONTAINING OR EXPRESSION diff --cc scripts/wsrep_sst_xtrabackup-v2.sh index 26119af2c61,00d8fe21113..f107cea6c74 --- a/scripts/wsrep_sst_xtrabackup-v2.sh +++ b/scripts/wsrep_sst_xtrabackup-v2.sh @@@ -1045,9 -863,9 +1045,9 @@@ the wsrep_log_info "Cleaning the existing datadir and innodb-data/log directories" - find $ib_home_dir $ib_log_dir $ib_undo_dir $DATA -mindepth 1 -regex $cpat -prune -o -exec rm -rfv {} 1>&2 \+ + find $ib_home_dir $ib_log_dir $ib_undo_dir $DATA -mindepth 1 -prune -regex $cpat -o -exec rm -rfv {} 1>&2 \+ - tempdir=$(parse_cnf mysqld log-bin "") + tempdir=$(parse_cnf --mysqld log-bin "") if [[ -n ${tempdir:-} ]];then binlog_dir=$(dirname $tempdir) binlog_file=$(basename $tempdir) diff --cc sql/event_data_objects.cc index e7bdc42b2e6,0cb123451df..aa85b570a84 --- a/sql/event_data_objects.cc +++ b/sql/event_data_objects.cc @@@ -1469,29 -1466,38 +1469,28 @@@ end saved_master_access= thd->security_ctx->master_access; thd->security_ctx->master_access |= SUPER_ACL; + bool save_tx_read_only= thd->tx_read_only; + thd->tx_read_only= false; #ifdef WITH_WSREP - if (WSREP(thd)) { - // sql_print_information("sizeof(LEX) = %d", sizeof(struct LEX)); - // sizeof(LEX) = 4512, so it's relatively safe to allocate it on stack. - LEX lex; - LEX* saved = thd->lex; - lex.sql_command = SQLCOM_DROP_EVENT; - thd->lex = &lex; - WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL); - thd->lex = saved; - /* - This code is processing event execution and does not have client - connection. 
Here, event execution will now execute a prepared - DROP EVENT statement, but thd->lex->sql_command is set to - SQLCOM_CREATE_PROCEDURE - DROP EVENT will be logged in binlog, and we have to - replicate it to make all nodes have consistent event definitions - Wsrep DDL replication is triggered inside Events::drop_event(), - and here we need to prepare the THD so that DDL replication is - possible, essentially it requires setting sql_command to - SQLCOMM_DROP_EVENT, we will switch sql_command for the duration - of DDL replication only. - */ - const enum_sql_command sql_command_save= thd->lex->sql_command; + const bool sql_command_set= WSREP(thd); - if (WSREP(thd)) - { ++ const enum_sql_command sql_command_save= thd->lex->sql_command; ++ ++ if (sql_command_set) { + thd->lex->sql_command = SQLCOM_DROP_EVENT; } #endif - + ret= Events::drop_event(thd, dbname, name, FALSE); #ifdef WITH_WSREP - WSREP_TO_ISOLATION_END; - error: + if (sql_command_set) + { + WSREP_TO_ISOLATION_END; + thd->lex->sql_command = sql_command_save; + } #endif + thd->tx_read_only= save_tx_read_only; thd->security_ctx->master_access= saved_master_access; } } diff --cc sql/events.cc index dd4e4887d50,a6379ec5a46..661d9e19001 --- a/sql/events.cc +++ b/sql/events.cc @@@ -405,10 -401,16 +406,14 @@@ Events::create_event(THD *thd, Event_pa } } } - /* Restore the state of binlog format */ - DBUG_ASSERT(!thd->is_current_stmt_binlog_format_row()); - if (save_binlog_row_based) - thd->set_current_stmt_binlog_format_row(); + + thd->restore_stmt_binlog_format(save_binlog_format); DBUG_RETURN(ret); + #ifdef WITH_WSREP + error: + DBUG_RETURN(TRUE); + #endif /* WITH_WSREP */ } @@@ -517,9 -521,16 +523,13 @@@ Events::update_event(THD *thd, Event_pa ret= write_bin_log(thd, TRUE, thd->query(), thd->query_length()); } } - /* Restore the state of binlog format */ - DBUG_ASSERT(!thd->is_current_stmt_binlog_format_row()); - if (save_binlog_row_based) - thd->set_current_stmt_binlog_format_row(); + 
thd->restore_stmt_binlog_format(save_binlog_format); DBUG_RETURN(ret); + #ifdef WITH_WSREP + error: + DBUG_RETURN(TRUE); + #endif /* WITH_WSREP */ } @@@ -578,9 -591,15 +589,13 @@@ Events::drop_event(THD *thd, LEX_STRIN DBUG_ASSERT(thd->query() && thd->query_length()); ret= write_bin_log(thd, TRUE, thd->query(), thd->query_length()); } - /* Restore the state of binlog format */ - DBUG_ASSERT(!thd->is_current_stmt_binlog_format_row()); - if (save_binlog_row_based) - thd->set_current_stmt_binlog_format_row(); + + thd->restore_stmt_binlog_format(save_binlog_format); DBUG_RETURN(ret); + #ifdef WITH_WSREP + error: + DBUG_RETURN(TRUE); + #endif /* WITH_WSREP */ } diff --cc sql/handler.cc index 657cb01cbc8,7da373e6802..fc8bb53f35d --- a/sql/handler.cc +++ b/sql/handler.cc @@@ -4453,19 -3838,13 +4453,20 @@@ handler::ha_create(const char *name, TA */ int -handler::ha_create_handler_files(const char *name, const char *old_name, - int action_flag, HA_CREATE_INFO *info) +handler::ha_create_partitioning_metadata(const char *name, const char *old_name, + int action_flag) { - if (!opt_readonly || !info || !(info->options & HA_LEX_CREATE_TMP_TABLE)) - mark_trx_read_write(); + /* + Normally this is done when unlocked, but in fast_alter_partition_table, + it is done on an already locked handler when preparing to alter/rename + partitions. 
+ */ + DBUG_ASSERT(m_lock_type == F_UNLCK || + (!old_name && strcmp(name, table_share->path.str))); + - return create_handler_files(name, old_name, action_flag, info); + mark_trx_read_write(); + + return create_partitioning_metadata(name, old_name, action_flag); } diff --cc sql/item_cmpfunc.h index 6d81c7acc40,3c8cc71370d..6cd7e0e3e78 --- a/sql/item_cmpfunc.h +++ b/sql/item_cmpfunc.h @@@ -272,11 -268,13 +273,15 @@@ public virtual void get_cache_parameters(List<Item> &parameters); bool is_top_level_item(); bool eval_not_null_tables(uchar *opt_arg); - void fix_after_pullout(st_select_lex *new_parent, Item **ref); + void fix_after_pullout(st_select_lex *new_parent, Item **ref, bool merge); + bool invisible_mode(); + void reset_cache() { cache= NULL; } virtual void print(String *str, enum_query_type query_type); void restore_first_argument(); + Item* get_wrapped_in_subselect_item() + { + return args[1]; + } }; class Comp_creator diff --cc sql/item_func.h index 7dea193c99b,57818228b98..b0ba87b4bd0 --- a/sql/item_func.h +++ b/sql/item_func.h @@@ -73,7 -66,7 +73,7 @@@ public NOW_FUNC, TRIG_COND_FUNC, SUSERVAR_FUNC, GUSERVAR_FUNC, COLLATE_FUNC, EXTRACT_FUNC, CHAR_TYPECAST_FUNC, FUNC_SP, UDF_FUNC, - NEG_FUNC, GSYSVAR_FUNC, DYNCOL_FUNC }; - NEG_FUNC, GSYSVAR_FUNC, IN_OPTIMIZER_FUNC }; ++ NEG_FUNC, GSYSVAR_FUNC, IN_OPTIMIZER_FUNC, DYNCOL_FUNC }; enum optimize_type { OPTIMIZE_NONE,OPTIMIZE_KEY,OPTIMIZE_OP, OPTIMIZE_NULL, OPTIMIZE_EQUAL }; enum Type type() const { return FUNC_ITEM; } diff --cc sql/log.cc index b63d72f0d4a,ca7833a0460..0098dd2ba3d --- a/sql/log.cc +++ b/sql/log.cc @@@ -8589,9 -7042,10 +8589,9 @@@ int TC_LOG_MMAP::open(const char *opt_n DBUG_ASSERT(opt_name && opt_name[0]); tc_log_page_size= my_getpagesize(); - DBUG_ASSERT(TC_LOG_PAGE_SIZE % tc_log_page_size == 0); fn_format(logname,opt_name,mysql_data_home,"",MY_UNPACK_FILENAME); - if ((fd= mysql_file_open(key_file_tclog, logname, O_RDWR, MYF(0))) < 0) + if ((fd= mysql_file_open(key_file_tclog, logname, O_RDWR |
O_CLOEXEC, MYF(0))) < 0) { if (my_errno != ENOENT) goto err; diff --cc sql/log_event.cc index c57331df807,12489d6d7eb..e799f37ddae --- a/sql/log_event.cc +++ b/sql/log_event.cc @@@ -4314,38 -3832,22 +4347,38 @@@ int Query_log_event::do_apply_event(rpl } else thd->variables.collation_database= thd->db_charset; - + + /* + Record any GTID in the same transaction, so slave state is + transactionally consistent. + */ + if (current_stmt_is_commit) { - const CHARSET_INFO *cs= thd->charset(); - /* - We cannot ask for parsing a statement using a character set - without state_maps (parser internal data). - */ - if (!cs->state_map) + thd->variables.option_bits&= ~OPTION_GTID_BEGIN; + if (rgi->gtid_pending) { - rli->report(ERROR_LEVEL, ER_SLAVE_FATAL_ERROR, - ER_THD(thd, ER_SLAVE_FATAL_ERROR), - "character_set cannot be parsed"); - thd->is_slave_error= true; - goto end; - } - } + sub_id= rgi->gtid_sub_id; + rgi->gtid_pending= false; + + gtid= rgi->current_gtid; + if (rpl_global_gtid_slave_state->record_gtid(thd, >id, sub_id, + true, false)) + { + int errcode= thd->get_stmt_da()->sql_errno(); + if (!is_parallel_retry_error(rgi, errcode)) + rli->report(ERROR_LEVEL, ER_CANNOT_UPDATE_GTID_STATE, + rgi->gtid_info(), + "Error during COMMIT: failed to update GTID state in " + "%s.%s: %d: %s", + "mysql", rpl_gtid_slave_state_table_name.str, + errcode, + thd->get_stmt_da()->message()); + sub_id= 0; + thd->is_slave_error= 1; + goto end; + } + } + } thd->table_map_for_update= (table_map)table_map_for_update; thd->set_invoker(&user, &host); @@@ -7703,7 -6540,12 +7752,12 @@@ User_var_log_event(const char* buf, uin Old events will not have this extra byte, thence, we keep the flags set to UNDEF_F. 
*/ - uint bytes_read= ((val + val_len) - start); + uint bytes_read= ((val + val_len) - buf_start); + if (bytes_read > event_len) + { + error= true; + goto err; + } if ((data_written - bytes_read) > 0) { flags= (uint) *(buf + UV_VAL_IS_NULL + UV_VAL_TYPE_SIZE + diff --cc sql/mysqld.cc index 8575709203c,4acfe57c684..f558b78104f --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@@ -1110,19 -962,10 +1110,20 @@@ static PSI_cond_info all_server_conds[] { &key_COND_wsrep_sst_init, "COND_wsrep_sst_init", PSI_FLAG_GLOBAL}, { &key_COND_wsrep_sst_thread, "wsrep_sst_thread", 0}, { &key_COND_wsrep_rollback, "COND_wsrep_rollback", PSI_FLAG_GLOBAL}, + { &key_COND_wsrep_thd, "THD::COND_wsrep_thd", 0}, { &key_COND_wsrep_replaying, "COND_wsrep_replaying", PSI_FLAG_GLOBAL}, #endif - { &key_COND_flush_thread_cache, "COND_flush_thread_cache", PSI_FLAG_GLOBAL} + { &key_COND_flush_thread_cache, "COND_flush_thread_cache", PSI_FLAG_GLOBAL}, + { &key_COND_rpl_thread, "COND_rpl_thread", 0}, + { &key_COND_rpl_thread_queue, "COND_rpl_thread_queue", 0}, + { &key_COND_rpl_thread_stop, "COND_rpl_thread_stop", 0}, + { &key_COND_rpl_thread_pool, "COND_rpl_thread_pool", 0}, + { &key_COND_parallel_entry, "COND_parallel_entry", 0}, + { &key_COND_group_commit_orderer, "COND_group_commit_orderer", 0}, + { &key_COND_prepare_ordered, "COND_prepare_ordered", 0}, + { &key_COND_slave_init, "COND_slave_init", 0}, + { &key_COND_wait_gtid, "COND_wait_gtid", 0}, + { &key_COND_gtid_ignore_duplicates, "COND_gtid_ignore_duplicates", 0} }; PSI_thread_key key_thread_bootstrap, key_thread_delayed_insert, diff --cc sql/mysqld.h index 4af04a3df75,91fa2eda7fd..3bb9f35077e --- a/sql/mysqld.h +++ b/sql/mysqld.h @@@ -245,12 -219,12 +245,13 @@@ extern pthread_key(MEM_ROOT**,THR_MALLO #ifdef HAVE_PSI_INTERFACE #ifdef HAVE_MMAP extern PSI_mutex_key key_PAGE_lock, key_LOCK_sync, key_LOCK_active, - key_LOCK_pool; + key_LOCK_pool, key_LOCK_pending_checkpoint; #endif /* HAVE_MMAP */ + #ifdef WITH_WSREP extern PSI_mutex_key 
key_LOCK_wsrep_thd; - #endif /* WITH_WSREP */ + extern PSI_cond_key key_COND_wsrep_thd; + #endif /* HAVE_WSREP */ #ifdef HAVE_OPENSSL extern PSI_mutex_key key_LOCK_des_key_file; diff --cc sql/slave.cc index a633722db16,f370e3dd27f..3dee39ad65f --- a/sql/slave.cc +++ b/sql/slave.cc @@@ -4936,39 -3778,37 +4936,39 @@@ err_during_init to avoid unneeded position re-init */ thd->temporary_tables = 0; // remove tempation from destructor to close them - DBUG_ASSERT(thd->net.buff != 0); - net_end(&thd->net); // destructor will not free it, because we are weird - DBUG_ASSERT(rli->sql_thd == thd); THD_CHECK_SENTRY(thd); - rli->sql_thd= 0; - set_thd_in_use_temporary_tables(rli); // (re)set sql_thd in use for saved temp tables + rli->sql_driver_thd= 0; mysql_mutex_lock(&LOCK_thread_count); - THD_CHECK_SENTRY(thd); - delete thd; + thd->rgi_fake= thd->rgi_slave= NULL; + delete serial_rgi; mysql_mutex_unlock(&LOCK_thread_count); + #ifdef WITH_WSREP - /* if slave stopped due to node going non primary, we set global flag to - trigger automatic restart of slave when node joins back to cluster + /* + If slave stopped due to node going non primary, we set global flag to + trigger automatic restart of slave when node joins back to cluster. 
*/ - if (wsrep_node_dropped && wsrep_restart_slave) - { - if (wsrep_ready_get()) - { - WSREP_INFO("Slave error due to node temporarily non-primary" - "SQL slave will continue"); - wsrep_node_dropped= FALSE; - mysql_mutex_unlock(&rli->run_lock); - goto wsrep_restart_point; - } else { - WSREP_INFO("Slave error due to node going non-primary"); - WSREP_INFO("wsrep_restart_slave was set and therefore slave will be " - "automatically restarted when node joins back to cluster"); - wsrep_restart_slave_activated= TRUE; - } - } + if (wsrep_node_dropped && wsrep_restart_slave) + { - if (wsrep_ready) ++ if (wsrep_ready_get()) + { + WSREP_INFO("Slave error due to node temporarily non-primary" + "SQL slave will continue"); + wsrep_node_dropped= FALSE; + mysql_mutex_unlock(&rli->run_lock); + WSREP_DEBUG("wsrep_conflict_state now: %d", thd->wsrep_conflict_state); + WSREP_INFO("slave restart: %d", thd->wsrep_conflict_state); + thd->wsrep_conflict_state= NO_CONFLICT; + goto wsrep_restart_point; + } else { + WSREP_INFO("Slave error due to node going non-primary"); + WSREP_INFO("wsrep_restart_slave was set and therefore slave will be " + "automatically restarted when node joins back to cluster."); + wsrep_restart_slave_activated= TRUE; + } + } #endif /* WITH_WSREP */ + /* Note: the order of the broadcast and unlock calls below (first broadcast, then unlock) is important. Otherwise a killer_thread can execute between the calls and diff --cc sql/sql_class.cc index 2502962cef0,ce875ba87ef..b3d964d4006 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@@ -4544,251 -4305,27 +4549,251 @@@ extern "C" int thd_slave_thread(const M return(thd->slave_thread); } -extern "C" int thd_non_transactional_update(const MYSQL_THD thd) +/* Returns true for a worker thread in parallel replication. 
*/ +extern "C" int thd_rpl_is_parallel(const MYSQL_THD thd) { - return(thd->transaction.all.modified_non_trans_table); + return thd->rgi_slave && thd->rgi_slave->is_parallel_exec; } -extern "C" int thd_binlog_format(const MYSQL_THD thd) -{ -#ifdef WITH_WSREP - if (WSREP(thd)) - { - /* for wsrep binlog format is meaningful also when binlogging is off */ - return (int) WSREP_BINLOG_FORMAT(thd->variables.binlog_format); - } -#endif /* WITH_WSREP */ - if (mysql_bin_log.is_open() && (thd->variables.option_bits & OPTION_BIN_LOG)) - return (int) thd->variables.binlog_format; - else - return BINLOG_FORMAT_UNSPEC; -} +/* + This function can optionally be called to check if thd_report_wait_for() + needs to be called for waits done by a given transaction. -extern "C" void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all) + If this function returns false for a given thd, there is no need to do any + calls to thd_report_wait_for() on that thd. + + This call is optional; it is safe to call thd_report_wait_for() in any case. + This call can be used to save some redundant calls to thd_report_wait_for() + if desired. (This is unlikely to matter much unless there are _lots_ of + waits to report, as the overhead of thd_report_wait_for() is small). +*/ +extern "C" int +thd_need_wait_for(const MYSQL_THD thd) +{ + rpl_group_info *rgi; + + if (mysql_bin_log.is_open() && opt_binlog_commit_wait_count > 0) + return true; + if (!thd) + return false; + rgi= thd->rgi_slave; + if (!rgi) + return false; + return rgi->is_parallel_exec; +} + +/* + Used by InnoDB/XtraDB to report that one transaction THD is about to go to + wait for a transactional lock held by another transactions OTHER_THD. + + This is used for parallel replication, where transactions are required to + commit in the same order on the slave as they did on the master. If the + transactions on the slave encounters lock conflicts on the slave that did + not exist on the master, this can cause deadlocks. 
+ + Normally, such conflicts will not occur, because the same conflict would + have prevented the two transactions from committing in parallel on the + master, thus preventing them from running in parallel on the slave in the + first place. However, it is possible in case when the optimizer chooses a + different plan on the slave than on the master (eg. table scan instead of + index scan). + + InnoDB/XtraDB reports lock waits using this call. If a lock wait causes a + deadlock with the pre-determined commit order, we kill the later transaction, + and later re-try it, to resolve the deadlock. + + This call need only receive reports about waits for locks that will remain + until the holding transaction commits. InnoDB/XtraDB auto-increment locks + are released earlier, and so need not be reported. (Such false positives are + not harmful, but could lead to unnecessary kill and retry, so best avoided). +*/ +extern "C" void +thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd) +{ + rpl_group_info *rgi; + rpl_group_info *other_rgi; + + if (!thd || !other_thd) + return; + binlog_report_wait_for(thd, other_thd); + rgi= thd->rgi_slave; + other_rgi= other_thd->rgi_slave; + if (!rgi || !other_rgi) + return; + if (!rgi->is_parallel_exec) + return; + if (rgi->rli != other_rgi->rli) + return; + if (!rgi->gtid_sub_id || !other_rgi->gtid_sub_id) + return; + if (rgi->current_gtid.domain_id != other_rgi->current_gtid.domain_id) + return; + if (rgi->gtid_sub_id > other_rgi->gtid_sub_id) + return; + /* + This transaction is about to wait for another transaction that is required + by replication binlog order to commit after. This would cause a deadlock. + + So send a kill to the other transaction, with a temporary error; this will + cause replication to rollback (and later re-try) the other transaction, + releasing the lock for this transaction so replication can proceed. 
+ */ + other_rgi->killed_for_retry= true; + mysql_mutex_lock(&other_thd->LOCK_thd_data); + other_thd->awake(KILL_CONNECTION); + mysql_mutex_unlock(&other_thd->LOCK_thd_data); +} + +/* + This function is called from InnoDB/XtraDB to check if the commit order of + two transactions has already been decided by the upper layer. This happens + in parallel replication, where the commit order is forced to be the same on + the slave as it was originally on the master. + + If this function returns false, it means that such commit order will be + enforced. This allows the storage engine to optionally omit gap lock waits + or similar measures that would otherwise be needed to ensure that + transactions would be serialised in a way that would cause a commit order + that is correct for binlogging for statement-based replication. + + Since transactions are only run in parallel on the slave if they ran without + lock conflicts on the master, normally no lock conflicts on the slave happen + during parallel replication. However, there are a couple of corner cases + where it can happen, like these secondary-index operations: + + T1: INSERT INTO t1 VALUES (7, NULL); + T2: DELETE FROM t1 WHERE b <= 3; + + T1: UPDATE t1 SET secondary=NULL WHERE primary=1 + T2: DELETE t1 WHERE secondary <= 3 + + The DELETE takes a gap lock that can block the INSERT/UPDATE, but the row + locks set by INSERT/UPDATE do not block the DELETE. Thus, the execution + order of the transactions determine whether a lock conflict occurs or + not. Thus a lock conflict can occur on the slave where it did not on the + master. + + If this function returns true, normal locking should be done as required by + the binlogging and transaction isolation level in effect. But if it returns + false, the correct order will be enforced anyway, and InnoDB/XtraDB can + avoid taking the gap lock, preventing the lock conflict. + + Calling this function is just an optimisation to avoid unnecessary + deadlocks. 
If it was not used, a gap lock would be set that could eventually + cause a deadlock; the deadlock would be caught by thd_report_wait_for() and + the transaction T2 killed and rolled back (and later re-tried). +*/ +extern "C" int +thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd) +{ + rpl_group_info *rgi, *other_rgi; + + DBUG_EXECUTE_IF("disable_thd_need_ordering_with", return 1;); + if (!thd || !other_thd) + return 1; + rgi= thd->rgi_slave; + other_rgi= other_thd->rgi_slave; + if (!rgi || !other_rgi) + return 1; + if (!rgi->is_parallel_exec) + return 1; + if (rgi->rli != other_rgi->rli) + return 1; + if (rgi->current_gtid.domain_id != other_rgi->current_gtid.domain_id) + return 1; + if (!rgi->commit_id || rgi->commit_id != other_rgi->commit_id) + return 1; + DBUG_EXECUTE_IF("thd_need_ordering_with_force", return 1;); + /* + Otherwise, these two threads are doing parallel replication within the same + replication domain. Their commit order is already fixed, so we do not need + gap locks or similar to otherwise enforce ordering (and in fact such locks + could lead to unnecessary deadlocks and transaction retry). + */ + return 0; +} + + +/* + If the storage engine detects a deadlock, and needs to choose a victim + transaction to roll back, it can call this function to ask the upper + server layer for which of two possible transactions is prefered to be + aborted and rolled back. + + In parallel replication, if two transactions are running in parallel and + one is fixed to commit before the other, then the one that commits later + will be prefered as the victim - chosing the early transaction as a victim + will not resolve the deadlock anyway, as the later transaction still needs + to wait for the earlier to commit. + + Otherwise, a transaction that uses only transactional tables, and can thus + be safely rolled back, will be prefered as a deadlock victim over a + transaction that also modified non-transactional (eg. MyISAM) tables. 
+ + The return value is -1 if the first transaction is prefered as a deadlock + victim, 1 if the second transaction is prefered, or 0 for no preference (in + which case the storage engine can make the choice as it prefers). +*/ +extern "C" int +thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2) +{ + rpl_group_info *rgi1, *rgi2; + bool nontrans1, nontrans2; + + if (!thd1 || !thd2) + return 0; + + /* + If the transactions are participating in the same replication domain in + parallel replication, then request to select the one that will commit + later (in the fixed commit order from the master) as the deadlock victim. + */ + rgi1= thd1->rgi_slave; + rgi2= thd2->rgi_slave; + if (rgi1 && rgi2 && + rgi1->is_parallel_exec && + rgi1->rli == rgi2->rli && + rgi1->current_gtid.domain_id == rgi2->current_gtid.domain_id) + return rgi1->gtid_sub_id < rgi2->gtid_sub_id ? 1 : -1; + + /* + If one transaction has modified non-transactional tables (so that it + cannot be safely rolled back), and the other has not, then prefer to + select the purely transactional one as the victim. + */ + nontrans1= thd1->transaction.all.modified_non_trans_table; + nontrans2= thd2->transaction.all.modified_non_trans_table; + if (nontrans1 && !nontrans2) + return 1; + else if (!nontrans1 && nontrans2) + return -1; + + /* No preferences, let the storage engine decide. 
*/ + return 0; +} + + +extern "C" int thd_non_transactional_update(const MYSQL_THD thd) +{ + return(thd->transaction.all.modified_non_trans_table); +} + +extern "C" int thd_binlog_format(const MYSQL_THD thd) +{ +#ifdef WITH_WSREP + if (WSREP(thd)) + { + /* for wsrep binlog format is meaningful also when binlogging is off */ + return (int) WSREP_FORMAT(thd->variables.binlog_format); + } +#endif /* WITH_WSREP */ + if (mysql_bin_log.is_open() && (thd->variables.option_bits & OPTION_BIN_LOG)) + return (int) thd->variables.binlog_format; + else - return BINLOG_FORMAT_UNSPEC; ++ return BINLOG_FORMAT_UNSPEC; +} + +extern "C" void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all) { DBUG_ASSERT(thd); thd->mark_transaction_to_rollback(all); diff --cc sql/sql_class.h index 0721252193d,cd1ac4fefd7..394575191e4 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@@ -2854,6 -2398,43 +2854,7 @@@ public query_id_t first_query_id; } binlog_evt_union; -#ifdef WITH_WSREP - const bool wsrep_applier; /* dedicated slave applier thread */ - bool wsrep_applier_closing; /* applier marked to close */ - bool wsrep_client_thread; /* to identify client threads*/ - enum wsrep_exec_mode wsrep_exec_mode; - query_id_t wsrep_last_query_id; - enum wsrep_query_state wsrep_query_state; - enum wsrep_conflict_state wsrep_conflict_state; - mysql_mutex_t LOCK_wsrep_thd; + mysql_cond_t COND_wsrep_thd; - // changed from wsrep_seqno_t to wsrep_trx_meta_t in wsrep API rev 75 - // wsrep_seqno_t wsrep_trx_seqno; - wsrep_trx_meta_t wsrep_trx_meta; - uint32 wsrep_rand; - Relay_log_info* wsrep_rli; - bool wsrep_converted_lock_session; - wsrep_ws_handle_t wsrep_ws_handle; -#ifdef WSREP_PROC_INFO - char wsrep_info[128]; /* string for dynamic proc info */ -#endif /* WSREP_PROC_INFO */ - ulong wsrep_retry_counter; // of autocommit - bool wsrep_PA_safe; - char* wsrep_retry_query; - size_t wsrep_retry_query_len; - enum enum_server_command wsrep_retry_command; - enum wsrep_consistency_check_mode - 
wsrep_consistency_check; - wsrep_stats_var* wsrep_status_vars; - int wsrep_mysql_replicated; - THD* wsrep_bf_thd; - const char* wsrep_TOI_pre_query; /* a query to apply before - the actual TOI query */ - size_t wsrep_TOI_pre_query_len; - void* wsrep_apply_format; - bool wsrep_apply_toi; /* applier processing in TOI */ - wsrep_gtid_t wsrep_sync_wait_gtid; -#endif /* WITH_WSREP */ /** Internal parser state. Note that since the parser is not re-entrant, we keep only one parser diff --cc sql/sql_insert.cc index af0321ce68f,64c9497fb7d..fa754d2da38 --- a/sql/sql_insert.cc +++ b/sql/sql_insert.cc @@@ -4324,18 -4304,47 +4324,47 @@@ bool select_create::send_eof( abort_result_set(); DBUG_RETURN(true); } - else + + /* + Do an implicit commit at end of statement for non-temporary + tables. This can fail, but we should unlock the table + nevertheless. + */ + if (!table->s->tmp_table) { - /* - Do an implicit commit at end of statement for non-temporary - tables. This can fail, but we should unlock the table - nevertheless. - */ - if (!table->s->tmp_table) - { + #ifdef WITH_WSREP + /* + append table level exclusive key for CTAS + */ + wsrep_key_arr_t key_arr= {0, 0}; + wsrep_prepare_keys_for_isolation(thd, + create_table->db, + create_table->table_name, + table_list, + &key_arr); + int rcode = wsrep->append_key( + wsrep, + &thd->wsrep_ws_handle, + key_arr.keys, //&wkey, + key_arr.keys_len, + WSREP_KEY_EXCLUSIVE, + false); + wsrep_keys_free(&key_arr); + if (rcode) { + DBUG_PRINT("wsrep", ("row key failed: %d", rcode)); + WSREP_ERROR("Appending table key for CTAS failed: %s, %d", + (wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void", rcode); + return true; + } + /* If commit fails, we should be able to reset the OK status. 
*/ - thd->stmt_da->can_overwrite_status= TRUE; ++ thd->get_stmt_da()->set_overwrite_status(TRUE); + #endif /* WITH_WSREP */ - trans_commit_stmt(thd); + trans_commit_stmt(thd); + if (!(thd->variables.option_bits & OPTION_GTID_BEGIN)) trans_commit_implicit(thd); #ifdef WITH_WSREP - thd->stmt_da->can_overwrite_status= FALSE; ++ thd->get_stmt_da()->set_overwrite_status(FALSE); mysql_mutex_lock(&thd->LOCK_wsrep_thd); if (thd->wsrep_conflict_state != NO_CONFLICT) { diff --cc sql/sql_parse.cc index 6fe25961e65,553a6e7539d..f60134b6162 --- a/sql/sql_parse.cc +++ b/sql/sql_parse.cc @@@ -1053,7 -851,7 +1053,7 @@@ bool do_command(THD *thd * bail out if DB snapshot has not been installed. We however, * allow queries "SET" and "SHOW", they are trapped later in execute_command */ - if (thd->variables.wsrep_on && !thd->wsrep_applier && !wsrep_ready && - if (thd->variables.wsrep_on && !thd->wsrep_applier && !wsrep_ready_get() && ++ if (thd->variables.wsrep_on && !thd->wsrep_applier && !wsrep_ready_get() && command != COM_QUERY && command != COM_PING && command != COM_QUIT && @@@ -2750,37 -2474,12 +2750,38 @@@ mysql_execute_command(THD *thd { WSREP_SYNC_WAIT(thd, WSREP_SYNC_WAIT_BEFORE_SHOW); execute_show_status(thd, all_tables); + -#ifdef WITH_WSREP - if (lex->sql_command == SQLCOM_SHOW_STATUS) wsrep_free_status(thd); -#endif /* WITH_WSREP */ break; } + case SQLCOM_SHOW_EXPLAIN: + { + if (!thd->security_ctx->priv_user[0] && + check_global_access(thd,PROCESS_ACL)) + break; + + /* + The select should use only one table, it's the SHOW EXPLAIN pseudo-table + */ + if (lex->sroutines.records || lex->query_tables->next_global) + { + my_message(ER_SET_CONSTANTS_ONLY, ER(ER_SET_CONSTANTS_ONLY), + MYF(0)); + goto error; + } + + Item **it= lex->value_list.head_ref(); + if (!(*it)->basic_const_item() || + (!(*it)->fixed && (*it)->fix_fields(lex->thd, it)) || + (*it)->check_cols(1)) + { + my_message(ER_SET_CONSTANTS_ONLY, ER(ER_SET_CONSTANTS_ONLY), + MYF(0)); + goto error; + } + } + /* fall 
through */ + case SQLCOM_SHOW_STATUS_PROC: + case SQLCOM_SHOW_STATUS_FUNC: case SQLCOM_SHOW_DATABASES: case SQLCOM_SHOW_TABLES: case SQLCOM_SHOW_TRIGGERS: @@@ -3782,8 -3325,7 +3783,8 @@@ end_with_restore_list case SQLCOM_INSERT_SELECT: { WSREP_SYNC_WAIT(thd, WSREP_SYNC_WAIT_BEFORE_INSERT_REPLACE); - select_result *sel_result; + select_insert *sel_result; + bool explain= MY_TEST(lex->describe); DBUG_ASSERT(first_table == all_tables && first_table != 0); if ((res= insert_precheck(thd, all_tables))) break; @@@ -4386,10 -3860,9 +4386,9 @@@ lex->spname->m_name); break; case SQLCOM_DROP_EVENT: - WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL) if (!(res= Events::drop_event(thd, lex->spname->m_db, lex->spname->m_name, - lex->drop_if_exists))) + lex->check_exists))) my_ok(thd); break; #else diff --cc sql/sql_plugin.cc index 013e00faeb9,81b59a5be90..b1ffa90dd2f --- a/sql/sql_plugin.cc +++ b/sql/sql_plugin.cc @@@ -2082,11 -2084,20 +2082,14 @@@ bool mysql_install_plugin(THD *thd, con bool error; int argc=orig_argc; char **argv=orig_argv; + unsigned long event_class_mask[MYSQL_AUDIT_CLASS_MASK_SIZE] = + { MYSQL_AUDIT_GENERAL_CLASSMASK }; DBUG_ENTER("mysql_install_plugin"); - if (opt_noacl) - { - my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--skip-grant-tables"); - DBUG_RETURN(TRUE); - } - tables.init_one_table("mysql", 5, "plugin", 6, "plugin", TL_WRITE); - if (check_table_access(thd, INSERT_ACL, &tables, FALSE, 1, FALSE)) + if (!opt_noacl && check_table_access(thd, INSERT_ACL, &tables, FALSE, 1, FALSE)) DBUG_RETURN(TRUE); + WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL); /* need to open before acquiring LOCK_plugin or it will deadlock */ if (! 
(table = open_ltable(thd, &tables, TL_WRITE, @@@ -2218,12 -2232,21 +2224,15 @@@ bool mysql_uninstall_plugin(THD *thd, c TABLE_LIST tables; LEX_STRING dl= *dl_arg; bool error= false; + unsigned long event_class_mask[MYSQL_AUDIT_CLASS_MASK_SIZE] = + { MYSQL_AUDIT_GENERAL_CLASSMASK }; DBUG_ENTER("mysql_uninstall_plugin"); - if (opt_noacl) - { - my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--skip-grant-tables"); - DBUG_RETURN(TRUE); - } - tables.init_one_table("mysql", 5, "plugin", 6, "plugin", TL_WRITE); - if (check_table_access(thd, DELETE_ACL, &tables, FALSE, 1, FALSE)) + if (!opt_noacl && check_table_access(thd, DELETE_ACL, &tables, FALSE, 1, FALSE)) DBUG_RETURN(TRUE); + WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL); /* need to open before acquiring LOCK_plugin or it will deadlock */ if (! (table= open_ltable(thd, &tables, TL_WRITE, MYSQL_LOCK_IGNORE_TIMEOUT))) diff --cc sql/sql_view.cc index 9fe4dd4849d,bbc5f002461..8fdd86535d1 --- a/sql/sql_view.cc +++ b/sql/sql_view.cc @@@ -429,18 -432,9 +429,19 @@@ bool mysql_create_view(THD *thd, TABLE_ lex->link_first_table_back(view, link_to_local); view->open_type= OT_BASE_ONLY; + WSREP_TO_ISOLATION_BEGIN(WSREP_MYSQL_DB, NULL, NULL); - if (open_and_lock_tables(thd, lex->query_tables, TRUE, 0)) + /* + ignore lock specs for CREATE statement + */ + if (lex->current_select->lock_type != TL_READ_DEFAULT) + { + lex->current_select->set_lock_for_tables(TL_READ_DEFAULT); + view->mdl_request.set_type(MDL_EXCLUSIVE); + } + + if (open_temporary_tables(thd, lex->query_tables) || + open_and_lock_tables(thd, lex->query_tables, TRUE, 0)) { view= lex->unlink_first_table(&link_to_local); res= TRUE; @@@ -686,8 -722,12 +687,12 @@@ lex->link_first_table_back(view, link_to_local); DBUG_RETURN(0); + #ifdef WITH_WSREP + error: + res= TRUE; + #endif /* WITH_WSREP */ err: - thd_proc_info(thd, "end"); + THD_STAGE_INFO(thd, stage_end); lex->link_first_table_back(view, link_to_local); unit->cleanup(); DBUG_RETURN(res || thd->is_error()); 
diff --cc sql/wsrep_hton.cc index a9dbc1a17c2,1676daab5fe..0a2264ac03c --- a/sql/wsrep_hton.cc +++ b/sql/wsrep_hton.cc @@@ -131,18 -115,30 +131,30 @@@ void wsrep_post_commit(THD* thd, bool a wsrep_cleanup_transaction(thd); break; } - case LOCAL_STATE: - { - /* - Non-InnoDB statements may have populated events in stmt cache => cleanup - */ - WSREP_DEBUG("cleanup transaction for LOCAL_STATE: %s", thd->query()); - wsrep_cleanup_transaction(thd); - break; - } - default: break; + case LOCAL_STATE: + { + /* non-InnoDB statements may have populated events in stmt cache + => cleanup + */ + WSREP_DEBUG("cleanup transaction for LOCAL_STATE"); + /* + Run post-rollback hook to clean up in the case if + some keys were populated for the transaction in provider + but during commit time there was no write set to replicate. + This may happen when client sets the SAVEPOINT and immediately + rolls back to savepoint after first operation. + */ + if (all && thd->wsrep_conflict_state != MUST_REPLAY && + wsrep && wsrep->post_rollback(wsrep, &thd->wsrep_ws_handle)) + { + WSREP_WARN("post_rollback fail: %llu %d", - (long long)thd->thread_id, thd->stmt_da->status()); ++ (long long)thd->thread_id, thd->get_stmt_da()->status()); + } + wsrep_cleanup_transaction(thd); + break; + } + default: break; } - } /* diff --cc sql/wsrep_mysqld.cc index 49988287933,54fdf430f86..bd397a9a012 --- a/sql/wsrep_mysqld.cc +++ b/sql/wsrep_mysqld.cc @@@ -931,76 -1019,84 +932,76 @@@ static bool wsrep_prepare_key_for_isola } /* Prepare key list from db/table and table_list */ - static bool wsrep_prepare_keys_for_isolation(THD* thd, - const char* db, - const char* table, - const TABLE_LIST* table_list, - wsrep_key_arr_t* ka) + bool wsrep_prepare_keys_for_isolation(THD* thd, + const char* db, + const char* table, + const TABLE_LIST* table_list, + wsrep_key_arr_t* ka) { - ka->keys= 0; - ka->keys_len= 0; + ka->keys= 0; + ka->keys_len= 0; - extern TABLE* find_temporary_table(THD*, const TABLE_LIST*); - - if (db || table) 
+ if (db || table) + { + if (!(ka->keys= (wsrep_key_t*)my_malloc(sizeof(wsrep_key_t), MYF(0)))) { - TABLE_LIST tmp_table; - bzero((char*) &tmp_table,sizeof(tmp_table)); - tmp_table.table_name= (char*)db; - tmp_table.db= (char*)table; - if (!table || !find_temporary_table(thd, &tmp_table)) - { - if (!(ka->keys= (wsrep_key_t*)my_malloc(sizeof(wsrep_key_t), MYF(0)))) - { - WSREP_ERROR("Can't allocate memory for key_array"); - goto err; - } - ka->keys_len= 1; - if (!(ka->keys[0].key_parts= (wsrep_buf_t*) - my_malloc(sizeof(wsrep_buf_t)*2, MYF(0)))) - { - WSREP_ERROR("Can't allocate memory for key_parts"); - goto err; - } - ka->keys[0].key_parts_num= 2; - if (!wsrep_prepare_key_for_isolation( - db, table, - (wsrep_buf_t*)ka->keys[0].key_parts, - &ka->keys[0].key_parts_num)) - { - WSREP_ERROR("Preparing keys for isolation failed"); - goto err; - } - } + WSREP_ERROR("Can't allocate memory for key_array"); + goto err; + } + ka->keys_len= 1; + if (!(ka->keys[0].key_parts= (wsrep_buf_t*) + my_malloc(sizeof(wsrep_buf_t)*2, MYF(0)))) + { + WSREP_ERROR("Can't allocate memory for key_parts"); + goto err; + } + ka->keys[0].key_parts_num= 2; + if (!wsrep_prepare_key_for_isolation( + db, table, + (wsrep_buf_t*)ka->keys[0].key_parts, + &ka->keys[0].key_parts_num)) + { + WSREP_ERROR("Preparing keys for isolation failed (1)"); + goto err; } + } + + for (const TABLE_LIST* table= table_list; table; table= table->next_global) + { + wsrep_key_t* tmp; + if (ka->keys) + tmp= (wsrep_key_t*)my_realloc(ka->keys, + (ka->keys_len + 1) * sizeof(wsrep_key_t), + MYF(0)); + else + tmp= (wsrep_key_t*)my_malloc((ka->keys_len + 1) * sizeof(wsrep_key_t), MYF(0)); - for (const TABLE_LIST* table= table_list; table; table= table->next_global) + if (!tmp) { - if (!find_temporary_table(thd, table)) - { - wsrep_key_t* tmp; - tmp= (wsrep_key_t*)my_realloc( - ka->keys, (ka->keys_len + 1) * sizeof(wsrep_key_t), MYF(0)); - if (!tmp) - { - WSREP_ERROR("Can't allocate memory for key_array"); - goto err; - } - 
ka->keys= tmp; - if (!(ka->keys[ka->keys_len].key_parts= (wsrep_buf_t*) - my_malloc(sizeof(wsrep_buf_t)*2, MYF(0)))) - { - WSREP_ERROR("Can't allocate memory for key_parts"); - goto err; - } - ka->keys[ka->keys_len].key_parts_num= 2; - ++ka->keys_len; - if (!wsrep_prepare_key_for_isolation( - table->db, table->table_name, - (wsrep_buf_t*)ka->keys[ka->keys_len - 1].key_parts, - &ka->keys[ka->keys_len - 1].key_parts_num)) - { - WSREP_ERROR("Preparing keys for isolation failed"); - goto err; - } - } + WSREP_ERROR("Can't allocate memory for key_array"); + goto err; } - return true; + ka->keys= tmp; + if (!(ka->keys[ka->keys_len].key_parts= (wsrep_buf_t*) + my_malloc(sizeof(wsrep_buf_t)*2, MYF(0)))) + { + WSREP_ERROR("Can't allocate memory for key_parts"); + goto err; + } + ka->keys[ka->keys_len].key_parts_num= 2; + ++ka->keys_len; + if (!wsrep_prepare_key_for_isolation(table->db, table->table_name, + (wsrep_buf_t*)ka->keys[ka->keys_len - 1].key_parts, + &ka->keys[ka->keys_len - 1].key_parts_num)) + { + WSREP_ERROR("Preparing keys for isolation failed (2)"); + goto err; + } + } + return 0; err: wsrep_keys_free(ka); - return false; + return 1; } diff --cc sql/wsrep_mysqld.h index 6dabdb66022,56e3baae7cc..94c97f04aab --- a/sql/wsrep_mysqld.h +++ b/sql/wsrep_mysqld.h @@@ -139,9 -125,17 +139,10 @@@ extern const char* wsrep_provider_name extern const char* wsrep_provider_version; extern const char* wsrep_provider_vendor; -// Other wsrep global variables -extern my_bool wsrep_inited; // whether wsrep is initialized ? 
- int wsrep_show_status(THD *thd, SHOW_VAR *var, char *buff); + int wsrep_show_ready(THD *thd, SHOW_VAR *var, char *buff); void wsrep_free_status(THD *thd); -/* Filters out --wsrep-new-cluster oprtion from argv[] - * should be called in the very beginning of main() */ -void wsrep_filter_new_cluster (int* argc, char* argv[]); - int wsrep_init(); void wsrep_deinit(bool free_options); void wsrep_recover(); @@@ -255,8 -245,7 +256,9 @@@ extern wsrep_seqno_t wsrep_locked_seqno #define WSREP_PROVIDER_EXISTS \ (wsrep_provider && strncasecmp(wsrep_provider, WSREP_NONE, FN_REFLEN)) +#define WSREP_QUERY(thd) (thd->query()) + + extern my_bool wsrep_ready_get(); extern void wsrep_ready_wait(); enum wsrep_trx_status { @@@ -332,11 -316,23 +334,22 @@@ int wsrep_create_trigger_query(THD *thd int wsrep_create_event_query(THD *thd, uchar** buf, size_t* buf_len); int wsrep_alter_event_query(THD *thd, uchar** buf, size_t* buf_len); -struct xid_t; -void wsrep_set_SE_checkpoint(xid_t*); -void wsrep_get_SE_checkpoint(wsrep_uuid_t&, wsrep_seqno_t&); -void wsrep_xid_init(xid_t*, const wsrep_uuid_t*, wsrep_seqno_t); -const wsrep_uuid_t* wsrep_xid_uuid(const xid_t*); -wsrep_seqno_t wsrep_xid_seqno(const xid_t*); -extern "C" int wsrep_is_wsrep_xid(const void* xid); +#ifdef GTID_SUPPORT +void wsrep_init_sidno(const wsrep_uuid_t&); +#endif /* GTID_SUPPORT */ + +bool wsrep_node_is_donor(); +bool wsrep_node_is_synced(); + typedef struct wsrep_key_arr + { + wsrep_key_t* keys; + size_t keys_len; + } wsrep_key_arr_t; + bool wsrep_prepare_keys_for_isolation(THD* thd, + const char* db, + const char* table, + const TABLE_LIST* table_list, + wsrep_key_arr_t* ka); + void wsrep_keys_free(wsrep_key_arr_t* key_arr); #endif /* WSREP_MYSQLD_H */ diff --cc sql/wsrep_thd.cc index 307745ff1b0,4d665775f2d..328bcbd6be6 --- a/sql/wsrep_thd.cc +++ b/sql/wsrep_thd.cc @@@ -381,7 -287,7 +381,7 @@@ static void wsrep_replication_process(T case WSREP_TRX_MISSING: /* these suggests a bug in provider code */ WSREP_WARN("bad 
return from recv() call: %d", rcode); -- /* fall through to node shutdown */ ++ /* fall through */ case WSREP_FATAL: /* Cluster connectivity is lost. * diff --cc storage/heap/ha_heap.cc index c1dad6a9943,ec76d08bf97..29bf924dc26 --- a/storage/heap/ha_heap.cc +++ b/storage/heap/ha_heap.cc @@@ -91,16 -100,7 +91,7 @@@ ha_heap::ha_heap(handlerton *hton, TABL int ha_heap::open(const char *name, int mode, uint test_if_locked) { - if (table->s->reclength < sizeof (char*)) - { - MEM_UNDEFINED(table->s->default_values + table->s->reclength, - sizeof(char*) - table->s->reclength); - table->s->reclength= sizeof(char*); - MEM_UNDEFINED(table->record[0], table->s->reclength); - MEM_UNDEFINED(table->record[1], table->s->reclength); - } - - internal_table= test(test_if_locked & HA_OPEN_INTERNAL_TABLE); + internal_table= MY_TEST(test_if_locked & HA_OPEN_INTERNAL_TABLE); if (internal_table || (!(file= heap_open(name, mode)) && my_errno == ENOENT)) { HP_CREATE_INFO create_info; @@@ -723,7 -727,7 +714,7 @@@ heap_prepare_hp_create_info(TABLE *tabl } } } - mem_per_row+= MY_ALIGN(share->reclength + 1, sizeof(char*)); - mem_per_row+= MY_ALIGN(max(share->reclength, sizeof(char*)) + 1, sizeof(char*)); ++ mem_per_row+= MY_ALIGN(MY_MAX(share->reclength, sizeof(char*)) + 1, sizeof(char*)); if (table_arg->found_next_number_field) { keydef[share->next_number_index].flag|= HA_AUTO_KEY; diff --cc storage/heap/hp_create.c index d03c7c46f15,1daca0beeb5..29c031c466c --- a/storage/heap/hp_create.c +++ b/storage/heap/hp_create.c @@@ -58,9 -59,9 +59,9 @@@ int heap_create(const char *name, HP_CR /* We have to store sometimes uchar* del_link in records, - so the record length should be at least sizeof(uchar*) + so the visible_offset must be least at sizeof(uchar*) */ - set_if_bigger(reclength, sizeof (uchar*)); - visible_offset= max(reclength, sizeof (char*)); ++ visible_offset= MY_MAX(reclength, sizeof (char*)); for (i= key_segs= max_length= 0, keyinfo= keydef; i < keys; i++, keyinfo++) { diff --cc 
storage/innobase/handler/ha_innodb.cc index 5dbd7a1ca91,7aab200fed1..7e943782165 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@@ -1,10 -1,8 +1,10 @@@ /***************************************************************************** - Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved. + Copyright (c) 2000, 2018, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2012, Facebook Inc. - Copyright (c) 2013, 2017, MariaDB Corporation. ++Copyright (c) 2013, 2018, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@@ -1603,2924 -934,700 +1603,2928 @@@ innobase_release_temporary_latches if (!innodb_inited) { - return(0); + return(0); + } + + trx_t* trx = thd_to_trx(thd); + + if (trx != NULL) { + trx_search_latch_release_if_reserved(trx); + } + + return(0); +} + +/********************************************************************//** +Increments innobase_active_counter and every INNOBASE_WAKE_INTERVALth +time calls srv_active_wake_master_thread. This function should be used +when a single database operation may introduce a small need for +server utility activity, like checkpointing. */ +static inline +void +innobase_active_small(void) +/*=======================*/ +{ + innobase_active_counter++; + + if ((innobase_active_counter % INNOBASE_WAKE_INTERVAL) == 0) { + srv_active_wake_master_thread(); + } +} + +/********************************************************************//** +Converts an InnoDB error code to a MySQL error code and also tells to MySQL +about a possible transaction rollback inside InnoDB caused by a lock wait +timeout or a deadlock. 
+@return MySQL error code */ +static +int +convert_error_code_to_mysql( +/*========================*/ + dberr_t error, /*!< in: InnoDB error code */ + ulint flags, /*!< in: InnoDB table flags, or 0 */ + THD* thd) /*!< in: user thread handle or NULL */ +{ + switch (error) { + case DB_SUCCESS: + return(0); + + case DB_INTERRUPTED: + return(HA_ERR_ABORTED_BY_USER); + + case DB_FOREIGN_EXCEED_MAX_CASCADE: + ut_ad(thd); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_ROW_IS_REFERENCED, + "InnoDB: Cannot delete/update " + "rows with cascading foreign key " + "constraints that exceed max " + "depth of %d. Please " + "drop extra constraints and try " + "again", DICT_FK_MAX_RECURSIVE_LOAD); + + /* fall through */ + + case DB_ERROR: + default: + return(-1); /* unspecified error */ + + case DB_DUPLICATE_KEY: + /* Be cautious with returning this error, since + mysql could re-enter the storage layer to get + duplicated key info, the operation requires a + valid table handle and/or transaction information, + which might not always be available in the error + handling stage. */ + return(HA_ERR_FOUND_DUPP_KEY); + + case DB_READ_ONLY: + return(HA_ERR_TABLE_READONLY); + + case DB_FOREIGN_DUPLICATE_KEY: + return(HA_ERR_FOREIGN_DUPLICATE_KEY); + + case DB_MISSING_HISTORY: + return(HA_ERR_TABLE_DEF_CHANGED); + + case DB_RECORD_NOT_FOUND: + return(HA_ERR_NO_ACTIVE_RECORD); + + case DB_DEADLOCK: + /* Since we rolled back the whole transaction, we must + tell it also to MySQL so that MySQL knows to empty the + cached binlog for this transaction */ + + if (thd) { + thd_mark_transaction_to_rollback(thd, TRUE); + } + + return(HA_ERR_LOCK_DEADLOCK); + + case DB_LOCK_WAIT_TIMEOUT: + /* Starting from 5.0.13, we let MySQL just roll back the + latest SQL statement in a lock wait timeout. Previously, we + rolled back the whole transaction. 
*/ + + if (thd) { + thd_mark_transaction_to_rollback( + thd, (bool) row_rollback_on_timeout); + } + + return(HA_ERR_LOCK_WAIT_TIMEOUT); + + case DB_NO_REFERENCED_ROW: + return(HA_ERR_NO_REFERENCED_ROW); + + case DB_ROW_IS_REFERENCED: + return(HA_ERR_ROW_IS_REFERENCED); + + case DB_CANNOT_ADD_CONSTRAINT: + case DB_CHILD_NO_INDEX: + case DB_PARENT_NO_INDEX: + return(HA_ERR_CANNOT_ADD_FOREIGN); + + case DB_CANNOT_DROP_CONSTRAINT: + + return(HA_ERR_ROW_IS_REFERENCED); /* TODO: This is a bit + misleading, a new MySQL error + code should be introduced */ + + case DB_CORRUPTION: + return(HA_ERR_CRASHED); + + case DB_OUT_OF_FILE_SPACE: + return(HA_ERR_RECORD_FILE_FULL); + + case DB_TEMP_FILE_WRITE_FAILURE: + my_error(ER_GET_ERRMSG, MYF(0), + DB_TEMP_FILE_WRITE_FAILURE, + ut_strerr(DB_TEMP_FILE_WRITE_FAILURE), + "InnoDB"); + return(HA_ERR_INTERNAL_ERROR); + + case DB_TABLE_IN_FK_CHECK: + return(HA_ERR_TABLE_IN_FK_CHECK); + + case DB_TABLE_IS_BEING_USED: + return(HA_ERR_WRONG_COMMAND); + + case DB_TABLESPACE_DELETED: + case DB_TABLE_NOT_FOUND: + return(HA_ERR_NO_SUCH_TABLE); + + case DB_TABLESPACE_NOT_FOUND: + return(HA_ERR_NO_SUCH_TABLE); + + case DB_TOO_BIG_RECORD: { + /* If prefix is true then a 768-byte prefix is stored + locally for BLOB fields. Refer to dict_table_get_format() */ + bool prefix = (dict_tf_get_format(flags) == UNIV_FORMAT_A); + my_printf_error(ER_TOO_BIG_ROWSIZE, + "Row size too large (> %lu). Changing some columns " + "to TEXT or BLOB %smay help. In current row " + "format, BLOB prefix of %d bytes is stored inline.", + MYF(0), + page_get_free_space_of_empty(flags & + DICT_TF_COMPACT) / 2, + prefix ? "or using ROW_FORMAT=DYNAMIC " + "or ROW_FORMAT=COMPRESSED ": "", + prefix ? DICT_MAX_FIXED_COL_LEN : 0); + return(HA_ERR_TO_BIG_ROW); + } + + + case DB_TOO_BIG_FOR_REDO: + my_printf_error(ER_TOO_BIG_ROWSIZE, "%s" , MYF(0), + "The size of BLOB/TEXT data inserted" + " in one transaction is greater than" + " 10% of redo log size. 
Increase the" + " redo log size using innodb_log_file_size."); + return(HA_ERR_TO_BIG_ROW); + + case DB_TOO_BIG_INDEX_COL: + my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), + DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags)); + return(HA_ERR_INDEX_COL_TOO_LONG); + + case DB_NO_SAVEPOINT: + return(HA_ERR_NO_SAVEPOINT); + + case DB_LOCK_TABLE_FULL: + /* Since we rolled back the whole transaction, we must + tell it also to MySQL so that MySQL knows to empty the + cached binlog for this transaction */ + + if (thd) { + thd_mark_transaction_to_rollback(thd, TRUE); + } + + return(HA_ERR_LOCK_TABLE_FULL); + + case DB_FTS_INVALID_DOCID: + return(HA_FTS_INVALID_DOCID); + case DB_FTS_EXCEED_RESULT_CACHE_LIMIT: + return(HA_ERR_OUT_OF_MEM); + case DB_TOO_MANY_CONCURRENT_TRXS: + return(HA_ERR_TOO_MANY_CONCURRENT_TRXS); + case DB_UNSUPPORTED: + return(HA_ERR_UNSUPPORTED); + case DB_INDEX_CORRUPT: + return(HA_ERR_INDEX_CORRUPT); + case DB_UNDO_RECORD_TOO_BIG: + return(HA_ERR_UNDO_REC_TOO_BIG); + case DB_OUT_OF_MEMORY: + return(HA_ERR_OUT_OF_MEM); + case DB_TABLESPACE_EXISTS: + return(HA_ERR_TABLESPACE_EXISTS); + case DB_IDENTIFIER_TOO_LONG: + return(HA_ERR_INTERNAL_ERROR); + case DB_FTS_TOO_MANY_WORDS_IN_PHRASE: + return(HA_ERR_FTS_TOO_MANY_WORDS_IN_PHRASE); + } +} + +/*************************************************************//** +Prints info of a THD object (== user session thread) to the given file. */ +UNIV_INTERN +void +innobase_mysql_print_thd( +/*=====================*/ + FILE* f, /*!< in: output stream */ + THD* thd, /*!< in: MySQL THD object */ + uint max_query_len) /*!< in: max query length to print, or 0 to + use the default max length */ +{ + char buffer[1024]; + + fputs(thd_get_error_context_description(thd, buffer, sizeof buffer, + max_query_len), f); + putc('\n', f); +} + +/******************************************************************//** +Get the error message format string. +@return the format string or 0 if not found. 
*/ +UNIV_INTERN +const char* +innobase_get_err_msg( +/*=================*/ + int error_code) /*!< in: MySQL error code */ +{ + return(my_get_err_msg(error_code)); +} + +/******************************************************************//** +Get the variable length bounds of the given character set. */ +UNIV_INTERN +void +innobase_get_cset_width( +/*====================*/ + ulint cset, /*!< in: MySQL charset-collation code */ + ulint* mbminlen, /*!< out: minimum length of a char (in bytes) */ + ulint* mbmaxlen) /*!< out: maximum length of a char (in bytes) */ +{ + CHARSET_INFO* cs; + ut_ad(cset <= MAX_CHAR_COLL_NUM); + ut_ad(mbminlen); + ut_ad(mbmaxlen); + + cs = all_charsets[cset]; + if (cs) { + *mbminlen = cs->mbminlen; + *mbmaxlen = cs->mbmaxlen; + ut_ad(*mbminlen < DATA_MBMAX); + ut_ad(*mbmaxlen < DATA_MBMAX); + } else { + THD* thd = current_thd; + + if (thd && thd_sql_command(thd) == SQLCOM_DROP_TABLE) { + + /* Fix bug#46256: allow tables to be dropped if the + collation is not found, but issue a warning. */ + if ((global_system_variables.log_warnings) + && (cset != 0)){ + + sql_print_warning( + "Unknown collation #%lu.", cset); + } + } else { + + ut_a(cset == 0); + } + + *mbminlen = *mbmaxlen = 0; + } +} + +/******************************************************************//** +Converts an identifier to a table name. */ +UNIV_INTERN +void +innobase_convert_from_table_id( +/*===========================*/ + struct charset_info_st* cs, /*!< in: the 'from' character set */ + char* to, /*!< out: converted identifier */ + const char* from, /*!< in: identifier to convert */ + ulint len) /*!< in: length of 'to', in bytes */ +{ + uint errors; + + strconvert(cs, from, FN_REFLEN, &my_charset_filename, to, (uint) len, &errors); +} + +/********************************************************************** +Check if the length of the identifier exceeds the maximum allowed. +return true when length of identifier is too long. 
*/ +UNIV_INTERN +my_bool +innobase_check_identifier_length( +/*=============================*/ + const char* id) /* in: FK identifier to check excluding the + database portion. */ +{ + int well_formed_error = 0; + CHARSET_INFO *cs = system_charset_info; + DBUG_ENTER("innobase_check_identifier_length"); + + size_t len = cs->cset->well_formed_len( + cs, id, id + strlen(id), + NAME_CHAR_LEN, &well_formed_error); + + if (well_formed_error || len == NAME_CHAR_LEN) { + my_error(ER_TOO_LONG_IDENT, MYF(0), id); + DBUG_RETURN(true); + } + DBUG_RETURN(false); +} + +/******************************************************************//** +Converts an identifier to UTF-8. */ +UNIV_INTERN +void +innobase_convert_from_id( +/*=====================*/ + struct charset_info_st* cs, /*!< in: the 'from' character set */ + char* to, /*!< out: converted identifier */ + const char* from, /*!< in: identifier to convert */ + ulint len) /*!< in: length of 'to', in bytes */ +{ + uint errors; + + strconvert(cs, from, FN_REFLEN, system_charset_info, to, (uint) len, &errors); +} + +/******************************************************************//** +Compares NUL-terminated UTF-8 strings case insensitively. +@return 0 if a=b, <0 if a<b, >1 if a>b */ +UNIV_INTERN +int +innobase_strcasecmp( +/*================*/ + const char* a, /*!< in: first string to compare */ + const char* b) /*!< in: second string to compare */ +{ + if (!a) { + if (!b) { + return(0); + } else { + return(-1); + } + } else if (!b) { + return(1); + } + + return(my_strcasecmp(system_charset_info, a, b)); +} + +/******************************************************************//** +Compares NUL-terminated UTF-8 strings case insensitively. The +second string contains wildcards. 
+@return 0 if a match is found, 1 if not */ +UNIV_INTERN +int +innobase_wildcasecmp( +/*=================*/ + const char* a, /*!< in: string to compare */ + const char* b) /*!< in: wildcard string to compare */ +{ + return(wild_case_compare(system_charset_info, a, b)); +} + +/******************************************************************//** +Strip dir name from a full path name and return only the file name +@return file name or "null" if no file name */ +UNIV_INTERN +const char* +innobase_basename( +/*==============*/ + const char* path_name) /*!< in: full path name */ +{ + const char* name = base_name(path_name); + + return((name) ? name : "null"); +} + +/******************************************************************//** +Makes all characters in a NUL-terminated UTF-8 string lower case. */ +UNIV_INTERN +void +innobase_casedn_str( +/*================*/ + char* a) /*!< in/out: string to put in lower case */ +{ + my_casedn_str(system_charset_info, a); +} + +/**********************************************************************//** +Determines the connection character set. +@return connection character set */ +UNIV_INTERN +struct charset_info_st* +innobase_get_charset( +/*=================*/ + THD* mysql_thd) /*!< in: MySQL thread handle */ +{ + return(thd_charset(mysql_thd)); +} + +/**********************************************************************//** +Determines the current SQL statement. +@return SQL statement string */ +UNIV_INTERN +const char* +innobase_get_stmt( +/*==============*/ + THD* thd, /*!< in: MySQL thread handle */ + size_t* length) /*!< out: length of the SQL statement */ +{ + if (const LEX_STRING *stmt = thd_query_string(thd)) { + *length = stmt->length; + return stmt->str; + } + return NULL; +} + +/**********************************************************************//** +Get the current setting of the tdc_size global parameter. 
We do +a dirty read because for one there is no synchronization object and +secondly there is little harm in doing so even if we get a torn read. +@return value of tdc_size */ +UNIV_INTERN +ulint +innobase_get_table_cache_size(void) +/*===============================*/ +{ + return(tdc_size); +} + +/**********************************************************************//** +Get the current setting of the lower_case_table_names global parameter from +mysqld.cc. We do a dirty read because for one there is no synchronization +object and secondly there is little harm in doing so even if we get a torn +read. +@return value of lower_case_table_names */ +UNIV_INTERN +ulint +innobase_get_lower_case_table_names(void) +/*=====================================*/ +{ + return(lower_case_table_names); +} + +/** Create a temporary file in the location specified by the parameter +path. If the path is null, then it will be created in tmpdir. +@param[in] path location for creating temporary file +@return temporary file descriptor, or < 0 on error */ +UNIV_INTERN +int +innobase_mysql_tmpfile( + const char* path) +{ +#ifdef WITH_INNODB_DISALLOW_WRITES + os_event_wait(srv_allow_writes_event); +#endif /* WITH_INNODB_DISALLOW_WRITES */ + int fd2 = -1; + File fd; + + DBUG_EXECUTE_IF( + "innobase_tmpfile_creation_failure", + return(-1); + ); + + if (path == NULL) { + fd = mysql_tmpfile("ib"); + } else { + fd = mysql_tmpfile_path(path, "ib"); + } + + if (fd >= 0) { + /* Copy the file descriptor, so that the additional resources + allocated by create_temp_file() can be freed by invoking + my_close(). + + Because the file descriptor returned by this function + will be passed to fdopen(), it will be closed by invoking + fclose(), which in turn will invoke close() instead of + my_close(). */ + +#ifdef _WIN32 + /* Note that on Windows, the integer returned by mysql_tmpfile + has no relation to C runtime file descriptor. 
Here, we need + to call my_get_osfhandle to get the HANDLE and then convert it + to C runtime filedescriptor. */ + { + HANDLE hFile = my_get_osfhandle(fd); + HANDLE hDup; + BOOL bOK = DuplicateHandle( + GetCurrentProcess(), + hFile, GetCurrentProcess(), + &hDup, 0, FALSE, DUPLICATE_SAME_ACCESS); + if (bOK) { + fd2 = _open_osfhandle((intptr_t) hDup, 0); + } else { + my_osmaperr(GetLastError()); + fd2 = -1; + } + } ++#else ++#ifdef F_DUPFD_CLOEXEC ++ fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 0); +#else + fd2 = dup(fd); ++#endif +#endif + if (fd2 < 0) { + DBUG_PRINT("error",("Got error %d on dup",fd2)); + my_errno=errno; + my_error(EE_OUT_OF_FILERESOURCES, + MYF(ME_BELL+ME_WAITTANG), + "ib*", my_errno); + } + my_close(fd, MYF(MY_WME)); + } + return(fd2); +} + +/*********************************************************************//** +Wrapper around MySQL's copy_and_convert function. +@return number of bytes copied to 'to' */ +UNIV_INTERN +ulint +innobase_convert_string( +/*====================*/ + void* to, /*!< out: converted string */ + ulint to_length, /*!< in: number of bytes reserved + for the converted string */ + CHARSET_INFO* to_cs, /*!< in: character set to convert to */ + const void* from, /*!< in: string to convert */ + ulint from_length, /*!< in: number of bytes to convert */ + CHARSET_INFO* from_cs, /*!< in: character set to convert + from */ + uint* errors) /*!< out: number of errors encountered + during the conversion */ +{ + return(copy_and_convert( + (char*) to, (uint32) to_length, to_cs, + (const char*) from, (uint32) from_length, from_cs, + errors)); +} + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes +the result to "buf". The result is converted to "system_charset_info". +Not more than "buf_size" bytes are written to "buf". 
+The result is always NUL-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). +@return number of bytes that were written */ +UNIV_INTERN +ulint +innobase_raw_format( +/*================*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + ulint charset_coll, /*!< in: charset collation */ + char* buf, /*!< out: output buffer */ + ulint buf_size) /*!< in: output buffer size + in bytes */ +{ + /* XXX we use a hard limit instead of allocating + but_size bytes from the heap */ + CHARSET_INFO* data_cs; + char buf_tmp[8192]; + ulint buf_tmp_used; + uint num_errors; + + data_cs = all_charsets[charset_coll]; + + buf_tmp_used = innobase_convert_string(buf_tmp, sizeof(buf_tmp), + system_charset_info, + data, data_len, data_cs, + &num_errors); + + return(ut_str_sql_format(buf_tmp, buf_tmp_used, buf, buf_size)); +} + +/*********************************************************************//** +Compute the next autoinc value. + +For MySQL replication the autoincrement values can be partitioned among +the nodes. The offset is the start or origin of the autoincrement value +for a particular node. For n nodes the increment will be n and the offset +will be in the interval [1, n]. The formula tries to allocate the next +value for a particular node. + +Note: This function is also called with increment set to the number of +values we want to reserve for multi-value inserts e.g., + + INSERT INTO T VALUES(), (), (); + +innobase_next_autoinc() will be called with increment set to 3 where +autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for +the multi-value INSERT above. 
+@return the next value */ +UNIV_INTERN +ulonglong +innobase_next_autoinc( +/*==================*/ + ulonglong current, /*!< in: Current value */ + ulonglong need, /*!< in: count of values needed */ + ulonglong step, /*!< in: AUTOINC increment step */ + ulonglong offset, /*!< in: AUTOINC offset */ + ulonglong max_value) /*!< in: max value for type */ +{ + ulonglong next_value; + ulonglong block = need * step; + + /* Should never be 0. */ + ut_a(need > 0); + ut_a(block > 0); + ut_a(max_value > 0); + + /* + Allow auto_increment to go over max_value up to max ulonglong. + This allows us to detect that all values are exhausted. + If we don't do this, we will return max_value several times + and get duplicate key errors instead of auto increment value + out of range. + */ + max_value= (~(ulonglong) 0); + + /* According to MySQL documentation, if the offset is greater than + the step then the offset is ignored. */ + if (offset > block) { + offset = 0; + } + + /* Check for overflow. Current can be > max_value if the value is + in reality a negative value.The visual studio compilers converts + large double values automatically into unsigned long long datatype + maximum value */ + + if (block >= max_value + || offset > max_value + || current >= max_value + || max_value - offset <= offset) { + + next_value = max_value; + } else { + ut_a(max_value > current); + + ulonglong free = max_value - current; + + if (free < offset || free - offset <= block) { + next_value = max_value; + } else { + next_value = 0; + } + } + + if (next_value == 0) { + ulonglong next; + + if (current >= offset) { + next = (current - offset) / step; + } else { + next = 0; + block -= step; + } + + ut_a(max_value > next); + next_value = next * step; + /* Check for multiplication overflow. 
*/ + ut_a(next_value >= next); + ut_a(max_value > next_value); + + /* Check for overflow */ + if (max_value - next_value >= block) { + + next_value += block; + + if (max_value - next_value >= offset) { + next_value += offset; + } else { + next_value = max_value; + } + } else { + next_value = max_value; + } + } + + ut_a(next_value != 0); + ut_a(next_value <= max_value); + + return(next_value); +} + +/*********************************************************************//** +Initializes some fields in an InnoDB transaction object. */ +static +void +innobase_trx_init( +/*==============*/ + THD* thd, /*!< in: user thread handle */ + trx_t* trx) /*!< in/out: InnoDB transaction handle */ +{ + DBUG_ENTER("innobase_trx_init"); + DBUG_ASSERT(thd == trx->mysql_thd); + + trx->check_foreigns = !thd_test_options( + thd, OPTION_NO_FOREIGN_KEY_CHECKS); + + trx->check_unique_secondary = !thd_test_options( + thd, OPTION_RELAXED_UNIQUE_CHECKS); + + DBUG_VOID_RETURN; +} + +/*********************************************************************//** +Allocates an InnoDB transaction for a MySQL handler object for DML. +@return InnoDB transaction handle */ +UNIV_INTERN +trx_t* +innobase_trx_allocate( +/*==================*/ + THD* thd) /*!< in: user thread handle */ +{ + trx_t* trx; + + DBUG_ENTER("innobase_trx_allocate"); + DBUG_ASSERT(thd != NULL); + DBUG_ASSERT(EQ_CURRENT_THD(thd)); + + trx = trx_allocate_for_mysql(); + + trx->mysql_thd = thd; + + innobase_trx_init(thd, trx); + + DBUG_RETURN(trx); +} + +/*********************************************************************//** +Gets the InnoDB transaction handle for a MySQL handler object, creates +an InnoDB transaction struct if the corresponding MySQL thread struct still +lacks one. 
+@return InnoDB transaction handle */ +static inline +trx_t* +check_trx_exists( +/*=============*/ + THD* thd) /*!< in: user thread handle */ +{ + trx_t*& trx = thd_to_trx(thd); + + if (trx == NULL) { + trx = innobase_trx_allocate(thd); + thd_set_ha_data(thd, innodb_hton_ptr, trx); + } else if (UNIV_UNLIKELY(trx->magic_n != TRX_MAGIC_N)) { + mem_analyze_corruption(trx); + ut_error; + } + + innobase_trx_init(thd, trx); + + return(trx); +} + +/*********************************************************************//** +Note that a transaction has been registered with MySQL. +@return true if transaction is registered with MySQL 2PC coordinator */ +static inline +bool +trx_is_registered_for_2pc( +/*=========================*/ + const trx_t* trx) /* in: transaction */ +{ + return(trx->is_registered == 1); +} + +/*********************************************************************//** +Note that innobase_commit_ordered() was run. */ +static inline +void +trx_set_active_commit_ordered( +/*==============================*/ + trx_t* trx) /* in: transaction */ +{ + ut_a(trx_is_registered_for_2pc(trx)); + trx->active_commit_ordered = 1; +} + +/*********************************************************************//** +Note that a transaction has been registered with MySQL 2PC coordinator. */ +static inline +void +trx_register_for_2pc( +/*==================*/ + trx_t* trx) /* in: transaction */ +{ + trx->is_registered = 1; + ut_ad(trx->active_commit_ordered == 0); +} + +/*********************************************************************//** +Note that a transaction has been deregistered. 
*/ +static inline +void +trx_deregister_from_2pc( +/*====================*/ + trx_t* trx) /* in: transaction */ +{ + trx->is_registered = 0; + trx->active_commit_ordered = 0; +} + +/*********************************************************************//** +Check whether a transaction has active_commit_ordered set */ +static inline +bool +trx_is_active_commit_ordered( +/*=========================*/ + const trx_t* trx) /* in: transaction */ +{ + return(trx->active_commit_ordered == 1); +} + +/*********************************************************************//** +Check if transaction is started. +@reutrn true if transaction is in state started */ +static +bool +trx_is_started( +/*===========*/ + trx_t* trx) /* in: transaction */ +{ + return(trx->state != TRX_STATE_NOT_STARTED); +} + +/*********************************************************************//** +Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object. +Those flags are stored in .frm file and end up in the MySQL table object, +but are frequently used inside InnoDB so we keep their copies into the +InnoDB table object. */ +UNIV_INTERN +void +innobase_copy_frm_flags_from_create_info( +/*=====================================*/ + dict_table_t* innodb_table, /*!< in/out: InnoDB table */ + const HA_CREATE_INFO* create_info) /*!< in: create info */ +{ + ibool ps_on; + ibool ps_off; + + if (dict_table_is_temporary(innodb_table)) { + /* Temp tables do not use persistent stats. 
*/ + ps_on = FALSE; + ps_off = TRUE; + } else { + ps_on = create_info->table_options + & HA_OPTION_STATS_PERSISTENT; + ps_off = create_info->table_options + & HA_OPTION_NO_STATS_PERSISTENT; + } + + dict_stats_set_persistent(innodb_table, ps_on, ps_off); + + dict_stats_auto_recalc_set( + innodb_table, + create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON, + create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF); + + innodb_table->stats_sample_pages = create_info->stats_sample_pages; +} + +/*********************************************************************//** +Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object. +Those flags are stored in .frm file and end up in the MySQL table object, +but are frequently used inside InnoDB so we keep their copies into the +InnoDB table object. */ +UNIV_INTERN +void +innobase_copy_frm_flags_from_table_share( +/*=====================================*/ + dict_table_t* innodb_table, /*!< in/out: InnoDB table */ + const TABLE_SHARE* table_share) /*!< in: table share */ +{ + ibool ps_on; + ibool ps_off; + + if (dict_table_is_temporary(innodb_table)) { + /* Temp tables do not use persistent stats */ + ps_on = FALSE; + ps_off = TRUE; + } else { + ps_on = table_share->db_create_options + & HA_OPTION_STATS_PERSISTENT; + ps_off = table_share->db_create_options + & HA_OPTION_NO_STATS_PERSISTENT; + } + + dict_stats_set_persistent(innodb_table, ps_on, ps_off); + + dict_stats_auto_recalc_set( + innodb_table, + table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON, + table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF); + + innodb_table->stats_sample_pages = table_share->stats_sample_pages; +} + +/*********************************************************************//** +Construct ha_innobase handler. 
*/ +UNIV_INTERN +ha_innobase::ha_innobase( +/*=====================*/ + handlerton* hton, + TABLE_SHARE* table_arg) + :handler(hton, table_arg), + int_table_flags(HA_REC_NOT_IN_SEQ | + HA_NULL_IN_KEY | HA_CAN_VIRTUAL_COLUMNS | + HA_CAN_INDEX_BLOBS | + HA_CAN_SQL_HANDLER | + HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | + HA_PRIMARY_KEY_IN_READ_INDEX | + HA_BINLOG_ROW_CAPABLE | + HA_CAN_GEOMETRY | HA_PARTIAL_COLUMN_READ | + HA_TABLE_SCAN_ON_INDEX | HA_CAN_FULLTEXT | + HA_CAN_FULLTEXT_EXT | HA_CAN_EXPORT), + start_of_scan(0), + num_write_row(0) +{} + +/*********************************************************************//** +Destruct ha_innobase handler. */ +UNIV_INTERN +ha_innobase::~ha_innobase() +/*======================*/ +{ +} + +/*********************************************************************//** +Updates the user_thd field in a handle and also allocates a new InnoDB +transaction handle if needed, and updates the transaction fields in the +prebuilt struct. */ +UNIV_INTERN inline +void +ha_innobase::update_thd( +/*====================*/ + THD* thd) /*!< in: thd to use the handle */ +{ + trx_t* trx; + + DBUG_ENTER("ha_innobase::update_thd"); + DBUG_PRINT("ha_innobase::update_thd", ("user_thd: %p -> %p", + user_thd, thd)); + + /* The table should have been opened in ha_innobase::open(). */ + DBUG_ASSERT(prebuilt->table->n_ref_count > 0); + + trx = check_trx_exists(thd); + + if (prebuilt->trx != trx) { + + row_update_prebuilt_trx(prebuilt, trx); + } + + user_thd = thd; + DBUG_VOID_RETURN; +} + +/*********************************************************************//** +Updates the user_thd field in a handle and also allocates a new InnoDB +transaction handle if needed, and updates the transaction fields in the +prebuilt struct. 
*/ +UNIV_INTERN +void +ha_innobase::update_thd() +/*=====================*/ +{ + THD* thd = ha_thd(); + + ut_ad(EQ_CURRENT_THD(thd)); + update_thd(thd); +} + +/*********************************************************************//** +Registers an InnoDB transaction with the MySQL 2PC coordinator, so that +the MySQL XA code knows to call the InnoDB prepare and commit, or rollback +for the transaction. This MUST be called for every transaction for which +the user may call commit or rollback. Calling this several times to register +the same transaction is allowed, too. This function also registers the +current SQL statement. */ +static inline +void +innobase_register_trx( +/*==================*/ + handlerton* hton, /* in: Innobase handlerton */ + THD* thd, /* in: MySQL thd (connection) object */ + trx_t* trx) /* in: transaction to register */ +{ + trans_register_ha(thd, FALSE, hton); + + if (!trx_is_registered_for_2pc(trx) + && thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + + trans_register_ha(thd, TRUE, hton); + } + + trx_register_for_2pc(trx); +} + +/* BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB + ------------------------------------------------------------ + +1) The use of the query cache for TBL is disabled when there is an +uncommitted change to TBL. + +2) When a change to TBL commits, InnoDB stores the current value of +its global trx id counter, let us denote it by INV_TRX_ID, to the table object +in the InnoDB data dictionary, and does only allow such transactions whose +id <= INV_TRX_ID to use the query cache. + +3) When InnoDB does an INSERT/DELETE/UPDATE to a table TBL, or an implicit +modification because an ON DELETE CASCADE, we invalidate the MySQL query cache +of TBL immediately. 
+ +How this is implemented inside InnoDB: + +1) Since every modification always sets an IX type table lock on the InnoDB +table, it is easy to check if there can be uncommitted modifications for a +table: just check if there are locks in the lock list of the table. + +2) When a transaction inside InnoDB commits, it reads the global trx id +counter and stores the value INV_TRX_ID to the tables on which it had a lock. + +3) If there is an implicit table change from ON DELETE CASCADE or SET NULL, +InnoDB calls an invalidate method for the MySQL query cache for that table. + +How this is implemented inside sql_cache.cc: + +1) The query cache for an InnoDB table TBL is invalidated immediately at an +INSERT/UPDATE/DELETE, just like in the case of MyISAM. No need to delay +invalidation to the transaction commit. + +2) To store or retrieve a value from the query cache of an InnoDB table TBL, +any query must first ask InnoDB's permission. We must pass the thd as a +parameter because InnoDB will look at the trx id, if any, associated with +that thd. Also the full_name which is used as key to search for the table +object. The full_name is a string containing the normalized path to the +table in the canonical format. + +3) Use of the query cache for InnoDB tables is now allowed also when +AUTOCOMMIT==0 or we are inside BEGIN ... COMMIT. Thus transactions no longer +put restrictions on the use of the query cache. +*/ + +/******************************************************************//** +The MySQL query cache uses this to check from InnoDB if the query cache at +the moment is allowed to operate on an InnoDB table. The SQL query must +be a non-locking SELECT. + +The query cache is allowed to operate on certain query only if this function +returns TRUE for all tables in the query. + +If thd is not in the autocommit state, this function also starts a new +transaction for thd if there is no active trx yet, and assigns a consistent +read view to it if there is no read view yet. 
+ +Why a deadlock of threads is not possible: the query cache calls this function +at the start of a SELECT processing. Then the calling thread cannot be +holding any InnoDB semaphores. The calling thread is holding the +query cache mutex, and this function will reserve the InnoDB trx_sys->mutex. +Thus, the 'rank' in sync0sync.h of the MySQL query cache mutex is above +the InnoDB trx_sys->mutex. +@return TRUE if permitted, FALSE if not; note that the value FALSE +does not mean we should invalidate the query cache: invalidation is +called explicitly */ +static +my_bool +innobase_query_caching_of_table_permitted( +/*======================================*/ + THD* thd, /*!< in: thd of the user who is trying to + store a result to the query cache or + retrieve it */ + char* full_name, /*!< in: normalized path to the table */ + uint full_name_len, /*!< in: length of the normalized path + to the table */ + ulonglong *unused) /*!< unused for this engine */ +{ + ibool is_autocommit; + trx_t* trx; + char norm_name[1000]; + + ut_a(full_name_len < 999); + + trx = check_trx_exists(thd); + + if (trx->isolation_level == TRX_ISO_SERIALIZABLE) { + /* In the SERIALIZABLE mode we add LOCK IN SHARE MODE to every + plain SELECT if AUTOCOMMIT is not on. */ + + return((my_bool)FALSE); + } + + if (UNIV_UNLIKELY(trx->has_search_latch)) { + sql_print_error("The calling thread is holding the adaptive " + "search, latch though calling " + "innobase_query_caching_of_table_permitted."); + trx_print(stderr, trx, 1024); + } + + trx_search_latch_release_if_reserved(trx); + + innobase_srv_conc_force_exit_innodb(trx); + + if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + + is_autocommit = TRUE; + } else { + is_autocommit = FALSE; + + } + + if (is_autocommit && trx->n_mysql_tables_in_use == 0) { + /* We are going to retrieve the query result from the query + cache. This cannot be a store operation to the query cache + because then MySQL would have locks on tables already. 
+ + TODO: if the user has used LOCK TABLES to lock the table, + then we open a transaction in the call of row_.. below. + That trx can stay open until UNLOCK TABLES. The same problem + exists even if we do not use the query cache. MySQL should be + modified so that it ALWAYS calls some cleanup function when + the processing of a query ends! + + We can imagine we instantaneously serialize this consistent + read trx to the current trx id counter. If trx2 would have + changed the tables of a query result stored in the cache, and + trx2 would have already committed, making the result obsolete, + then trx2 would have already invalidated the cache. Thus we + can trust the result in the cache is ok for this query. */ + + return((my_bool)TRUE); + } + + /* Normalize the table name to InnoDB format */ + normalize_table_name(norm_name, full_name); + + innobase_register_trx(innodb_hton_ptr, thd, trx); + + if (row_search_check_if_query_cache_permitted(trx, norm_name)) { + + /* printf("Query cache for %s permitted\n", norm_name); */ + + return((my_bool)TRUE); + } + + /* printf("Query cache for %s NOT permitted\n", norm_name); */ + + return((my_bool)FALSE); +} + +/*****************************************************************//** +Invalidates the MySQL query cache for the table. */ +UNIV_INTERN +void +innobase_invalidate_query_cache( +/*============================*/ + trx_t* trx, /*!< in: transaction which + modifies the table */ + const char* full_name, /*!< in: concatenation of + database name, null char NUL, + table name, null char NUL; + NOTE that in Windows this is + always in LOWER CASE! */ + ulint full_name_len) /*!< in: full name length where + also the null chars count */ +{ + /* Note that the sync0sync.h rank of the query cache mutex is just + above the InnoDB trx_sys_t->lock. The caller of this function must + not have latches of a lower rank. 
*/ + +#ifdef HAVE_QUERY_CACHE + char qcache_key_name[2 * (NAME_LEN + 1)]; + size_t tabname_len; + size_t dbname_len; + + /* Construct the key("db-name\0table$name\0") for the query cache using + the path name("db@002dname\0table@0024name\0") of the table in its + canonical form. */ + dbname_len = filename_to_tablename(full_name, qcache_key_name, + sizeof(qcache_key_name)); + tabname_len = filename_to_tablename(full_name + strlen(full_name) + 1, + qcache_key_name + dbname_len + 1, + sizeof(qcache_key_name) + - dbname_len - 1); + + /* Argument TRUE below means we are using transactions */ + mysql_query_cache_invalidate4(trx->mysql_thd, + qcache_key_name, + (dbname_len + tabname_len + 2), + TRUE); +#endif +} + +/*****************************************************************//** +Convert an SQL identifier to the MySQL system_charset_info (UTF-8) +and quote it if needed. +@return pointer to the end of buf */ +static +char* +innobase_convert_identifier( +/*========================*/ + char* buf, /*!< out: buffer for converted identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* id, /*!< in: identifier to convert */ + ulint idlen, /*!< in: length of id, in bytes */ + THD* thd, /*!< in: MySQL connection thread, or NULL */ + ibool file_id)/*!< in: TRUE=id is a table or database name; + FALSE=id is an UTF-8 string */ +{ + char nz2[MAX_TABLE_NAME_LEN + 1]; + const char* s = id; + int q; + + if (file_id) { + + char nz[MAX_TABLE_NAME_LEN + 1]; + + /* Decode the table name. The MySQL function expects + a NUL-terminated string. The input and output strings + buffers must not be shared. */ + ut_a(idlen <= MAX_TABLE_NAME_LEN); + memcpy(nz, id, idlen); + nz[idlen] = 0; + + s = nz2; + idlen = explain_filename(thd, nz, nz2, sizeof nz2, + EXPLAIN_PARTITIONS_AS_COMMENT); + goto no_quote; + } + + /* See if the identifier needs to be quoted. 
*/ + if (UNIV_UNLIKELY(!thd)) { + q = '"'; + } else { + q = get_quote_char_for_identifier(thd, s, (int) idlen); + } + + if (q == EOF) { +no_quote: + if (UNIV_UNLIKELY(idlen > buflen)) { + idlen = buflen; + } + memcpy(buf, s, idlen); + return(buf + idlen); + } + + /* Quote the identifier. */ + if (buflen < 2) { + return(buf); + } + + *buf++ = q; + buflen--; + + for (; idlen; idlen--) { + int c = *s++; + if (UNIV_UNLIKELY(c == q)) { + if (UNIV_UNLIKELY(buflen < 3)) { + break; + } + + *buf++ = c; + *buf++ = c; + buflen -= 2; + } else { + if (UNIV_UNLIKELY(buflen < 2)) { + break; + } + + *buf++ = c; + buflen--; + } + } + + *buf++ = q; + return(buf); +} + +/*****************************************************************//** +Convert a table or index name to the MySQL system_charset_info (UTF-8) +and quote it if needed. +@return pointer to the end of buf */ +UNIV_INTERN +char* +innobase_convert_name( +/*==================*/ + char* buf, /*!< out: buffer for converted identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* id, /*!< in: identifier to convert */ + ulint idlen, /*!< in: length of id, in bytes */ + THD* thd, /*!< in: MySQL connection thread, or NULL */ + ibool table_id)/*!< in: TRUE=id is a table or database name; + FALSE=id is an index name */ +{ + char* s = buf; + const char* bufend = buf + buflen; + + if (table_id) { + const char* slash = (const char*) memchr(id, '/', idlen); + if (!slash) { + + goto no_db_name; + } + + /* Print the database name and table name separately. 
*/ + s = innobase_convert_identifier(s, bufend - s, id, slash - id, + thd, TRUE); + if (UNIV_LIKELY(s < bufend)) { + *s++ = '.'; + s = innobase_convert_identifier(s, bufend - s, + slash + 1, idlen + - (slash - id) - 1, + thd, TRUE); + } + } else if (UNIV_UNLIKELY(*id == TEMP_INDEX_PREFIX)) { + /* Temporary index name (smart ALTER TABLE) */ + const char temp_index_suffix[]= "--temporary--"; + + s = innobase_convert_identifier(buf, buflen, id + 1, idlen - 1, + thd, FALSE); + if (s - buf + (sizeof temp_index_suffix - 1) < buflen) { + memcpy(s, temp_index_suffix, + sizeof temp_index_suffix - 1); + s += sizeof temp_index_suffix - 1; + } + } else { +no_db_name: + s = innobase_convert_identifier(buf, buflen, id, idlen, + thd, table_id); + } + + return(s); +} + +/*****************************************************************//** +A wrapper function of innobase_convert_name(), convert a table or +index name to the MySQL system_charset_info (UTF-8) and quote it if needed. +@return pointer to the end of buf */ +UNIV_INTERN +void +innobase_format_name( +/*==================*/ + char* buf, /*!< out: buffer for converted identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* name, /*!< in: index or table name to format */ + ibool is_index_name) /*!< in: index name */ +{ + const char* bufend; + + bufend = innobase_convert_name(buf, buflen, name, strlen(name), + NULL, !is_index_name); + + ut_ad((ulint) (bufend - buf) < buflen); + + buf[bufend - buf] = '\0'; +} + +/**********************************************************************//** +Determines if the currently running transaction has been interrupted. 
+@return TRUE if interrupted */ +UNIV_INTERN +ibool +trx_is_interrupted( +/*===============*/ + const trx_t* trx) /*!< in: transaction */ +{ + return(trx && trx->mysql_thd && thd_kill_level(trx->mysql_thd)); +} + +/**********************************************************************//** +Determines if the currently running transaction is in strict mode. +@return TRUE if strict */ +UNIV_INTERN +ibool +trx_is_strict( +/*==========*/ + trx_t* trx) /*!< in: transaction */ +{ + return(trx && trx->mysql_thd && THDVAR(trx->mysql_thd, strict_mode)); +} + +/**************************************************************//** +Resets some fields of a prebuilt struct. The template is used in fast +retrieval of just those column values MySQL needs in its processing. */ +inline +void +ha_innobase::reset_template(void) +/*=============================*/ +{ + ut_ad(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED); + ut_ad(prebuilt->magic_n2 == prebuilt->magic_n); + + /* Force table to be freed in close_thread_table(). */ + DBUG_EXECUTE_IF("free_table_in_fts_query", + if (prebuilt->in_fts_query) { + table->m_needs_reopen = true; + } + ); + + prebuilt->keep_other_fields_on_keyread = 0; + prebuilt->read_just_key = 0; + prebuilt->in_fts_query = 0; + /* Reset index condition pushdown state. */ + if (prebuilt->idx_cond) { + prebuilt->idx_cond = NULL; + prebuilt->idx_cond_n_cols = 0; + /* Invalidate prebuilt->mysql_template + in ha_innobase::write_row(). */ + prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE; + } +} + +/*****************************************************************//** +Call this when you have opened a new table handle in HANDLER, before you +call index_read_idx() etc. Actually, we can let the cursor stay open even +over a transaction commit! Then you should call this before every operation, +fetch next etc. This function inits the necessary things even after a +transaction commit. 
*/ +UNIV_INTERN +void +ha_innobase::init_table_handle_for_HANDLER(void) +/*============================================*/ +{ + /* If current thd does not yet have a trx struct, create one. + If the current handle does not yet have a prebuilt struct, create + one. Update the trx pointers in the prebuilt struct. Normally + this operation is done in external_lock. */ + + update_thd(ha_thd()); + + /* Initialize the prebuilt struct much like it would be inited in + external_lock */ + + trx_search_latch_release_if_reserved(prebuilt->trx); + + innobase_srv_conc_force_exit_innodb(prebuilt->trx); + + /* If the transaction is not started yet, start it */ + + trx_start_if_not_started_xa(prebuilt->trx); + + /* Assign a read view if the transaction does not have it yet */ + + trx_assign_read_view(prebuilt->trx); + + innobase_register_trx(ht, user_thd, prebuilt->trx); + + /* We did the necessary inits in this function, no need to repeat them + in row_search_for_mysql */ + + prebuilt->sql_stat_start = FALSE; + + /* We let HANDLER always to do the reads as consistent reads, even + if the trx isolation level would have been specified as SERIALIZABLE */ + + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->stored_select_lock_type = LOCK_NONE; + + /* Always fetch all columns in the index record */ + + prebuilt->hint_need_to_fetch_extra_cols = ROW_RETRIEVE_ALL_COLS; + + /* We want always to fetch all columns in the whole row? Or do + we???? */ + + prebuilt->used_in_HANDLER = TRUE; + reset_template(); +} + +/****************************************************************//** +Gives the file extension of an InnoDB single-table tablespace. */ +static const char* ha_innobase_exts[] = { + ".ibd", + ".isl", + NullS +}; + +/*********************************************************************//** +Opens an InnoDB database. 
+@return 0 on success, error code on failure */ +static +int +innobase_init( +/*==========*/ + void *p) /*!< in: InnoDB handlerton */ +{ + static char current_dir[3]; /*!< Set if using current lib */ + int err; + bool ret; + char *default_path; + uint format_id; + ulong num_pll_degree; + + DBUG_ENTER("innobase_init"); + handlerton *innobase_hton= (handlerton*) p; + innodb_hton_ptr = innobase_hton; + + innobase_hton->state = SHOW_OPTION_YES; + innobase_hton->db_type= DB_TYPE_INNODB; + innobase_hton->savepoint_offset = sizeof(trx_named_savept_t); + innobase_hton->close_connection = innobase_close_connection; + innobase_hton->savepoint_set = innobase_savepoint; + innobase_hton->savepoint_rollback = innobase_rollback_to_savepoint; + innobase_hton->savepoint_rollback_can_release_mdl = + innobase_rollback_to_savepoint_can_release_mdl; + innobase_hton->savepoint_release = innobase_release_savepoint; + innobase_hton->prepare_ordered= NULL; + innobase_hton->commit_ordered= innobase_commit_ordered; + innobase_hton->commit = innobase_commit; + innobase_hton->rollback = innobase_rollback; + innobase_hton->prepare = innobase_xa_prepare; + innobase_hton->recover = innobase_xa_recover; + innobase_hton->commit_by_xid = innobase_commit_by_xid; + innobase_hton->rollback_by_xid = innobase_rollback_by_xid; + innobase_hton->commit_checkpoint_request=innobase_checkpoint_request; + innobase_hton->create_cursor_read_view = innobase_create_cursor_view; + innobase_hton->set_cursor_read_view = innobase_set_cursor_view; + innobase_hton->close_cursor_read_view = innobase_close_cursor_view; + innobase_hton->create = innobase_create_handler; + innobase_hton->drop_database = innobase_drop_database; + innobase_hton->panic = innobase_end; + + innobase_hton->start_consistent_snapshot = + innobase_start_trx_and_assign_read_view; + + innobase_hton->flush_logs = innobase_flush_logs; + innobase_hton->show_status = innobase_show_status; + innobase_hton->flags = + HTON_SUPPORTS_EXTENDED_KEYS | 
HTON_SUPPORTS_FOREIGN_KEYS; + + innobase_hton->release_temporary_latches = + innobase_release_temporary_latches; +#ifdef WITH_WSREP + innobase_hton->wsrep_abort_transaction=wsrep_abort_transaction; + innobase_hton->wsrep_set_checkpoint=innobase_wsrep_set_checkpoint; + innobase_hton->wsrep_get_checkpoint=innobase_wsrep_get_checkpoint; + innobase_hton->wsrep_fake_trx_id=wsrep_fake_trx_id; +#endif /* WITH_WSREP */ + innobase_hton->kill_query = innobase_kill_query; + + if (srv_file_per_table) + innobase_hton->tablefile_extensions = ha_innobase_exts; + + ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR); + +#ifndef DBUG_OFF + static const char test_filename[] = "-@"; + char test_tablename[sizeof test_filename + + sizeof(srv_mysql50_table_name_prefix) - 1]; + if ((sizeof(test_tablename)) - 1 + != filename_to_tablename(test_filename, + test_tablename, + sizeof(test_tablename), true) + || strncmp(test_tablename, + srv_mysql50_table_name_prefix, + sizeof(srv_mysql50_table_name_prefix) - 1) + || strcmp(test_tablename + + sizeof(srv_mysql50_table_name_prefix) - 1, + test_filename)) { + + sql_print_error("tablename encoding has been changed"); + + goto error; + } +#endif /* DBUG_OFF */ + + /* Check that values don't overflow on 32-bit systems. */ + if (sizeof(ulint) == 4) { + if (innobase_buffer_pool_size > UINT_MAX32) { + sql_print_error( + "innobase_buffer_pool_size can't be over 4GB" + " on 32-bit systems"); + + goto error; + } + } + + os_innodb_umask = (ulint) my_umask; + + /* First calculate the default path for innodb_data_home_dir etc., + in case the user has not given any value. + + Note that when using the embedded server, the datadirectory is not + necessarily the current directory of this program. 
 */ + + if (mysqld_embedded) { + default_path = mysql_real_data_home; + fil_path_to_mysql_datadir = mysql_real_data_home; + } else { + /* It's better to use current lib, to keep paths short */ + current_dir[0] = FN_CURLIB; + current_dir[1] = FN_LIBCHAR; + current_dir[2] = 0; + default_path = current_dir; + } + + ut_a(default_path); + + /* Set InnoDB initialization parameters according to the values + read from MySQL .cnf file */ + + /*--------------- Data files -------------------------*/ + + /* The default dir for data files is the datadir of MySQL */ + + srv_data_home = (innobase_data_home_dir ? innobase_data_home_dir : + default_path); + + /* Set default InnoDB data file size to 12 MB and let it be + auto-extending. Thus users can use InnoDB in >= 4.0 without having + to specify any startup options. */ + + if (!innobase_data_file_path) { + innobase_data_file_path = (char*) "ibdata1:12M:autoextend"; + } + + /* Since InnoDB edits the argument in the next call, we make another + copy of it: */ + + internal_innobase_data_file_path = my_strdup(innobase_data_file_path, + MYF(MY_FAE)); + + ret = (bool) srv_parse_data_file_paths_and_sizes( + internal_innobase_data_file_path); + if (ret == FALSE) { + sql_print_error( + "InnoDB: syntax error in innodb_data_file_path" + " or size specified is less than 1 megabyte"); +mem_free_and_error: + srv_free_paths_and_sizes(); + my_free(internal_innobase_data_file_path); + goto error; + } + + /* -------------- All log files ---------------------------*/ + + /* The default dir for log files is the datadir of MySQL */ + + if (!srv_log_group_home_dir) { + srv_log_group_home_dir = default_path; + } + +#ifdef UNIV_LOG_ARCHIVE + /* Since innodb_log_arch_dir has no relevance under MySQL, + starting from 4.0.6 we always set it the same as + innodb_log_group_home_dir: */ + + innobase_log_arch_dir = innobase_log_group_home_dir; + + srv_arch_dir = innobase_log_arch_dir; +#endif /* UNIV_LOG_ARCHIVE */ + +
srv_normalize_path_for_win(srv_log_group_home_dir); + + if (strchr(srv_log_group_home_dir, ';')) { + sql_print_error("syntax error in innodb_log_group_home_dir"); + goto mem_free_and_error; + } + + if (innobase_mirrored_log_groups == 1) { + sql_print_warning( + "innodb_mirrored_log_groups is an unimplemented " + "feature and the variable will be completely " + "removed in a future version."); + } + + if (innobase_mirrored_log_groups > 1) { + sql_print_error( + "innodb_mirrored_log_groups is an unimplemented feature and " + "the variable will be completely removed in a future version. " + "Using values other than 1 is not supported."); + goto mem_free_and_error; + } + + if (innobase_mirrored_log_groups == 0) { + /* To throw a deprecation warning message when the option is + passed, the default was changed to '0' (as a workaround). Since + the only value accepted for this option is '1', reset it to 1 */ + innobase_mirrored_log_groups = 1; + } + + /* Validate the file format by animal name */ + if (innobase_file_format_name != NULL) { + + format_id = innobase_file_format_name_lookup( + innobase_file_format_name); + + if (format_id > UNIV_FORMAT_MAX) { + + sql_print_error("InnoDB: wrong innodb_file_format."); + + goto mem_free_and_error; + } + } else { + /* Set it to the default file format id. Though this + should never happen. */ + format_id = 0; + } + + srv_file_format = format_id; + + /* Given the type of innobase_file_format_name we have little + choice but to cast away the constness from the returned name. + innobase_file_format_name is used in the MySQL set variable + interface and so can't be const. */ + + innobase_file_format_name = + (char*) trx_sys_file_format_id_to_name(format_id); + + /* Check innobase_file_format_check variable */ + if (!innobase_file_format_check) { + + /* Set the value to disable checking. */ + srv_max_file_format_at_startup = UNIV_FORMAT_MAX + 1; + + } else { + + /* Set the value to the lowest supported format. 
*/ + srv_max_file_format_at_startup = UNIV_FORMAT_MIN; + } + + /* Did the user specify a format name that we support? + As a side effect it will update the variable + srv_max_file_format_at_startup */ + if (innobase_file_format_validate_and_set( + innobase_file_format_max) < 0) { + + sql_print_error("InnoDB: invalid " + "innodb_file_format_max value: " + "should be any value up to %s or its " + "equivalent numeric id", + trx_sys_file_format_id_to_name( + UNIV_FORMAT_MAX)); + + goto mem_free_and_error; + } + + if (innobase_change_buffering) { + ulint use; + + for (use = 0; + use < UT_ARR_SIZE(innobase_change_buffering_values); + use++) { + if (!innobase_strcasecmp( + innobase_change_buffering, + innobase_change_buffering_values[use])) { + ibuf_use = (ibuf_use_t) use; + goto innobase_change_buffering_inited_ok; + } + } + + sql_print_error("InnoDB: invalid value " + "innodb_change_buffering=%s", + innobase_change_buffering); + goto mem_free_and_error; + } + +innobase_change_buffering_inited_ok: + ut_a((ulint) ibuf_use < UT_ARR_SIZE(innobase_change_buffering_values)); + innobase_change_buffering = (char*) + innobase_change_buffering_values[ibuf_use]; + + /* Check that interdependent parameters have sane values. */ + if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) { + sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm" + " cannot be set higher than" + " innodb_max_dirty_pages_pct.\n" + "InnoDB: Setting" + " innodb_max_dirty_pages_pct_lwm to %lf\n", + srv_max_buf_pool_modified_pct); + + srv_max_dirty_pages_pct_lwm = srv_max_buf_pool_modified_pct; + } + + if (srv_max_io_capacity == SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT) { + + if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) { + /* Avoid overflow. */ + srv_max_io_capacity = SRV_MAX_IO_CAPACITY_LIMIT; + } else { + /* The user has not set the value. We should + set it based on innodb_io_capacity. 
*/ + srv_max_io_capacity = static_cast<ulong>( + ut_max(2 * srv_io_capacity, 2000)); + } + + } else if (srv_max_io_capacity < srv_io_capacity) { + sql_print_warning("InnoDB: innodb_io_capacity" + " cannot be set higher than" + " innodb_io_capacity_max.\n" + "InnoDB: Setting" + " innodb_io_capacity to %lu\n", + srv_max_io_capacity); + + srv_io_capacity = srv_max_io_capacity; + } + + if (!is_filename_allowed(srv_buf_dump_filename, + strlen(srv_buf_dump_filename), FALSE)) { + sql_print_error("InnoDB: innodb_buffer_pool_filename" + " cannot have colon (:) in the file name."); + goto mem_free_and_error; + } + + /* --------------------------------------------------*/ + + srv_file_flush_method_str = innobase_file_flush_method; + + srv_log_file_size = (ib_uint64_t) innobase_log_file_size; + +#ifdef UNIV_LOG_ARCHIVE + srv_log_archive_on = (ulint) innobase_log_archive; +#endif /* UNIV_LOG_ARCHIVE */ + + /* Check that the value of system variable innodb_page_size was + set correctly. Its value was put into srv_page_size. 
If valid, + return the associated srv_page_size_shift.*/ + srv_page_size_shift = innodb_page_size_validate(srv_page_size); + if (!srv_page_size_shift) { + sql_print_error("InnoDB: Invalid page size=%lu.\n", + srv_page_size); + goto mem_free_and_error; + } + if (UNIV_PAGE_SIZE_DEF != srv_page_size) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: innodb-page-size has been changed" + " from the default value %d to %lu.\n", + UNIV_PAGE_SIZE_DEF, srv_page_size); + } + + srv_log_buffer_size = (ulint) innobase_log_buffer_size; + + if (innobase_buffer_pool_instances == 0) { + innobase_buffer_pool_instances = 8; + +#if defined(__WIN__) && !defined(_WIN64) + if (innobase_buffer_pool_size > 1331 * 1024 * 1024) { + innobase_buffer_pool_instances + = ut_min(MAX_BUFFER_POOLS, + (long) (innobase_buffer_pool_size + / (128 * 1024 * 1024))); + } +#endif /* defined(__WIN__) && !defined(_WIN64) */ + } + srv_buf_pool_size = (ulint) innobase_buffer_pool_size; + srv_buf_pool_instances = (ulint) innobase_buffer_pool_instances; + + srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size; + + if (innobase_additional_mem_pool_size + != 8*1024*1024L /* the default */ ) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: Using " + "innodb_additional_mem_pool_size is DEPRECATED. " + "This option may be removed in future releases, " + "together with the option innodb_use_sys_malloc " + "and with the InnoDB's internal memory " + "allocator.\n"); + } + + if (!srv_use_sys_malloc ) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: Setting " + "innodb_use_sys_malloc to FALSE is DEPRECATED. 
" + "This option may be removed in future releases, " + "together with the InnoDB's internal memory " + "allocator.\n"); + } + + srv_n_file_io_threads = (ulint) innobase_file_io_threads; + srv_n_read_io_threads = (ulint) innobase_read_io_threads; + srv_n_write_io_threads = (ulint) innobase_write_io_threads; + + srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite; + + if (!innobase_use_checksums) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: Setting " + "innodb_checksums to OFF is DEPRECATED. " + "This option may be removed in future releases. " + "You should set innodb_checksum_algorithm=NONE " + "instead.\n"); + srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_NONE; + } + +#ifdef HAVE_LARGE_PAGES + if ((os_use_large_pages = (ibool) my_use_large_pages)) { + os_large_page_size = (ulint) opt_large_page_size; + } +#endif + + row_rollback_on_timeout = (ibool) innobase_rollback_on_timeout; + + srv_locks_unsafe_for_binlog = (ibool) innobase_locks_unsafe_for_binlog; + if (innobase_locks_unsafe_for_binlog) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: Using " + "innodb_locks_unsafe_for_binlog is DEPRECATED. " + "This option may be removed in future releases. " + "Please use READ COMMITTED transaction isolation " + "level instead, see " REFMAN "set-transaction.html.\n"); + } + + if (innobase_open_files < 10) { + innobase_open_files = 300; + if (srv_file_per_table && tc_size > 300) { + innobase_open_files = tc_size; + } + } + + if (innobase_open_files > (long) tc_size) { + fprintf(stderr, + "innodb_open_files should not be greater" + " than the open_files_limit.\n"); + innobase_open_files = tc_size; + } + + srv_max_n_open_files = (ulint) innobase_open_files; + srv_innodb_status = (ibool) innobase_create_status_file; + + srv_print_verbose_log = mysqld_embedded ? 
0 : 1; + + /* Round up fts_sort_pll_degree to nearest power of 2 number */ + for (num_pll_degree = 1; + num_pll_degree < fts_sort_pll_degree; + num_pll_degree <<= 1) { + + /* No op */ + } + + fts_sort_pll_degree = num_pll_degree; + + /* Store the default charset-collation number of this MySQL + installation */ + + data_mysql_default_charset_coll = (ulint) default_charset_info->number; + + ut_a(DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL == + my_charset_latin1.number); + ut_a(DATA_MYSQL_BINARY_CHARSET_COLL == my_charset_bin.number); + + /* Store the latin1_swedish_ci character ordering table to InnoDB. For + non-latin1_swedish_ci charsets we use the MySQL comparison functions, + and consequently we do not need to know the ordering internally in + InnoDB. */ + + srv_latin1_ordering = my_charset_latin1.sort_order; + + innobase_commit_concurrency_init_default(); + +#ifdef HAVE_POSIX_FALLOCATE + srv_use_posix_fallocate = (ibool) innobase_use_fallocate; +#endif + srv_use_atomic_writes = (ibool) innobase_use_atomic_writes; + + if (innobase_use_atomic_writes) { + fprintf(stderr, "InnoDB: using atomic writes.\n"); + + /* Force doublewrite buffer off, atomic writes replace it. 
 */ + if (srv_use_doublewrite_buf) { + fprintf(stderr, "InnoDB: Switching off doublewrite buffer " + "because of atomic writes.\n"); + innobase_use_doublewrite = srv_use_doublewrite_buf = FALSE; + } + + /* Force O_DIRECT on Unixes (on Windows writes are always unbuffered)*/ +#ifndef _WIN32 + if(!innobase_file_flush_method || + !strstr(innobase_file_flush_method, "O_DIRECT")) { + innobase_file_flush_method = + srv_file_flush_method_str = (char*)"O_DIRECT"; + fprintf(stderr, "InnoDB: using O_DIRECT due to atomic writes.\n"); + } +#endif /* !_WIN32 */ +#ifdef HAVE_POSIX_FALLOCATE + /* Due to a bug in directFS, using atomics needs + * posix_fallocate to extend the file + * pwrite() past end of the file won't work + */ + srv_use_posix_fallocate = TRUE; +#endif /* HAVE_POSIX_FALLOCATE */ + } + +#ifdef HAVE_PSI_INTERFACE + /* Register keys with MySQL performance schema */ + int count; + + count = array_elements(all_pthread_mutexes); + mysql_mutex_register("innodb", all_pthread_mutexes, count); + +# ifdef UNIV_PFS_MUTEX + count = array_elements(all_innodb_mutexes); + mysql_mutex_register("innodb", all_innodb_mutexes, count); +# endif /* UNIV_PFS_MUTEX */ + +# ifdef UNIV_PFS_RWLOCK + count = array_elements(all_innodb_rwlocks); + mysql_rwlock_register("innodb", all_innodb_rwlocks, count); +# endif /* UNIV_PFS_RWLOCK */ + +# ifdef UNIV_PFS_THREAD + count = array_elements(all_innodb_threads); + mysql_thread_register("innodb", all_innodb_threads, count); +# endif /* UNIV_PFS_THREAD */ + +# ifdef UNIV_PFS_IO + count = array_elements(all_innodb_files); + mysql_file_register("innodb", all_innodb_files, count); +# endif /* UNIV_PFS_IO */ + + count = array_elements(all_innodb_conds); + mysql_cond_register("innodb", all_innodb_conds, count); +#endif /* HAVE_PSI_INTERFACE */ + + /* Since we in this module access directly the fields of a trx + struct, and due to different headers and flags it might happen that + ib_mutex_t has a different size in this module and in InnoDB + modules, we check at run time that the size is the
same in + these compilation modules. */ + + err = innobase_start_or_create_for_mysql(); + + if (err != DB_SUCCESS) { + goto mem_free_and_error; } - trx = thd_to_trx(thd); + /* Adjust the innodb_undo_logs config object */ + innobase_undo_logs_init_default_max(); - if (trx != NULL) { - trx_search_latch_release_if_reserved(trx); + innobase_old_blocks_pct = static_cast<uint>( + buf_LRU_old_ratio_update(innobase_old_blocks_pct, TRUE)); + + ibuf_max_size_update(innobase_change_buffer_max_size); + + innobase_open_tables = hash_create(200); + mysql_mutex_init(innobase_share_mutex_key, + &innobase_share_mutex, + MY_MUTEX_INIT_FAST); + mysql_mutex_init(commit_cond_mutex_key, + &commit_cond_m, MY_MUTEX_INIT_FAST); + mysql_cond_init(commit_cond_key, &commit_cond, NULL); + mysql_mutex_init(pending_checkpoint_mutex_key, + &pending_checkpoint_mutex, + MY_MUTEX_INIT_FAST); + innodb_inited= 1; +#ifdef MYSQL_DYNAMIC_PLUGIN + if (innobase_hton != p) { + innobase_hton = reinterpret_cast<handlerton*>(p); + *innobase_hton = *innodb_hton_ptr; } +#endif /* MYSQL_DYNAMIC_PLUGIN */ - return(0); + /* Get the current high water mark format. */ + innobase_file_format_max = (char*) trx_sys_file_format_max_get(); + + /* Currently, monitor counter information are not persistent. 
*/ + memset(monitor_set_tbl, 0, sizeof monitor_set_tbl); + + memset(innodb_counter_value, 0, sizeof innodb_counter_value); + + /* Do this as late as possible so server is fully starts up, + since we might get some initial stats if user choose to turn + on some counters from start up */ + if (innobase_enable_monitor_counter) { + innodb_enable_monitor_at_startup( + innobase_enable_monitor_counter); + } + + /* Turn on monitor counters that are default on */ + srv_mon_default_on(); + + DBUG_RETURN(FALSE); +error: + DBUG_RETURN(TRUE); } -#ifdef WITH_WSREP -static int -wsrep_abort_transaction(handlerton* hton, THD *bf_thd, THD *victim_thd, - my_bool signal); -static void -wsrep_fake_trx_id(handlerton* hton, THD *thd); -static int innobase_wsrep_set_checkpoint(handlerton* hton, const XID* xid); -static int innobase_wsrep_get_checkpoint(handlerton* hton, XID* xid); -#endif -/********************************************************************//** -Increments innobase_active_counter and every INNOBASE_WAKE_INTERVALth -time calls srv_active_wake_master_thread. This function should be used -when a single database operation may introduce a small need for -server utility activity, like checkpointing. */ -static inline +/** Shut down the InnoDB storage engine. 
+@return 0 */ +static +int +innobase_end(handlerton*, ha_panic_function) +{ + DBUG_ENTER("innobase_end"); + + if (innodb_inited) { + + THD *thd= current_thd; + if (thd) { // may be UNINSTALL PLUGIN statement + trx_t* trx = thd_to_trx(thd); + if (trx) { + trx_free_for_mysql(trx); + } + } + + srv_fast_shutdown = (ulint) innobase_fast_shutdown; + + innodb_inited = 0; + hash_table_free(innobase_open_tables); + innobase_open_tables = NULL; + innodb_shutdown(); + srv_free_paths_and_sizes(); + my_free(internal_innobase_data_file_path); + mysql_mutex_destroy(&innobase_share_mutex); + mysql_mutex_destroy(&commit_cond_m); + mysql_cond_destroy(&commit_cond); + mysql_mutex_destroy(&pending_checkpoint_mutex); + } + + DBUG_RETURN(0); +} + +/****************************************************************//** +Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes +the logs, and the name of this function should be innobase_checkpoint. +@return TRUE if error */ +static +bool +innobase_flush_logs( +/*================*/ + handlerton* hton) /*!< in/out: InnoDB handlerton */ +{ + bool result = 0; + + DBUG_ENTER("innobase_flush_logs"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + if (!srv_read_only_mode) { + log_buffer_flush_to_disk(); + } + + DBUG_RETURN(result); +} + +/*****************************************************************//** +Commits a transaction in an InnoDB database. 
*/ +static void -innobase_active_small(void) -/*=======================*/ +innobase_commit_low( +/*================*/ + trx_t* trx) /*!< in: transaction handle */ { - innobase_active_counter++; +#ifdef WITH_WSREP + THD* thd = (THD*)trx->mysql_thd; + const char* tmp = 0; + if (wsrep_on((void*)thd)) { +#ifdef WSREP_PROC_INFO + char info[64]; + info[sizeof(info) - 1] = '\0'; + snprintf(info, sizeof(info) - 1, + "innobase_commit_low():trx_commit_for_mysql(%lld)", + (long long) wsrep_thd_trx_seqno(thd)); + tmp = thd_proc_info(thd, info); - if ((innobase_active_counter % INNOBASE_WAKE_INTERVAL) == 0) { - srv_active_wake_master_thread(); +#else + tmp = thd_proc_info(thd, "innobase_commit_low()"); +#endif /* WSREP_PROC_INFO */ + } +#endif /* WITH_WSREP */ + if (trx_is_started(trx)) { + + trx_commit_for_mysql(trx); } +#ifdef WITH_WSREP + if (wsrep_on((void*)thd)) { thd_proc_info(thd, tmp); } +#endif /* WITH_WSREP */ } -/********************************************************************//** -Converts an InnoDB error code to a MySQL error code and also tells to MySQL -about a possible transaction rollback inside InnoDB caused by a lock wait -timeout or a deadlock. -@return MySQL error code */ -extern "C" UNIV_INTERN +/*****************************************************************//** +Creates an InnoDB transaction struct for the thd if it does not yet have one. +Starts a new InnoDB transaction if a transaction is not yet started. And +assigns a new snapshot for a consistent read if the transaction does not yet +have one. 
+@return 0 */ +static int -convert_error_code_to_mysql( -/*========================*/ - int error, /*!< in: InnoDB error code */ - ulint flags, /*!< in: InnoDB table flags, or 0 */ - THD* thd) /*!< in: user thread handle or NULL */ +innobase_start_trx_and_assign_read_view( +/*====================================*/ + handlerton* hton, /*!< in: Innodb handlerton */ + THD* thd) /*!< in: MySQL thread handle of the user for + whom the transaction should be committed */ { - switch (error) { - case DB_SUCCESS: - return(0); + trx_t* trx; - case DB_INTERRUPTED: - return(HA_ERR_ABORTED_BY_USER); + DBUG_ENTER("innobase_start_trx_and_assign_read_view"); + DBUG_ASSERT(hton == innodb_hton_ptr); - case DB_FOREIGN_EXCEED_MAX_CASCADE: - push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN, - HA_ERR_ROW_IS_REFERENCED, - "InnoDB: Cannot delete/update " - "rows with cascading foreign key " - "constraints that exceed max " - "depth of %d. Please " - "drop extra constraints and try " - "again", DICT_FK_MAX_RECURSIVE_LOAD); + /* Create a new trx struct for thd, if it does not yet have one */ - /* fall through */ + trx = check_trx_exists(thd); - case DB_ERROR: - default: - return(-1); /* unspecified error */ + /* This is just to play safe: release a possible FIFO ticket and + search latch. Since we can potentially reserve the trx_sys->mutex, + we have to release the search system latch first to obey the latching + order. */ + + trx_search_latch_release_if_reserved(trx); + + innobase_srv_conc_force_exit_innodb(trx); + + /* If the transaction is not started yet, start it */ + + trx_start_if_not_started_xa(trx); + + /* Assign a read view if the transaction does not have it yet. + Do this only if transaction is using REPEATABLE READ isolation + level. 
*/ + trx->isolation_level = innobase_map_isolation_level( + thd_get_trx_isolation(thd)); + + if (trx->isolation_level == TRX_ISO_REPEATABLE_READ) { + trx_assign_read_view(trx); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + "InnoDB: WITH CONSISTENT SNAPSHOT " + "was ignored because this phrase " + "can only be used with " + "REPEATABLE READ isolation level."); + } + + /* Set the MySQL flag to mark that there is an active transaction */ + + innobase_register_trx(hton, current_thd, trx); + + DBUG_RETURN(0); +} + +static +void +innobase_commit_ordered_2( +/*============*/ + trx_t* trx, /*!< in: Innodb transaction */ + THD* thd) /*!< in: MySQL thread handle */ +{ + DBUG_ENTER("innobase_commit_ordered_2"); + + /* We need current binlog position for mysqlbackup to work. + Note, the position is current because commit_ordered is guaranteed + to be called in same sequenece as writing to binlog. */ + +retry: + if (innobase_commit_concurrency > 0) { + mysql_mutex_lock(&commit_cond_m); + commit_threads++; + + if (commit_threads > innobase_commit_concurrency) { + commit_threads--; + mysql_cond_wait(&commit_cond, + &commit_cond_m); + mysql_mutex_unlock(&commit_cond_m); + goto retry; + } + else { + mysql_mutex_unlock(&commit_cond_m); + } + } + + unsigned long long pos; + thd_binlog_pos(thd, &trx->mysql_log_file_name, &pos); + trx->mysql_log_offset= static_cast<ib_int64_t>(pos); + /* Don't do write + flush right now. For group commit + to work we want to do the flush in the innobase_commit() + method, which runs without holding any locks. 
*/ + trx->flush_log_later = TRUE; + innobase_commit_low(trx); + trx->flush_log_later = FALSE; + + if (innobase_commit_concurrency > 0) { + mysql_mutex_lock(&commit_cond_m); + commit_threads--; + mysql_cond_signal(&commit_cond); + mysql_mutex_unlock(&commit_cond_m); + } + + DBUG_VOID_RETURN; +} - case DB_DUPLICATE_KEY: - /* Be cautious with returning this error, since - mysql could re-enter the storage layer to get - duplicated key info, the operation requires a - valid table handle and/or transaction information, - which might not always be available in the error - handling stage. */ - return(HA_ERR_FOUND_DUPP_KEY); +/*****************************************************************//** +Perform the first, fast part of InnoDB commit. - case DB_FOREIGN_DUPLICATE_KEY: - return(HA_ERR_FOREIGN_DUPLICATE_KEY); +Doing it in this call ensures that we get the same commit order here +as in binlog and any other participating transactional storage engines. - case DB_MISSING_HISTORY: - return(HA_ERR_TABLE_DEF_CHANGED); +Note that we want to do as little as really needed here, as we run +under a global mutex. The expensive fsync() is done later, in +innobase_commit(), without a lock so group commit can take place. - case DB_RECORD_NOT_FOUND: - return(HA_ERR_NO_ACTIVE_RECORD); +Note also that this method can be called from a different thread than +the one handling the rest of the transaction. 
*/ +static +void +innobase_commit_ordered( +/*============*/ + handlerton *hton, /*!< in: Innodb handlerton */ + THD* thd, /*!< in: MySQL thread handle of the user for whom + the transaction should be committed */ + bool all) /*!< in: TRUE - commit transaction + FALSE - the current SQL statement ended */ +{ + trx_t* trx; + DBUG_ENTER("innobase_commit_ordered"); + DBUG_ASSERT(hton == innodb_hton_ptr); - case DB_DEADLOCK: - /* Since we rolled back the whole transaction, we must - tell it also to MySQL so that MySQL knows to empty the - cached binlog for this transaction */ + trx = check_trx_exists(thd); - if (thd) { - thd_mark_transaction_to_rollback(thd, TRUE); - } + /* Since we will reserve the kernel mutex, we must not be holding the + search system latch, or we will disobey the latching order. But we + already released it in innobase_xa_prepare() (if not before), so just + have an assert here.*/ + ut_ad(!trx->has_search_latch); - return(HA_ERR_LOCK_DEADLOCK); + if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) { + /* We cannot throw error here; instead we will catch this error + again in innobase_commit() and report it from there. */ + DBUG_VOID_RETURN; + } - case DB_LOCK_WAIT_TIMEOUT: - /* Starting from 5.0.13, we let MySQL just roll back the - latest SQL statement in a lock wait timeout. Previously, we - rolled back the whole transaction. */ + /* commit_ordered is only called when committing the whole transaction + (or an SQL statement when autocommit is on). 
*/ + DBUG_ASSERT(all || + (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))); - if (thd) { - thd_mark_transaction_to_rollback( - thd, (bool)row_rollback_on_timeout); - } + innobase_commit_ordered_2(trx, thd); - return(HA_ERR_LOCK_WAIT_TIMEOUT); + trx_set_active_commit_ordered(trx); - case DB_NO_REFERENCED_ROW: - return(HA_ERR_NO_REFERENCED_ROW); + DBUG_VOID_RETURN; +} - case DB_ROW_IS_REFERENCED: - return(HA_ERR_ROW_IS_REFERENCED); +/*****************************************************************//** +Commits a transaction in an InnoDB database or marks an SQL statement +ended. +@return 0 */ +static +int +innobase_commit( +/*============*/ + handlerton* hton, /*!< in: Innodb handlerton */ + THD* thd, /*!< in: MySQL thread handle of the + user for whom the transaction should + be committed */ + bool commit_trx) /*!< in: true - commit transaction + false - the current SQL statement + ended */ +{ + trx_t* trx; - case DB_CANNOT_ADD_CONSTRAINT: - case DB_CHILD_NO_INDEX: - case DB_PARENT_NO_INDEX: - return(HA_ERR_CANNOT_ADD_FOREIGN); + DBUG_ENTER("innobase_commit"); + DBUG_ASSERT(hton == innodb_hton_ptr); + DBUG_PRINT("trans", ("ending transaction")); - case DB_CANNOT_DROP_CONSTRAINT: + trx = check_trx_exists(thd); - return(HA_ERR_ROW_IS_REFERENCED); /* TODO: This is a bit - misleading, a new MySQL error - code should be introduced */ + /* Since we will reserve the trx_sys->mutex, we have to release + the search system latch first to obey the latching order. */ - case DB_CORRUPTION: - return(HA_ERR_CRASHED); + if (trx->has_search_latch && !trx_is_active_commit_ordered(trx)) { + trx_search_latch_release_if_reserved(trx); + } - case DB_OUT_OF_FILE_SPACE: - return(HA_ERR_RECORD_FILE_FULL); + /* Transaction is deregistered only in a commit or a rollback. If + it is deregistered we know there cannot be resources to be freed + and we could return immediately. For the time being, we play safe + and do the cleanup though there should be nothing to clean up. 
*/ - case DB_TABLE_IN_FK_CHECK: - return(HA_ERR_TABLE_IN_FK_CHECK); + if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) { - case DB_TABLE_IS_BEING_USED: - return(HA_ERR_WRONG_COMMAND); + sql_print_error("Transaction not registered for MySQL 2PC, " + "but transaction is active"); + } - case DB_TABLE_NOT_FOUND: - return(HA_ERR_NO_SUCH_TABLE); + if (commit_trx + || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { - case DB_TOO_BIG_RECORD: { - /* If prefix is true then a 768-byte prefix is stored - locally for BLOB fields. Refer to dict_table_get_format() */ - bool prefix = ((flags & DICT_TF_FORMAT_MASK) - >> DICT_TF_FORMAT_SHIFT) < UNIV_FORMAT_B; - my_printf_error(ER_TOO_BIG_ROWSIZE, - "Row size too large (> %lu). Changing some columns " - "to TEXT or BLOB %smay help. In current row " - "format, BLOB prefix of %d bytes is stored inline.", - MYF(0), - page_get_free_space_of_empty(flags & - DICT_TF_COMPACT) / 2, - prefix ? "or using ROW_FORMAT=DYNAMIC " - "or ROW_FORMAT=COMPRESSED ": "", - prefix ? DICT_MAX_FIXED_COL_LEN : 0); - return(HA_ERR_TO_BIG_ROW); - } + /* Run the fast part of commit if we did not already. */ + if (!trx_is_active_commit_ordered(trx)) { + innobase_commit_ordered_2(trx, thd); + } - case DB_TOO_BIG_INDEX_COL: - my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), - DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags)); - return(HA_ERR_INDEX_COL_TOO_LONG); + /* We were instructed to commit the whole transaction, or + this is an SQL statement end and autocommit is on */ - case DB_NO_SAVEPOINT: - return(HA_ERR_NO_SAVEPOINT); + /* At this point commit order is fixed and transaction is + visible to others. So we can wakeup other commits waiting for + this one, to allow then to group commit with us. 
*/ + thd_wakeup_subsequent_commits(thd, 0); - case DB_LOCK_TABLE_FULL: - /* Since we rolled back the whole transaction, we must - tell it also to MySQL so that MySQL knows to empty the - cached binlog for this transaction */ + /* We did the first part already in innobase_commit_ordered(), + Now finish by doing a write + flush of logs. */ + trx_commit_complete_for_mysql(trx); + trx_deregister_from_2pc(trx); + } else { + /* We just mark the SQL statement ended and do not do a + transaction commit */ - if (thd) { - thd_mark_transaction_to_rollback(thd, TRUE); - } + /* If we had reserved the auto-inc lock for some + table in this SQL statement we release it now */ - return(HA_ERR_LOCK_TABLE_FULL); + lock_unlock_table_autoinc(trx); - case DB_PRIMARY_KEY_IS_NULL: - return(ER_PRIMARY_CANT_HAVE_NULL); + /* Store the current undo_no of the transaction so that we + know where to roll back if we have to roll back the next + SQL statement */ - case DB_TOO_MANY_CONCURRENT_TRXS: - /* New error code HA_ERR_TOO_MANY_CONCURRENT_TRXS is only - available in 5.1.38 and later, but the plugin should still - work with previous versions of MySQL. */ -#ifdef HA_ERR_TOO_MANY_CONCURRENT_TRXS - return(HA_ERR_TOO_MANY_CONCURRENT_TRXS); -#else /* HA_ERR_TOO_MANY_CONCURRENT_TRXS */ - return(HA_ERR_RECORD_FILE_FULL); -#endif /* HA_ERR_TOO_MANY_CONCURRENT_TRXS */ - case DB_UNSUPPORTED: - return(HA_ERR_UNSUPPORTED); - case DB_INDEX_CORRUPT: - return(HA_ERR_INDEX_CORRUPT); - case DB_UNDO_RECORD_TOO_BIG: - return(HA_ERR_UNDO_REC_TOO_BIG); - case DB_OUT_OF_MEMORY: - return(HA_ERR_OUT_OF_MEM); - case DB_IDENTIFIER_TOO_LONG: - return(HA_ERR_INTERNAL_ERROR); + trx_mark_sql_stat_end(trx); } -} -/*************************************************************//** -Prints info of a THD object (== user session thread) to the given file. 
*/ -extern "C" UNIV_INTERN -void -innobase_mysql_print_thd( -/*=====================*/ - FILE* f, /*!< in: output stream */ - void* thd, /*!< in: pointer to a MySQL THD object */ - uint max_query_len) /*!< in: max query length to print, or 0 to - use the default max length */ -{ - char buffer[1024]; + trx->n_autoinc_rows = 0; /* Reset the number AUTO-INC rows required */ - fputs(thd_security_context((THD*) thd, buffer, sizeof buffer, - max_query_len), f); - putc('\n', f); + /* This is a statement level variable. */ + trx->fts_next_doc_id = 0; + + innobase_srv_conc_force_exit_innodb(trx); + + DBUG_RETURN(0); } -/******************************************************************//** -Get the variable length bounds of the given character set. */ -extern "C" UNIV_INTERN -void -innobase_get_cset_width( -/*====================*/ - ulint cset, /*!< in: MySQL charset-collation code */ - ulint* mbminlen, /*!< out: minimum length of a char (in bytes) */ - ulint* mbmaxlen) /*!< out: maximum length of a char (in bytes) */ +/*****************************************************************//** +Rolls back a transaction or the latest SQL statement. 
+@return 0 or error number */ +static +int +innobase_rollback( +/*==============*/ + handlerton* hton, /*!< in: Innodb handlerton */ + THD* thd, /*!< in: handle to the MySQL thread + of the user whose transaction should + be rolled back */ + bool rollback_trx) /*!< in: TRUE - rollback entire + transaction FALSE - rollback the current + statement only */ { - CHARSET_INFO* cs; - ut_ad(cset < 256); - ut_ad(mbminlen); - ut_ad(mbmaxlen); - - cs = all_charsets[cset]; - if (cs) { - *mbminlen = cs->mbminlen; - *mbmaxlen = cs->mbmaxlen; - ut_ad(*mbminlen < DATA_MBMAX); - ut_ad(*mbmaxlen < DATA_MBMAX); - } else { - THD* thd = current_thd; + dberr_t error; + trx_t* trx; - if (thd && thd_sql_command(thd) == SQLCOM_DROP_TABLE) { + DBUG_ENTER("innobase_rollback"); + DBUG_ASSERT(hton == innodb_hton_ptr); + DBUG_PRINT("trans", ("aborting transaction")); - /* Fix bug#46256: allow tables to be dropped if the - collation is not found, but issue a warning. */ - if ((global_system_variables.log_warnings) - && (cset != 0)){ + trx = check_trx_exists(thd); - sql_print_warning( - "Unknown collation #%lu.", cset); - } - } else { + /* Release a possible FIFO ticket and search latch. Since we will + reserve the trx_sys->mutex, we have to release the search system + latch first to obey the latching order. */ - ut_a(cset == 0); - } + trx_search_latch_release_if_reserved(trx); - *mbminlen = *mbmaxlen = 0; - } -} + innobase_srv_conc_force_exit_innodb(trx); -/******************************************************************//** -Converts an identifier to a table name. 
*/ -extern "C" UNIV_INTERN -void -innobase_convert_from_table_id( -/*===========================*/ - struct charset_info_st* cs, /*!< in: the 'from' character set */ - char* to, /*!< out: converted identifier */ - const char* from, /*!< in: identifier to convert */ - ulint len) /*!< in: length of 'to', in bytes */ -{ - uint errors; + trx->n_autoinc_rows = 0; /* Reset the number AUTO-INC rows required */ - strconvert(cs, from, &my_charset_filename, to, (uint) len, &errors); -} + /* If we had reserved the auto-inc lock for some table (if + we come here to roll back the latest SQL statement) we + release it now before a possibly lengthy rollback */ -/********************************************************************** -Check if the length of the identifier exceeds the maximum allowed. -return true when length of identifier is too long. */ -extern "C" -my_bool -innobase_check_identifier_length( -/*=============================*/ - const char* id) /* in: FK identifier to check excluding the - database portion. */ -{ - int well_formed_error = 0; - CHARSET_INFO *cs = system_charset_info; - DBUG_ENTER("innobase_check_identifier_length"); + lock_unlock_table_autoinc(trx); - uint res = cs->cset->well_formed_len(cs, id, id + strlen(id), - NAME_CHAR_LEN, - &well_formed_error); + /* This is a statement level variable. */ + trx->fts_next_doc_id = 0; - if (well_formed_error || res == NAME_CHAR_LEN) { - my_error(ER_TOO_LONG_IDENT, MYF(0), id); - DBUG_RETURN(true); + if (rollback_trx + || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + + error = trx_rollback_for_mysql(trx); + trx_deregister_from_2pc(trx); + } else { + error = trx_rollback_last_sql_stat_for_mysql(trx); } - DBUG_RETURN(false); + + DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); } -/******************************************************************//** -Converts an identifier to UTF-8. 
*/ -extern "C" UNIV_INTERN -void -innobase_convert_from_id( -/*=====================*/ - struct charset_info_st* cs, /*!< in: the 'from' character set */ - char* to, /*!< out: converted identifier */ - const char* from, /*!< in: identifier to convert */ - ulint len) /*!< in: length of 'to', in bytes */ +/*****************************************************************//** +Rolls back a transaction +@return 0 or error number */ +static +int +innobase_rollback_trx( +/*==================*/ + trx_t* trx) /*!< in: transaction */ { - uint errors; + dberr_t error = DB_SUCCESS; - strconvert(cs, from, system_charset_info, to, (uint) len, &errors); -} + DBUG_ENTER("innobase_rollback_trx"); + DBUG_PRINT("trans", ("aborting transaction")); -/********************************************************************** -Converts an identifier from my_charset_filename to UTF-8 charset. -@return result string length, as returned by strconvert() */ -extern "C" -uint -innobase_convert_to_system_charset( -/*===============================*/ - char* to, /* out: converted identifier */ - const char* from, /* in: identifier to convert */ - ulint len, /* in: length of 'to', in bytes */ - uint* errors) /* out: error return */ -{ - CHARSET_INFO* cs1 = &my_charset_filename; - CHARSET_INFO* cs2 = system_charset_info; + /* Release a possible FIFO ticket and search latch. Since we will + reserve the trx_sys->mutex, we have to release the search system + latch first to obey the latching order. */ - return(strconvert(cs1, from, cs2, to, len, errors)); -} + trx_search_latch_release_if_reserved(trx); -/******************************************************************//** -Compares NUL-terminated UTF-8 strings case insensitively. 
-@return 0 if a=b, <0 if a<b, >1 if a>b */ -extern "C" UNIV_INTERN -int -innobase_strcasecmp( -/*================*/ - const char* a, /*!< in: first string to compare */ - const char* b) /*!< in: second string to compare */ -{ - return(my_strcasecmp(system_charset_info, a, b)); -} + innobase_srv_conc_force_exit_innodb(trx); -/******************************************************************//** -Strip dir name from a full path name and return only the file name -@return file name or "null" if no file name */ -extern "C" UNIV_INTERN -const char* -innobase_basename( -/*==============*/ - const char* path_name) /*!< in: full path name */ -{ - const char* name = base_name(path_name); + /* If we had reserved the auto-inc lock for some table (if + we come here to roll back the latest SQL statement) we + release it now before a possibly lengthy rollback */ - return((name) ? name : "null"); + lock_unlock_table_autoinc(trx); + + if (!trx->read_only) { + error = trx_rollback_for_mysql(trx); + } + + DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); } -/******************************************************************//** -Makes all characters in a NUL-terminated UTF-8 string lower case. */ -extern "C" UNIV_INTERN + +struct pending_checkpoint { + struct pending_checkpoint *next; + handlerton *hton; + void *cookie; + ib_uint64_t lsn; +}; +static struct pending_checkpoint *pending_checkpoint_list; +static struct pending_checkpoint *pending_checkpoint_list_end; + +/*****************************************************************//** +Handle a commit checkpoint request from server layer. +We put the request in a queue, so that we can notify upper layer about +checkpoint complete when we have flushed the redo log. 
+If we have already flushed all relevant redo log, we notify immediately.*/ +static void -innobase_casedn_str( -/*================*/ - char* a) /*!< in/out: string to put in lower case */ +innobase_checkpoint_request( + handlerton *hton, + void *cookie) { - my_casedn_str(system_charset_info, a); -} + ib_uint64_t lsn; + ib_uint64_t flush_lsn; + struct pending_checkpoint * entry; + + /* Do the allocation outside of lock to reduce contention. The normal + case is that not everything is flushed, so we will need to enqueue. */ + entry = static_cast<struct pending_checkpoint *> + (my_malloc(sizeof(*entry), MYF(MY_WME))); + if (!entry) { + sql_print_error("Failed to allocate %u bytes." + " Commit checkpoint will be skipped.", + static_cast<unsigned>(sizeof(*entry))); + return; + } -/**********************************************************************//** -Determines the connection character set. -@return connection character set */ -extern "C" UNIV_INTERN -struct charset_info_st* -innobase_get_charset( -/*=================*/ - void* mysql_thd) /*!< in: MySQL thread handle */ -{ - return(thd_charset((THD*) mysql_thd)); + entry->next = NULL; + entry->hton = hton; + entry->cookie = cookie; + + mysql_mutex_lock(&pending_checkpoint_mutex); + lsn = log_get_lsn(); + flush_lsn = log_get_flush_lsn(); + if (lsn > flush_lsn) { + /* Put the request in queue. + When the log gets flushed past the lsn, we will remove the + entry from the queue and notify the upper layer. */ + entry->lsn = lsn; + if (pending_checkpoint_list_end) { + pending_checkpoint_list_end->next = entry; + /* There is no need to order the entries in the list + by lsn. The upper layer can accept notifications in + any order, and short delays in notifications do not + significantly impact performance. */ + } else { + pending_checkpoint_list = entry; + } + pending_checkpoint_list_end = entry; + entry = NULL; + } + mysql_mutex_unlock(&pending_checkpoint_mutex); + + if (entry) { + /* We are already flushed. 
Notify the checkpoint immediately. */ + commit_checkpoint_notify_ha(entry->hton, entry->cookie); + my_free(entry); + } } -/**********************************************************************//** -Determines the current SQL statement. -@return SQL statement string */ -extern "C" UNIV_INTERN -const char* -innobase_get_stmt( -/*==============*/ - void* mysql_thd, /*!< in: MySQL thread handle */ - size_t* length) /*!< out: length of the SQL statement */ +/*****************************************************************//** +Log code calls this whenever log has been written and/or flushed up +to a new position. We use this to notify upper layer of a new commit +checkpoint when necessary.*/ +UNIV_INTERN +void +innobase_mysql_log_notify( +/*===============*/ + ib_uint64_t write_lsn, /*!< in: LSN written to log file */ + ib_uint64_t flush_lsn) /*!< in: LSN flushed to disk */ { - LEX_STRING* stmt; + struct pending_checkpoint * pending; + struct pending_checkpoint * entry; + struct pending_checkpoint * last_ready; + + /* It is safe to do a quick check for NULL first without lock. + Even if we should race, we will at most skip one checkpoint and + take the next one, which is harmless. */ + if (!pending_checkpoint_list) + return; - stmt = thd_query_string((THD*) mysql_thd); - *length = stmt->length; - return(stmt->str); -} + mysql_mutex_lock(&pending_checkpoint_mutex); + pending = pending_checkpoint_list; + if (!pending) + { + mysql_mutex_unlock(&pending_checkpoint_mutex); + return; + } -/**********************************************************************//** -Get the current setting of the lower_case_table_names global parameter from -mysqld.cc. We do a dirty read because for one there is no synchronization -object and secondly there is little harm in doing so even if we get a torn -read. 
-@return value of lower_case_table_names */ -extern "C" UNIV_INTERN -ulint -innobase_get_lower_case_table_names(void) -/*=====================================*/ -{ - return(lower_case_table_names); -} + last_ready = NULL; + for (entry = pending; entry != NULL; entry = entry -> next) + { + /* Notify checkpoints up until the first entry that has not + been fully flushed to the redo log. Since we do not maintain + the list ordered, in principle there could be more entries + later than were also flushed. But there is no harm in + delaying notifications for those a bit. And in practise, the + list is unlikely to have more than one element anyway, as we + flush the redo log at least once every second. */ + if (entry->lsn > flush_lsn) + break; + last_ready = entry; + } -/*********************************************************************//** -Creates a temporary file. -@return temporary file descriptor, or < 0 on error */ -extern "C" UNIV_INTERN -int -innobase_mysql_tmpfile(void) -/*========================*/ -{ -#ifdef WITH_INNODB_DISALLOW_WRITES - os_event_wait(srv_allow_writes_event); -#endif /* WITH_INNODB_DISALLOW_WRITES */ - int fd2 = -1; - File fd; + if (last_ready) + { + /* We found some pending checkpoints that are now flushed to + disk. So remove them from the list. */ + pending_checkpoint_list = entry; + if (!entry) + pending_checkpoint_list_end = NULL; + } - DBUG_EXECUTE_IF( - "innobase_tmpfile_creation_failure", - return(-1); - ); + mysql_mutex_unlock(&pending_checkpoint_mutex); - fd = mysql_tmpfile("ib"); + if (!last_ready) + return; - if (fd >= 0) { - /* Copy the file descriptor, so that the additional resources - allocated by create_temp_file() can be freed by invoking - my_close(). + /* Now that we have released the lock, notify upper layer about all + commit checkpoints that have now completed. 
*/ + for (;;) { + entry = pending; + pending = pending->next; - Because the file descriptor returned by this function - will be passed to fdopen(), it will be closed by invoking - fclose(), which in turn will invoke close() instead of - my_close(). */ + commit_checkpoint_notify_ha(entry->hton, entry->cookie); -#ifdef _WIN32 - /* Note that on Windows, the integer returned by mysql_tmpfile - has no relation to C runtime file descriptor. Here, we need - to call my_get_osfhandle to get the HANDLE and then convert it - to C runtime filedescriptor. */ - { - HANDLE hFile = my_get_osfhandle(fd); - HANDLE hDup; - BOOL bOK = - DuplicateHandle(GetCurrentProcess(), hFile, GetCurrentProcess(), - &hDup, 0, FALSE, DUPLICATE_SAME_ACCESS); - if(bOK) { - fd2 = _open_osfhandle((intptr_t)hDup,0); - } - else { - my_osmaperr(GetLastError()); - fd2 = -1; - } - } -#else -#ifdef F_DUPFD_CLOEXEC - fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 0); -#else - fd2 = dup(fd); -#endif -#endif - if (fd2 < 0) { - DBUG_PRINT("error",("Got error %d on dup",fd2)); - my_errno=errno; - my_error(EE_OUT_OF_FILERESOURCES, - MYF(ME_BELL+ME_WAITTANG), - "ib*", my_errno); - } - my_close(fd, MYF(MY_WME)); + my_free(entry); + if (entry == last_ready) + break; } - return(fd2); -} - -/*********************************************************************//** -Wrapper around MySQL's copy_and_convert function. 
-@return number of bytes copied to 'to' */ -extern "C" UNIV_INTERN -ulint -innobase_convert_string( -/*====================*/ - void* to, /*!< out: converted string */ - ulint to_length, /*!< in: number of bytes reserved - for the converted string */ - CHARSET_INFO* to_cs, /*!< in: character set to convert to */ - const void* from, /*!< in: string to convert */ - ulint from_length, /*!< in: number of bytes to convert */ - CHARSET_INFO* from_cs, /*!< in: character set to convert from */ - uint* errors) /*!< out: number of errors encountered - during the conversion */ -{ - return(copy_and_convert((char*)to, (uint32) to_length, to_cs, - (const char*)from, (uint32) from_length, from_cs, - errors)); } -/*******************************************************************//** -Formats the raw data in "data" (in InnoDB on-disk format) that is of -type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes -the result to "buf". The result is converted to "system_charset_info". -Not more than "buf_size" bytes are written to "buf". -The result is always NUL-terminated (provided buf_size > 0) and the -number of bytes that were written to "buf" is returned (including the -terminating NUL). -@return number of bytes that were written */ -extern "C" UNIV_INTERN -ulint -innobase_raw_format( -/*================*/ - const char* data, /*!< in: raw data */ - ulint data_len, /*!< in: raw data length - in bytes */ - ulint charset_coll, /*!< in: charset collation */ - char* buf, /*!< out: output buffer */ - ulint buf_size) /*!< in: output buffer size - in bytes */ +/*****************************************************************//** +Rolls back a transaction to a savepoint. 
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the +given name */ +static +int +innobase_rollback_to_savepoint( +/*===========================*/ + handlerton* hton, /*!< in: Innodb handlerton */ + THD* thd, /*!< in: handle to the MySQL thread + of the user whose transaction should + be rolled back to savepoint */ + void* savepoint) /*!< in: savepoint data */ { - /* XXX we use a hard limit instead of allocating - but_size bytes from the heap */ - CHARSET_INFO* data_cs; - char buf_tmp[8192]; - ulint buf_tmp_used; - uint num_errors; + ib_int64_t mysql_binlog_cache_pos; + dberr_t error; + trx_t* trx; + char name[64]; - data_cs = all_charsets[charset_coll]; + DBUG_ENTER("innobase_rollback_to_savepoint"); + DBUG_ASSERT(hton == innodb_hton_ptr); - buf_tmp_used = innobase_convert_string(buf_tmp, sizeof(buf_tmp), - system_charset_info, - data, data_len, data_cs, - &num_errors); + trx = check_trx_exists(thd); - return(ut_str_sql_format(buf_tmp, buf_tmp_used, buf, buf_size)); + /* Release a possible FIFO ticket and search latch. Since we will + reserve the trx_sys->mutex, we have to release the search system + latch first to obey the latching order. */ + + trx_search_latch_release_if_reserved(trx); + + innobase_srv_conc_force_exit_innodb(trx); + + /* TODO: use provided savepoint data area to store savepoint data */ + + longlong2str((ulint) savepoint, name, 36); + + error = trx_rollback_to_savepoint_for_mysql( + trx, name, &mysql_binlog_cache_pos); + + if (error == DB_SUCCESS && trx->fts_trx != NULL) { + fts_savepoint_rollback(trx, name); + } + + DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); } -/*********************************************************************//** -Compute the next autoinc value. +/*****************************************************************//** +Check whether innodb state allows to safely release MDL locks after +rollback to savepoint. 
+When binlog is on, MDL locks acquired after savepoint unit are not +released if there are any locks held in InnoDB. +@return true if it is safe, false if its not safe. */ +static +bool +innobase_rollback_to_savepoint_can_release_mdl( +/*===========================================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + THD* thd) /*!< in: handle to the MySQL thread + of the user whose transaction should + be rolled back to savepoint */ +{ + trx_t* trx; -For MySQL replication the autoincrement values can be partitioned among -the nodes. The offset is the start or origin of the autoincrement value -for a particular node. For n nodes the increment will be n and the offset -will be in the interval [1, n]. The formula tries to allocate the next -value for a particular node. + DBUG_ENTER("innobase_rollback_to_savepoint_can_release_mdl"); + DBUG_ASSERT(hton == innodb_hton_ptr); -Note: This function is also called with increment set to the number of -values we want to reserve for multi-value inserts e.g., + trx = check_trx_exists(thd); + ut_ad(trx); - INSERT INTO T VALUES(), (), (); + /* If transaction has not acquired any locks then it is safe + to release MDL after rollback to savepoint */ + if (!(UT_LIST_GET_LEN(trx->lock.trx_locks))) { + DBUG_RETURN(true); + } -innobase_next_autoinc() will be called with increment set to 3 where -autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for -the multi-value INSERT above. -@return the next value */ + DBUG_RETURN(false); +} + +/*****************************************************************//** +Release transaction savepoint name. 
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the +given name */ static -ulonglong -innobase_next_autoinc( -/*==================*/ - ulonglong current, /*!< in: Current value */ - ulonglong need, /*!< in: count of values needed */ - ulonglong step, /*!< in: AUTOINC increment step */ - ulonglong offset, /*!< in: AUTOINC offset */ - ulonglong max_value) /*!< in: max value for type */ +int +innobase_release_savepoint( +/*=======================*/ + handlerton* hton, /*!< in: handlerton for Innodb */ + THD* thd, /*!< in: handle to the MySQL thread + of the user whose transaction's + savepoint should be released */ + void* savepoint) /*!< in: savepoint data */ { - ulonglong next_value; - ulonglong block = need * step; + dberr_t error; + trx_t* trx; + char name[64]; - /* Should never be 0. */ - ut_a(need > 0); - ut_a(block > 0); - ut_a(max_value > 0); + DBUG_ENTER("innobase_release_savepoint"); + DBUG_ASSERT(hton == innodb_hton_ptr); - /* - Allow auto_increment to go over max_value up to max ulonglong. - This allows us to detect that all values are exhausted. - If we don't do this, we will return max_value several times - and get duplicate key errors instead of auto increment value - out of range. - */ - max_value= (~(ulonglong) 0); + trx = check_trx_exists(thd); - /* According to MySQL documentation, if the offset is greater than - the step then the offset is ignored. */ - if (offset > block) { - offset = 0; + if (trx->state == TRX_STATE_NOT_STARTED) { + trx_start_if_not_started(trx); } - /* Check for overflow. 
Current can be > max_value if the value is - in reality a negative value.The visual studio compilers converts - large double values automatically into unsigned long long datatype - maximum value */ - if (block >= max_value - || offset > max_value - || current >= max_value - || max_value - offset <= offset) { + /* TODO: use provided savepoint data area to store savepoint data */ - next_value = max_value; - } else { - ut_a(max_value > current); + longlong2str((ulint) savepoint, name, 36); - ulonglong free = max_value - current; + error = trx_release_savepoint_for_mysql(trx, name); - if (free < offset || free - offset <= block) { - next_value = max_value; - } else { - next_value = 0; - } + if (error == DB_SUCCESS && trx->fts_trx != NULL) { + fts_savepoint_release(trx, name); } - if (next_value == 0) { - ulonglong next; + DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); +} - if (current >= offset) { - next = (current - offset) / step; - } else { - next = 0; - block -= step; - } +/*****************************************************************//** +Sets a transaction savepoint. +@return always 0, that is, always succeeds */ +static +int +innobase_savepoint( +/*===============*/ + handlerton* hton, /*!< in: handle to the Innodb handlerton */ + THD* thd, /*!< in: handle to the MySQL thread */ + void* savepoint) /*!< in: savepoint data */ +{ + dberr_t error; + trx_t* trx; - ut_a(max_value > next); - next_value = next * step; - /* Check for multiplication overflow. */ - ut_a(next_value >= next); - ut_a(max_value > next_value); + DBUG_ENTER("innobase_savepoint"); + DBUG_ASSERT(hton == innodb_hton_ptr); - /* Check for overflow */ - if (max_value - next_value >= block) { + /* In the autocommit mode there is no sense to set a savepoint + (unless we are in sub-statement), so SQL layer ensures that + this method is never called in such situation. 
*/ - next_value += block; + trx = check_trx_exists(thd); - if (max_value - next_value >= offset) { - next_value += offset; - } else { - next_value = max_value; - } - } else { - next_value = max_value; - } - } + /* Release a possible FIFO ticket and search latch. Since we will + reserve the trx_sys->mutex, we have to release the search system + latch first to obey the latching order. */ - ut_a(next_value != 0); - ut_a(next_value <= max_value); + trx_search_latch_release_if_reserved(trx); - return(next_value); + innobase_srv_conc_force_exit_innodb(trx); + + /* Cannot happen outside of transaction */ + DBUG_ASSERT(trx_is_registered_for_2pc(trx)); + + /* TODO: use provided savepoint data area to store savepoint data */ + char name[64]; + longlong2str((ulint) savepoint,name,36); + + error = trx_savepoint_for_mysql(trx, name, (ib_int64_t)0); + + if (error == DB_SUCCESS && trx->fts_trx != NULL) { + fts_savepoint_take(trx, trx->fts_trx, name); + } + + DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL)); } -/*********************************************************************//** -Initializes some fields in an InnoDB transaction object. */ +/*****************************************************************//** +Frees a possible InnoDB trx object associated with the current THD. 
+@return 0 or error number */ static -void -innobase_trx_init( -/*==============*/ - THD* thd, /*!< in: user thread handle */ - trx_t* trx) /*!< in/out: InnoDB transaction handle */ +int +innobase_close_connection( +/*======================*/ + handlerton* hton, /*!< in: innobase handlerton */ + THD* thd) /*!< in: handle to the MySQL thread of the user + whose resources should be free'd */ { - DBUG_ENTER("innobase_trx_init"); - DBUG_ASSERT(thd == trx->mysql_thd); + trx_t* trx; - trx->check_foreigns = !thd_test_options( - thd, OPTION_NO_FOREIGN_KEY_CHECKS); + DBUG_ENTER("innobase_close_connection"); + DBUG_ASSERT(hton == innodb_hton_ptr); + trx = thd_to_trx(thd); - trx->check_unique_secondary = !thd_test_options( - thd, OPTION_RELAXED_UNIQUE_CHECKS); + ut_a(trx); - DBUG_VOID_RETURN; + if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) { + + sql_print_error("Transaction not registered for MySQL 2PC, " + "but transaction is active"); + } + + if (trx_is_started(trx) && global_system_variables.log_warnings) { + + sql_print_warning( + "MySQL is closing a connection that has an active " + "InnoDB transaction. " TRX_ID_FMT " row modifications " + "will roll back.", + trx->undo_no); + } + + innobase_rollback_trx(trx); + + trx_free_for_mysql(trx); + + DBUG_RETURN(0); } -/*********************************************************************//** -Allocates an InnoDB transaction for a MySQL handler object. -@return InnoDB transaction handle */ -extern "C" UNIV_INTERN -trx_t* -innobase_trx_allocate( -/*==================*/ - THD* thd) /*!< in: user thread handle */ +/*****************************************************************//** +Frees a possible InnoDB trx object associated with the current THD. 
+@return 0 or error number */ +UNIV_INTERN +int +innobase_close_thd( +/*===============*/ + THD* thd) /*!< in: handle to the MySQL thread of the user + whose resources should be free'd */ +{ + trx_t* trx = thd_to_trx(thd); + + if (!trx) { + return(0); + } + + return(innobase_close_connection(innodb_hton_ptr, thd)); +} + +UNIV_INTERN void lock_cancel_waiting_and_release(lock_t* lock); + +/*****************************************************************//** +Cancel any pending lock request associated with the current THD. */ +static +void +innobase_kill_query( +/*======================*/ + handlerton* hton, /*!< in: innobase handlerton */ + THD* thd, /*!< in: MySQL thread being killed */ + enum thd_kill_levels level) /*!< in: kill level */ { trx_t* trx; @@@ -7149,995 -3773,862 +7153,993 @@@ build_template_field } } - if (col_type != mtype) { - /* Column Type mismatches */ - DBUG_RETURN(FALSE); + ib_logf(IB_LOG_LEVEL_INFO, + "Looking for field %lu name %s from table %s", + i, + (tb_col_name ? tb_col_name : "NULL"), + clust_index->table->name); + + + for(ulint j=0; j < clust_index->n_user_defined_cols; j++) { + dict_field_t* ifield = &(clust_index->fields[j]); + ib_logf(IB_LOG_LEVEL_INFO, + "InnoDB Table %s field %lu name %s", + clust_index->table->name, + j, + (ifield ? ifield->name : "NULL")); } - innodb_idx_fld++; + for(ulint j=0; j < table->s->stored_fields; j++) { + ib_logf(IB_LOG_LEVEL_INFO, + "MySQL table %s field %lu name %s", + table->s->table_name.str, + j, + table->field[j]->field_name); + } + + ib_logf(IB_LOG_LEVEL_ERROR, + "Clustered record field for column %lu" + " not found table n_user_defined %d" + " index n_user_defined %d" + " InnoDB table %s field name %s" + " MySQL table %s field name %s n_fields %d" + " query %s", + i, + clust_index->n_user_defined_cols, + clust_index->table->n_cols - DATA_N_SYS_COLS, + clust_index->table->name, + (field ? field->name : "NULL"), + table->s->table_name.str, + (tb_col_name ? 
tb_col_name : "NULL"), + table->s->stored_fields, + innobase_get_stmt(current_thd, &size)); + + ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED); + } + + if (dict_index_is_clust(index)) { + templ->rec_field_no = templ->clust_rec_field_no; + } else { + templ->rec_field_no = dict_index_get_nth_col_pos(index, i); } - DBUG_RETURN(TRUE); -} + if (field->real_maybe_null()) { + templ->mysql_null_byte_offset = + field->null_offset(); -/*******************************************************************//** -This function builds a translation table in INNOBASE_SHARE -structure for fast index location with mysql array number from its -table->key_info structure. This also provides the necessary translation -between the key order in mysql key_info and Innodb ib_table->indexes if -they are not fully matched with each other. -Note we do not have any mutex protecting the translation table -building based on the assumption that there is no concurrent -index creation/drop and DMLs that requires index lookup. All table -handle will be closed before the index creation/drop. -@return TRUE if index translation table built successfully */ -static -ibool -innobase_build_index_translation( -/*=============================*/ - const TABLE* table, /*!< in: table in MySQL data - dictionary */ - dict_table_t* ib_table, /*!< in: table in Innodb data - dictionary */ - INNOBASE_SHARE* share) /*!< in/out: share structure - where index translation table - will be constructed in. 
*/ -{ - ulint mysql_num_index; - ulint ib_num_index; - dict_index_t** index_mapping; - ibool ret = TRUE; + templ->mysql_null_bit_mask = (ulint) field->null_bit; + } else { + templ->mysql_null_bit_mask = 0; + } - DBUG_ENTER("innobase_build_index_translation"); + templ->mysql_col_offset = (ulint) get_field_offset(table, field); - mutex_enter(&dict_sys->mutex); + templ->mysql_col_len = (ulint) field->pack_length(); + templ->type = col->mtype; + templ->mysql_type = (ulint) field->type(); - mysql_num_index = table->s->keys; - ib_num_index = UT_LIST_GET_LEN(ib_table->indexes); + if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) { + templ->mysql_length_bytes = (ulint) + (((Field_varstring*) field)->length_bytes); + } - index_mapping = share->idx_trans_tbl.index_mapping; + templ->charset = dtype_get_charset_coll(col->prtype); + templ->mbminlen = dict_col_get_mbminlen(col); + templ->mbmaxlen = dict_col_get_mbmaxlen(col); + templ->is_unsigned = col->prtype & DATA_UNSIGNED; - /* If there exists inconsistency between MySQL and InnoDB dictionary - (metadata) information, the number of index defined in MySQL - could exceed that in InnoDB, do not build index translation - table in such case */ - if (UNIV_UNLIKELY(ib_num_index < mysql_num_index)) { - ret = FALSE; - goto func_exit; + if (!dict_index_is_clust(index) + && templ->rec_field_no == ULINT_UNDEFINED) { + prebuilt->need_to_access_clustered = TRUE; } - /* If index entry count is non-zero, nothing has - changed since last update, directly return TRUE */ - if (share->idx_trans_tbl.index_count) { - /* Index entry count should still match mysql_num_index */ - ut_a(share->idx_trans_tbl.index_count == mysql_num_index); - goto func_exit; + if (prebuilt->mysql_prefix_len < templ->mysql_col_offset + + templ->mysql_col_len) { + prebuilt->mysql_prefix_len = templ->mysql_col_offset + + templ->mysql_col_len; } - /* The number of index increased, rebuild the mapping table */ - if (mysql_num_index > share->idx_trans_tbl.array_size) { - 
index_mapping = (dict_index_t**) my_realloc(index_mapping, - mysql_num_index * - sizeof(*index_mapping), - MYF(MY_ALLOW_ZERO_PTR)); + if (templ->type == DATA_BLOB) { + prebuilt->templ_contains_blob = TRUE; + } + + return(templ); +} + +/**************************************************************//** +Builds a 'template' to the prebuilt struct. The template is used in fast +retrieval of just those column values MySQL needs in its processing. */ +UNIV_INTERN +void +ha_innobase::build_template( +/*========================*/ + bool whole_row) /*!< in: true=ROW_MYSQL_WHOLE_ROW, + false=ROW_MYSQL_REC_FIELDS */ +{ + dict_index_t* index; + dict_index_t* clust_index; + ulint n_stored_fields; + ibool fetch_all_in_key = FALSE; + ibool fetch_primary_key_cols = FALSE; + ulint i, sql_idx; + + if (prebuilt->select_lock_type == LOCK_X) { + /* We always retrieve the whole clustered index record if we + use exclusive row level locks, for example, if the read is + done in an UPDATE statement. */ + + whole_row = true; + } else if (!whole_row) { + if (prebuilt->hint_need_to_fetch_extra_cols + == ROW_RETRIEVE_ALL_COLS) { + + /* We know we must at least fetch all columns in the + key, or all columns in the table */ + + if (prebuilt->read_just_key) { + /* MySQL has instructed us that it is enough + to fetch the columns in the key; looks like + MySQL can set this flag also when there is + only a prefix of the column in the key: in + that case we retrieve the whole column from + the clustered index */ + + fetch_all_in_key = TRUE; + } else { + whole_row = true; + } + } else if (prebuilt->hint_need_to_fetch_extra_cols + == ROW_RETRIEVE_PRIMARY_KEY) { + /* We must at least fetch all primary key cols. Note + that if the clustered index was internally generated + by InnoDB on the row id (no primary key was + defined), then row_search_for_mysql() will always + retrieve the row id to a special buffer in the + prebuilt struct. 
*/ - if (!index_mapping) { - /* Report an error if index_mapping continues to be - NULL and mysql_num_index is a non-zero value */ - sql_print_error("InnoDB: fail to allocate memory for " - "index translation table. Number of " - "Index:%lu, array size:%lu", - mysql_num_index, - share->idx_trans_tbl.array_size); - ret = FALSE; - goto func_exit; + fetch_primary_key_cols = TRUE; } - - share->idx_trans_tbl.array_size = mysql_num_index; } - /* For each index in the mysql key_info array, fetch its - corresponding InnoDB index pointer into index_mapping - array. */ - for (ulint count = 0; count < mysql_num_index; count++) { + clust_index = dict_table_get_first_index(prebuilt->table); - /* Fetch index pointers into index_mapping according to mysql - index sequence */ - index_mapping[count] = dict_table_get_index_on_name( - ib_table, table->key_info[count].name); + index = whole_row ? clust_index : prebuilt->index; - if (!index_mapping[count]) { - sql_print_error("Cannot find index %s in InnoDB " - "index dictionary.", - table->key_info[count].name); - ret = FALSE; - goto func_exit; - } + prebuilt->need_to_access_clustered = (index == clust_index); - /* Double check fetched index has the same - column info as those in mysql key_info. */ - if (!innobase_match_index_columns(&table->key_info[count], - index_mapping[count])) { - sql_print_error("Found index %s whose column info " - "does not match that of MySQL.", - table->key_info[count].name); - ret = FALSE; - goto func_exit; - } - } + /* Either prebuilt->index should be a secondary index, or it + should be the clustered index. */ + ut_ad(dict_index_is_clust(index) == (index == clust_index)); - /* Successfully built the translation table */ - share->idx_trans_tbl.index_count = mysql_num_index; + /* Below we check column by column if we need to access + the clustered index. */ -func_exit: - if (!ret) { - /* Build translation table failed. 
*/ - my_free(index_mapping); + n_stored_fields= (ulint)table->s->stored_fields; /* number of stored columns */ - share->idx_trans_tbl.array_size = 0; - share->idx_trans_tbl.index_count = 0; - index_mapping = NULL; + if (!prebuilt->mysql_template) { + prebuilt->mysql_template = (mysql_row_templ_t*) + mem_alloc(n_stored_fields * sizeof(mysql_row_templ_t)); } - share->idx_trans_tbl.index_mapping = index_mapping; + prebuilt->template_type = whole_row + ? ROW_MYSQL_WHOLE_ROW : ROW_MYSQL_REC_FIELDS; + prebuilt->null_bitmap_len = table->s->null_bytes; - mutex_exit(&dict_sys->mutex); + /* Prepare to build prebuilt->mysql_template[]. */ + prebuilt->templ_contains_blob = FALSE; + prebuilt->mysql_prefix_len = 0; + prebuilt->n_template = 0; + prebuilt->idx_cond_n_cols = 0; - DBUG_RETURN(ret); -} + /* Note that in InnoDB, i is the column number in the table. + MySQL calls columns 'fields'. */ -/*******************************************************************//** -This function uses index translation table to quickly locate the -requested index structure. -Note we do not have mutex protection for the index translatoin table -access, it is based on the assumption that there is no concurrent -translation table rebuild (fter create/drop index) and DMLs that -require index lookup. -@return dict_index_t structure for requested index. NULL if -fail to locate the index structure. */ -static -dict_index_t* -innobase_index_lookup( -/*==================*/ - INNOBASE_SHARE* share, /*!< in: share structure for index - translation table. */ - uint keynr) /*!< in: index number for the requested - index */ -{ - if (!share->idx_trans_tbl.index_mapping - || keynr >= share->idx_trans_tbl.index_count) { - return(NULL); - } + if (active_index != MAX_KEY && active_index == pushed_idx_cond_keyno) { + /* Push down an index condition or an end_range check. 
*/ + for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) { - return(share->idx_trans_tbl.index_mapping[keynr]); -} + while (!table->field[sql_idx]->stored_in_db) { + sql_idx++; + } -/************************************************************************ -Set the autoinc column max value. This should only be called once from -ha_innobase::open(). Therefore there's no need for a covering lock. */ -UNIV_INTERN -void -ha_innobase::innobase_initialize_autoinc() -/*======================================*/ -{ - ulonglong auto_inc; - const Field* field = table->found_next_number_field; + const ibool index_contains + = dict_index_contains_col_or_prefix(index, i); + + /* Test if an end_range or an index condition + refers to the field. Note that "index" and + "index_contains" may refer to the clustered index. + Index condition pushdown is relative to prebuilt->index + (the index that is being looked up first). */ + + /* When join_read_always_key() invokes this + code via handler::ha_index_init() and + ha_innobase::index_init(), end_range is not + yet initialized. Because of that, we must + always check for index_contains, instead of + the subset + field->part_of_key.is_set(active_index) + which would be acceptable if end_range==NULL. */ + if (build_template_needs_field_in_icp( + index, prebuilt, index_contains, i)) { + /* Needed in ICP */ + const Field* field; + mysql_row_templ_t* templ; + + if (whole_row) { + field = table->field[sql_idx]; + } else { + field = build_template_needs_field( + index_contains, + prebuilt->read_just_key, + fetch_all_in_key, + fetch_primary_key_cols, + index, table, i, sql_idx); + if (!field) { + continue; + } + } - if (field != NULL) { - auto_inc = innobase_get_int_col_max_value(field); - } else { - /* We have no idea what's been passed in to us as the - autoinc column. We set it to the 0, effectively disabling - updates to the table. 
*/ - auto_inc = 0; + templ = build_template_field( + prebuilt, clust_index, index, + table, field, i); + prebuilt->idx_cond_n_cols++; + ut_ad(prebuilt->idx_cond_n_cols + == prebuilt->n_template); + + if (index == prebuilt->index) { + templ->icp_rec_field_no + = templ->rec_field_no; + } else { + templ->icp_rec_field_no + = dict_index_get_nth_col_pos( + prebuilt->index, i); + } - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Unable to determine the AUTOINC " - "column name\n"); - } + if (dict_index_is_clust(prebuilt->index)) { + ut_ad(templ->icp_rec_field_no + != ULINT_UNDEFINED); + /* If the primary key includes + a column prefix, use it in + index condition pushdown, + because the condition is + evaluated before fetching any + off-page (externally stored) + columns. */ + if (templ->icp_rec_field_no + < prebuilt->index->n_uniq) { + /* This is a key column; + all set. */ + continue; + } + } else if (templ->icp_rec_field_no + != ULINT_UNDEFINED) { + continue; + } - if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { - /* If the recovery level is set so high that writes - are disabled we force the AUTOINC counter to 0 - value effectively disabling writes to the table. - Secondly, we avoid reading the table in case the read - results in failure due to a corrupted table/index. + /* This is a column prefix index. + The column prefix can be used in + an end_range comparison. */ + + templ->icp_rec_field_no + = dict_index_get_nth_col_or_prefix_pos( + prebuilt->index, i, TRUE); + ut_ad(templ->icp_rec_field_no + != ULINT_UNDEFINED); + + /* Index condition pushdown can be used on + all columns of a secondary index, and on + the PRIMARY KEY columns. On the clustered + index, it must never be used on other than + PRIMARY KEY columns, because those columns + may be stored off-page, and we will not + fetch externally stored columns before + checking the index condition. */ + /* TODO: test the above with an assertion + like this. 
Note that index conditions are + currently pushed down as part of the + "optimizer phase" while end_range is done + as part of the execution phase. Therefore, + we were unable to use an accurate condition + for end_range in the "if" condition above, + and the following assertion would fail. + ut_ad(!dict_index_is_clust(prebuilt->index) + || templ->rec_field_no + < prebuilt->index->n_uniq); + */ + } + } - We will not return an error to the client, so that the - tables can be dumped with minimal hassle. If an error - were returned in this case, the first attempt to read - the table would fail and subsequent SELECTs would succeed. */ - auto_inc = 0; - } else if (field == NULL) { - /* This is a far more serious error, best to avoid - opening the table and return failure. */ - my_error(ER_AUTOINC_READ_FAILED, MYF(0)); + ut_ad(prebuilt->idx_cond_n_cols > 0); + ut_ad(prebuilt->idx_cond_n_cols == prebuilt->n_template); + + /* Include the fields that are not needed in index condition + pushdown. 
*/ + for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) { + + while (!table->field[sql_idx]->stored_in_db) { + sql_idx++; + } + + const ibool index_contains + = dict_index_contains_col_or_prefix(index, i); + + if (!build_template_needs_field_in_icp( + index, prebuilt, index_contains, i)) { + /* Not needed in ICP */ + const Field* field; + + if (whole_row) { + field = table->field[sql_idx]; + } else { + field = build_template_needs_field( + index_contains, + prebuilt->read_just_key, + fetch_all_in_key, + fetch_primary_key_cols, + index, table, i, sql_idx); + if (!field) { + continue; + } + } + + build_template_field(prebuilt, + clust_index, index, + table, field, i); + } + } + + prebuilt->idx_cond = this; } else { - dict_index_t* index; - const char* col_name; - ulonglong read_auto_inc; - ulint err; + /* No index condition pushdown */ + prebuilt->idx_cond = NULL; - update_thd(ha_thd()); + for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) { + const Field* field; - ut_a(prebuilt->trx == thd_to_trx(user_thd)); + while (!table->field[sql_idx]->stored_in_db) { + sql_idx++; + } - col_name = field->field_name; - index = innobase_get_index(table->s->next_number_index); + if (whole_row) { + field = table->field[sql_idx]; + } else { + field = build_template_needs_field( + dict_index_contains_col_or_prefix( + index, i), + prebuilt->read_just_key, + fetch_all_in_key, + fetch_primary_key_cols, + index, table, i, sql_idx); + if (!field) { + continue; + } + } - /* Execute SELECT MAX(col_name) FROM TABLE; */ - err = row_search_max_autoinc(index, col_name, &read_auto_inc); + build_template_field(prebuilt, clust_index, index, + table, field, i); + } + } - switch (err) { - case DB_SUCCESS: { - ulonglong col_max_value; + if (index != clust_index && prebuilt->need_to_access_clustered) { + /* Change rec_field_no's to correspond to the clustered index + record */ + for (i = 0; i < prebuilt->n_template; i++) { - col_max_value = innobase_get_int_col_max_value(field); 
+ mysql_row_templ_t* templ + = &prebuilt->mysql_template[i]; - /* At the this stage we do not know the increment - nor the offset, so use a default increment of 1. */ + templ->rec_field_no = templ->clust_rec_field_no; + } + } +} - auto_inc = innobase_next_autoinc( - read_auto_inc, 1, 1, 0, col_max_value); +/********************************************************************//** +This special handling is really to overcome the limitations of MySQL's +binlogging. We need to eliminate the non-determinism that will arise in +INSERT ... SELECT type of statements, since MySQL binlog only stores the +min value of the autoinc interval. Once that is fixed we can get rid of +the special lock handling. +@return DB_SUCCESS if all OK else error code */ +UNIV_INTERN +dberr_t +ha_innobase::innobase_lock_autoinc(void) +/*====================================*/ +{ + DBUG_ENTER("ha_innobase::innobase_lock_autoinc"); + dberr_t error = DB_SUCCESS; - break; + ut_ad(!srv_read_only_mode); + + switch (innobase_autoinc_lock_mode) { + case AUTOINC_NO_LOCKING: + /* Acquire only the AUTOINC mutex. */ + dict_table_autoinc_lock(prebuilt->table); + break; + + case AUTOINC_NEW_STYLE_LOCKING: + /* For simple (single/multi) row INSERTs/REPLACEs and RBR + events, we fallback to the old style only if another + transaction has already acquired the AUTOINC lock on + behalf of a LOAD FILE or INSERT ... SELECT etc. type of + statement. */ + if (thd_sql_command(user_thd) == SQLCOM_INSERT + || thd_sql_command(user_thd) == SQLCOM_REPLACE + || thd_sql_command(user_thd) == SQLCOM_END // RBR event + ) { + dict_table_t* ib_table = prebuilt->table; + + /* Acquire the AUTOINC mutex. */ + dict_table_autoinc_lock(ib_table); + + /* We need to check that another transaction isn't + already holding the AUTOINC lock on the table. */ + if (ib_table->n_waiting_or_granted_auto_inc_locks) { + /* Release the mutex to avoid deadlocks and + fall back to old style locking. 
*/ + dict_table_autoinc_unlock(ib_table); + } else { + /* Do not fall back to old style locking. */ + break; + } } - case DB_RECORD_NOT_FOUND: - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: MySQL and InnoDB data " - "dictionaries are out of sync.\n" - "InnoDB: Unable to find the AUTOINC column " - "%s in the InnoDB table %s.\n" - "InnoDB: We set the next AUTOINC column " - "value to 0,\n" - "InnoDB: in effect disabling the AUTOINC " - "next value generation.\n" - "InnoDB: You can either set the next " - "AUTOINC value explicitly using ALTER TABLE\n" - "InnoDB: or fix the data dictionary by " - "recreating the table.\n", - col_name, index->table->name); + /* Use old style locking. */ + /* fall through */ + case AUTOINC_OLD_STYLE_LOCKING: + DBUG_EXECUTE_IF("die_if_autoinc_old_lock_style_used", + ut_ad(0);); + error = row_lock_table_autoinc_for_mysql(prebuilt); - /* This will disable the AUTOINC generation. */ - auto_inc = 0; + if (error == DB_SUCCESS) { - /* We want the open to succeed, so that the user can - take corrective action. ie. reads should succeed but - updates should fail. */ - err = DB_SUCCESS; - break; - default: - /* row_search_max_autoinc() should only return - one of DB_SUCCESS or DB_RECORD_NOT_FOUND. */ - ut_error; + /* Acquire the AUTOINC mutex. */ + dict_table_autoinc_lock(prebuilt->table); } + break; + + default: + ut_error; } - dict_table_autoinc_initialize(prebuilt->table, auto_inc); + DBUG_RETURN(error); } -/*****************************************************************//** -Creates and opens a handle to a table which already exists in an InnoDB -database. -@return 1 if error, 0 if success */ -UNIV_INTERN -int -ha_innobase::open( -/*==============*/ - const char* name, /*!< in: table name */ - int mode, /*!< in: not used */ - uint test_if_locked) /*!< in: not used */ +/********************************************************************//** +Reset the autoinc value in the table. 
+@return DB_SUCCESS if all went well else error code */ +UNIV_INTERN +dberr_t +ha_innobase::innobase_reset_autoinc( +/*================================*/ + ulonglong autoinc) /*!< in: value to store */ { - dict_table_t* ib_table; - char norm_name[1000]; - THD* thd; - char* is_part = NULL; - ibool par_case_name_set = FALSE; - char par_case_name[MAX_FULL_NAME_LEN + 1]; - dict_err_ignore_t ignore_err = DICT_ERR_IGNORE_NONE; + dberr_t error; - DBUG_ENTER("ha_innobase::open"); + error = innobase_lock_autoinc(); - UT_NOT_USED(mode); - UT_NOT_USED(test_if_locked); + if (error == DB_SUCCESS) { - thd = ha_thd(); + dict_table_autoinc_initialize(prebuilt->table, autoinc); - /* Under some cases MySQL seems to call this function while - holding btr_search_latch. This breaks the latching order as - we acquire dict_sys->mutex below and leads to a deadlock. */ - if (thd != NULL) { - innobase_release_temporary_latches(ht, thd); + dict_table_autoinc_unlock(prebuilt->table); } - normalize_table_name(norm_name, name); - - user_thd = NULL; + return(error); +} - if (!(share=get_share(name))) { +/********************************************************************//** +Store the autoinc value in the table. The autoinc value is only set if +it's greater than the existing autoinc value in the table. +@return DB_SUCCESS if all went well else error code */ +UNIV_INTERN +dberr_t +ha_innobase::innobase_set_max_autoinc( +/*==================================*/ + ulonglong auto_inc) /*!< in: value to store */ +{ + dberr_t error; - DBUG_RETURN(1); - } + error = innobase_lock_autoinc(); - /* Will be allocated if it is needed in ::update_row() */ - upd_buf = NULL; - upd_buf_size = 0; + if (error == DB_SUCCESS) { - /* We look for pattern #P# to see if the table is partitioned - MySQL table. 
*/ -#ifdef __WIN__ - is_part = strstr(norm_name, "#p#"); -#else - is_part = strstr(norm_name, "#P#"); -#endif /* __WIN__ */ + dict_table_autoinc_update_if_greater(prebuilt->table, auto_inc); - /* Check whether FOREIGN_KEY_CHECKS is set to 0. If so, the table - can be opened even if some FK indexes are missing. If not, the table - can't be opened in the same situation */ - if (thd_test_options(thd, OPTION_NO_FOREIGN_KEY_CHECKS)) { - ignore_err = DICT_ERR_IGNORE_FK_NOKEY; + dict_table_autoinc_unlock(prebuilt->table); } - /* Get pointer to a table object in InnoDB dictionary cache */ - ib_table = dict_table_get(norm_name, TRUE, ignore_err); + return(error); +} - if (NULL == ib_table) { - if (is_part) { - /* MySQL partition engine hard codes the file name - separator as "#P#". The text case is fixed even if - lower_case_table_names is set to 1 or 2. This is true - for sub-partition names as well. InnoDB always - normalises file names to lower case on Windows, this - can potentially cause problems when copying/moving - tables between platforms. +/********************************************************************//** +Stores a row in an InnoDB database, to the table specified in this +handle. +@return error code */ +UNIV_INTERN +int +ha_innobase::write_row( +/*===================*/ + uchar* record) /*!< in: a row in MySQL format */ +{ + dberr_t error; + int error_result= 0; + ibool auto_inc_used= FALSE; +#ifdef WITH_WSREP + ibool auto_inc_inserted= FALSE; /* if NULL was inserted */ +#endif + ulint sql_command; + trx_t* trx = thd_to_trx(user_thd); - 1) If boot against an installation from Windows - platform, then its partition table name could - be all be in lower case in system tables. So we - will need to check lower case name when load table. + DBUG_ENTER("ha_innobase::write_row"); - 2) If we boot an installation from other case - sensitive platform in Windows, we might need to - check the existence of table name without lowering - case them in the system table. 
*/ - if (innobase_get_lower_case_table_names() == 1) { + if (high_level_read_only) { + ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } else if (prebuilt->trx != trx) { + sql_print_error("The transaction object for the table handle " + "is at %p, but for the current thread it is at " + "%p", + (const void*) prebuilt->trx, (const void*) trx); - if (!par_case_name_set) { -#ifndef __WIN__ - /* Check for the table using lower - case name, including the partition - separator "P" */ - memcpy(par_case_name, norm_name, - strlen(norm_name)); - par_case_name[strlen(norm_name)] = 0; - innobase_casedn_str(par_case_name); -#else - /* On Windows platfrom, check - whether there exists table name in - system table whose name is - not being normalized to lower case */ - normalize_table_name_low( - par_case_name, name, FALSE); -#endif - par_case_name_set = TRUE; - } + fputs("InnoDB: Dump of 200 bytes around prebuilt: ", stderr); + ut_print_buf(stderr, ((const byte*) prebuilt) - 100, 200); + fputs("\n" + "InnoDB: Dump of 200 bytes around ha_data: ", + stderr); + ut_print_buf(stderr, ((const byte*) trx) - 100, 200); + putc('\n', stderr); + ut_error; + } else if (!trx_is_started(trx)) { + ++trx->will_lock; + } - ib_table = dict_table_get( - par_case_name, TRUE, ignore_err); - } - if (ib_table) { -#ifndef __WIN__ - sql_print_warning("Partition table %s opened " - "after converting to lower " - "case. The table may have " - "been moved from a case " - "in-sensitive file system. " - "Please recreate table in " - "the current file system\n", - norm_name); -#else - sql_print_warning("Partition table %s opened " - "after skipping the step to " - "lower case the table name. " - "The table may have been " - "moved from a case sensitive " - "file system. 
Please " - "recreate table in the " - "current file system\n", - norm_name); -#endif - goto table_opened; - } - } + ha_statistic_increment(&SSV::ha_write_count); - if (is_part) { - sql_print_error("Failed to open table %s.\n", - norm_name); - } + sql_command = thd_sql_command(user_thd); - sql_print_error("Cannot find or open table %s from\n" - "the internal data dictionary of InnoDB " - "though the .frm file for the\n" - "table exists. Maybe you have deleted and " - "recreated InnoDB data\n" - "files but have forgotten to delete the " - "corresponding .frm files\n" - "of InnoDB tables, or you have moved .frm " - "files to another database?\n" - "or, the table contains indexes that this " - "version of the engine\n" - "doesn't support.\n" - "See " REFMAN "innodb-troubleshooting.html\n" - "how you can resolve the problem.\n", - norm_name); - free_share(share); - my_errno = ENOENT; + if ((sql_command == SQLCOM_ALTER_TABLE + || sql_command == SQLCOM_OPTIMIZE + || sql_command == SQLCOM_CREATE_INDEX +#ifdef WITH_WSREP + || (wsrep_on(user_thd) && wsrep_load_data_splitting && + sql_command == SQLCOM_LOAD && + !thd_test_options( + user_thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) +#endif /* WITH_WSREP */ + || sql_command == SQLCOM_DROP_INDEX) + && num_write_row >= 10000) { +#ifdef WITH_WSREP + if (wsrep_on(user_thd) && sql_command == SQLCOM_LOAD) { + WSREP_DEBUG("forced trx split for LOAD: %s", + wsrep_thd_query(user_thd)); + } +#endif /* WITH_WSREP */ + /* ALTER TABLE is COMMITted at every 10000 copied rows. + The IX table lock for the original table has to be re-issued. + As this method will be called on a temporary table where the + contents of the original table is being copied to, it is + a bit tricky to determine the source table. The cursor + position in the source table need not be adjusted after the + intermediate COMMIT, since writes by other transactions are + being blocked by a MySQL table lock TL_WRITE_ALLOW_READ. 
*/ - DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); - } + dict_table_t* src_table; + enum lock_mode mode; -table_opened: + num_write_row = 0; - if (ib_table->ibd_file_missing && !thd_tablespace_op(thd)) { - sql_print_error("MySQL is trying to open a table handle but " - "the .ibd file for\ntable %s does not exist.\n" - "Have you deleted the .ibd file from the " - "database directory under\nthe MySQL datadir, " - "or have you used DISCARD TABLESPACE?\n" - "See " REFMAN "innodb-troubleshooting.html\n" - "how you can resolve the problem.\n", - norm_name); - free_share(share); - my_errno = ENOENT; + /* Commit the transaction. This will release the table + locks, so they have to be acquired again. */ - dict_table_decrement_handle_count(ib_table, FALSE); - DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); - } + /* Altering an InnoDB table */ + /* Get the source table. */ + src_table = lock_get_src_table( + prebuilt->trx, prebuilt->table, &mode); + if (!src_table) { +no_commit: + /* Unknown situation: do not commit */ + /* + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: ALTER TABLE is holding lock" + " on %lu tables!\n", + prebuilt->trx->mysql_n_tables_locked); + */ + ; + } else if (src_table == prebuilt->table) { +#ifdef WITH_WSREP + if (wsrep_on(user_thd) && + wsrep_load_data_splitting && + sql_command == SQLCOM_LOAD && + !thd_test_options(user_thd, + OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) + { + switch (wsrep_run_wsrep_commit(user_thd, wsrep_hton, 1)) + { + case WSREP_TRX_OK: + break; + case WSREP_TRX_SIZE_EXCEEDED: + case WSREP_TRX_CERT_FAIL: + case WSREP_TRX_ERROR: + DBUG_RETURN(1); + } - prebuilt = row_create_prebuilt(ib_table, table->s->reclength); + if (binlog_hton->commit(binlog_hton, user_thd, 1)) + DBUG_RETURN(1); + wsrep_post_commit(user_thd, TRUE); + } +#endif /* WITH_WSREP */ + /* Source table is not in InnoDB format: + no need to re-acquire locks on it. 
*/ - prebuilt->default_rec = table->s->default_values; - ut_ad(prebuilt->default_rec); + /* Altering to InnoDB format */ + innobase_commit(ht, user_thd, 1); + /* Note that this transaction is still active. */ + trx_register_for_2pc(prebuilt->trx); + /* We will need an IX lock on the destination table. */ + prebuilt->sql_stat_start = TRUE; + } else { +#ifdef WITH_WSREP + if (wsrep_on(user_thd) && + wsrep_load_data_splitting && + sql_command == SQLCOM_LOAD && + !thd_test_options(user_thd, + OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) + { + switch (wsrep_run_wsrep_commit(user_thd, wsrep_hton, 1)) + { + case WSREP_TRX_OK: + break; + case WSREP_TRX_SIZE_EXCEEDED: + case WSREP_TRX_CERT_FAIL: + case WSREP_TRX_ERROR: + DBUG_RETURN(1); + } - /* Looks like MySQL-3.23 sometimes has primary key number != 0 */ + if (binlog_hton->commit(binlog_hton, user_thd, 1)) + DBUG_RETURN(1); + wsrep_post_commit(user_thd, TRUE); + } +#endif /* WITH_WSREP */ + /* Ensure that there are no other table locks than + LOCK_IX and LOCK_AUTO_INC on the destination table. */ - primary_key = table->s->primary_key; - key_used_on_scan = primary_key; + if (!lock_is_table_exclusive(prebuilt->table, + prebuilt->trx)) { + goto no_commit; + } - if (!innobase_build_index_translation(table, ib_table, share)) { - sql_print_error("Build InnoDB index translation table for" - " Table %s failed", name); + /* Commit the transaction. This will release the table + locks, so they have to be acquired again. */ + innobase_commit(ht, user_thd, 1); + /* Note that this transaction is still active. */ + trx_register_for_2pc(prebuilt->trx); + /* Re-acquire the table lock on the source table. */ + row_lock_table_for_mysql(prebuilt, src_table, mode); + /* We will need an IX lock on the destination table. */ + prebuilt->sql_stat_start = TRUE; + } } - /* Allocate a buffer for a 'row reference'. A row reference is - a string of bytes of length ref_length which uniquely specifies - a row in our table. 
Note that MySQL may also compare two row - references for equality by doing a simple memcmp on the strings - of length ref_length! */ - - if (!row_table_got_default_clust_index(ib_table)) { - - prebuilt->clust_index_was_generated = FALSE; + num_write_row++; - if (UNIV_UNLIKELY(primary_key >= MAX_KEY)) { - sql_print_error("Table %s has a primary key in " - "InnoDB data dictionary, but not " - "in MySQL!", name); + /* This is the case where the table has an auto-increment column */ + if (table->next_number_field && record == table->record[0]) { - /* This mismatch could cause further problems - if not attended, bring this to the user's attention - by printing a warning in addition to log a message - in the errorlog */ - push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN, - ER_NO_SUCH_INDEX, - "InnoDB: Table %s has a " - "primary key in InnoDB data " - "dictionary, but not in " - "MySQL!", name); + /* Reset the error code before calling + innobase_get_auto_increment(). */ + prebuilt->autoinc_error = DB_SUCCESS; - /* If primary_key >= MAX_KEY, its (primary_key) - value could be out of bound if continue to index - into key_info[] array. Find InnoDB primary index, - and assign its key_length to ref_length. - In addition, since MySQL indexes are sorted starting - with primary index, unique index etc., initialize - ref_length to the first index key length in - case we fail to find InnoDB cluster index. +#ifdef WITH_WSREP + auto_inc_inserted= (table->next_number_field->val_int() == 0); +#endif - Please note, this will not resolve the primary - index mismatch problem, other side effects are - possible if users continue to use the table. - However, we allow this table to be opened so - that user can adopt necessary measures for the - mismatch while still being accessible to the table - date. */ - ref_length = table->key_info[0].key_length; + if ((error_result = update_auto_increment())) { + /* We don't want to mask autoinc overflow errors. 
*/ - /* Find correspoinding cluster index - key length in MySQL's key_info[] array */ - for (ulint i = 0; i < table->s->keys; i++) { - dict_index_t* index; - index = innobase_get_index(i); - if (dict_index_is_clust(index)) { - ref_length = - table->key_info[i].key_length; - } + /* Handle the case where the AUTOINC sub-system + failed during initialization. */ + if (prebuilt->autoinc_error == DB_UNSUPPORTED) { + error_result = ER_AUTOINC_READ_FAILED; + /* Set the error message to report too. */ + my_error(ER_AUTOINC_READ_FAILED, MYF(0)); + goto func_exit; + } else if (prebuilt->autoinc_error != DB_SUCCESS) { + error = prebuilt->autoinc_error; + goto report_error; } - } else { - /* MySQL allocates the buffer for ref. - key_info->key_length includes space for all key - columns + one byte for each column that may be - NULL. ref_length must be as exact as possible to - save space, because all row reference buffers are - allocated based on ref_length. */ - - ref_length = table->key_info[primary_key].key_length; - } - } else { - if (primary_key != MAX_KEY) { - sql_print_error( - "Table %s has no primary key in InnoDB data " - "dictionary, but has one in MySQL! If you " - "created the table with a MySQL version < " - "3.23.54 and did not define a primary key, " - "but defined a unique key with all non-NULL " - "columns, then MySQL internally treats that " - "key as the primary key. You can fix this " - "error by dump + DROP + CREATE + reimport " - "of the table.", name); - /* This mismatch could cause further problems - if not attended, bring this to the user attention - by printing a warning in addition to log a message - in the errorlog */ - push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN, - ER_NO_SUCH_INDEX, - "InnoDB: Table %s has no " - "primary key in InnoDB data " - "dictionary, but has one in " - "MySQL!", name); + /* MySQL errors are passed straight back. 
*/ + goto func_exit; } - prebuilt->clust_index_was_generated = TRUE; - - ref_length = DATA_ROW_ID_LEN; - - /* If we automatically created the clustered index, then - MySQL does not know about it, and MySQL must NOT be aware - of the index used on scan, to make it avoid checking if we - update the column of the index. That is why we assert below - that key_used_on_scan is the undefined value MAX_KEY. - The column is the row id in the automatical generation case, - and it will never be updated anyway. */ - - if (key_used_on_scan != MAX_KEY) { - sql_print_warning( - "Table %s key_used_on_scan is %lu even " - "though there is no primary key inside " - "InnoDB.", name, (ulong) key_used_on_scan); - } + auto_inc_used = TRUE; } - /* Index block size in InnoDB: used by MySQL in query optimization */ - stats.block_size = 16 * 1024; - - /* Init table lock structure */ - thr_lock_data_init(&share->lock,&lock,(void*) 0); + if (prebuilt->mysql_template == NULL + || prebuilt->template_type != ROW_MYSQL_WHOLE_ROW) { - if (prebuilt->table) { - /* We update the highest file format in the system table - space, if this table has higher file format setting. */ + /* Build the template used in converting quickly between + the two database formats */ - trx_sys_file_format_max_upgrade( - (const char**) &innobase_file_format_max, - dict_table_get_format(prebuilt->table)); + build_template(true); } - /* Only if the table has an AUTOINC column. */ - if (prebuilt->table != NULL && table->found_next_number_field != NULL) { - dict_table_autoinc_lock(prebuilt->table); - - /* Since a table can already be "open" in InnoDB's internal - data dictionary, we only init the autoinc counter once, the - first time the table is loaded. We can safely reuse the - autoinc value from a previous MySQL open. 
*/ - if (dict_table_autoinc_read(prebuilt->table) == 0) { - - innobase_initialize_autoinc(); - } + innobase_srv_conc_enter_innodb(prebuilt->trx); - dict_table_autoinc_unlock(prebuilt->table); - } + error = row_insert_for_mysql((byte*) record, prebuilt); + DEBUG_SYNC(user_thd, "ib_after_row_insert"); - info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST); + /* Handle duplicate key errors */ + if (auto_inc_used) { + ulonglong auto_inc; + ulonglong col_max_value; - DBUG_RETURN(0); -} + /* Note the number of rows processed for this statement, used + by get_auto_increment() to determine the number of AUTO-INC + values to reserve. This is only useful for a mult-value INSERT + and is a statement level counter.*/ + if (trx->n_autoinc_rows > 0) { + --trx->n_autoinc_rows; + } -UNIV_INTERN -handler* -ha_innobase::clone( -/*===============*/ - const char* name, /*!< in: table name */ - MEM_ROOT* mem_root) /*!< in: memory context */ -{ - ha_innobase* new_handler; + /* We need the upper limit of the col type to check for + whether we update the table autoinc counter or not. */ + col_max_value = innobase_get_int_col_max_value( + table->next_number_field); - DBUG_ENTER("ha_innobase::clone"); + /* Get the value that MySQL attempted to store in the table.*/ + auto_inc = table->next_number_field->val_uint(); - new_handler = static_cast<ha_innobase*>(handler::clone(name, - mem_root)); - if (new_handler) { - DBUG_ASSERT(new_handler->prebuilt != NULL); - DBUG_ASSERT(new_handler->user_thd == user_thd); - DBUG_ASSERT(new_handler->prebuilt->trx == prebuilt->trx); + switch (error) { + case DB_DUPLICATE_KEY: - new_handler->prebuilt->select_lock_type - = prebuilt->select_lock_type; - } + /* A REPLACE command and LOAD DATA INFILE REPLACE + handle a duplicate key error themselves, but we + must update the autoinc counter if we are performing + those statements. 
*/ - DBUG_RETURN(new_handler); -} + switch (sql_command) { + case SQLCOM_LOAD: + if (trx->duplicates) { -UNIV_INTERN -uint -ha_innobase::max_supported_key_part_length() const -{ - /* A table format specific index column length check will be performed - at ha_innobase::add_index() and row_create_index_for_mysql() */ - return(innobase_large_prefix - ? REC_VERSION_56_MAX_INDEX_COL_LEN - : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1); -} + goto set_max_autoinc; + } + break; -/******************************************************************//** -Closes a handle to an InnoDB table. -@return 0 */ -UNIV_INTERN -int -ha_innobase::close(void) -/*====================*/ -{ - THD* thd; + case SQLCOM_REPLACE: + case SQLCOM_INSERT_SELECT: + case SQLCOM_REPLACE_SELECT: + goto set_max_autoinc; - DBUG_ENTER("ha_innobase::close"); +#ifdef WITH_WSREP + /* workaround for LP bug #355000, retrying the insert */ + case SQLCOM_INSERT: - thd = ha_thd(); - if (thd != NULL) { - innobase_release_temporary_latches(ht, thd); - } + WSREP_DEBUG("DUPKEY error for autoinc\n" + "THD %ld, value %llu, off %llu inc %llu", + wsrep_thd_thread_id(current_thd), + auto_inc, + prebuilt->autoinc_offset, + prebuilt->autoinc_increment); - row_prebuilt_free(prebuilt, FALSE); + if (wsrep_on(current_thd) && + auto_inc_inserted && + wsrep_drupal_282555_workaround && + wsrep_thd_retry_counter(current_thd) == 0 && + !thd_test_options(current_thd, + OPTION_NOT_AUTOCOMMIT | + OPTION_BEGIN)) { + WSREP_DEBUG( + "retrying insert: %s", + (*wsrep_thd_query(current_thd)) ? 
+ wsrep_thd_query(current_thd) : + (char *)"void"); + error= DB_SUCCESS; + wsrep_thd_set_conflict_state( + current_thd, MUST_ABORT); + innobase_srv_conc_exit_innodb(prebuilt->trx); + /* jump straight to func exit over + * later wsrep hooks */ + goto func_exit; + } + break; +#endif /* WITH_WSREP */ - if (upd_buf != NULL) { - ut_ad(upd_buf_size != 0); - my_free(upd_buf); - upd_buf = NULL; - upd_buf_size = 0; - } + default: + break; + } - free_share(share); + break; - /* Tell InnoDB server that there might be work for - utility threads: */ + case DB_SUCCESS: + /* If the actual value inserted is greater than + the upper limit of the interval, then we try and + update the table upper limit. Note: last_value + will be 0 if get_auto_increment() was not called.*/ - srv_active_wake_master_thread(); + if (auto_inc >= prebuilt->autoinc_last_value) { +set_max_autoinc: + /* This should filter out the negative + values set explicitly by the user. */ + if (auto_inc <= col_max_value) { + ut_a(prebuilt->autoinc_increment > 0); - DBUG_RETURN(0); -} + ulonglong offset; + ulonglong increment; + dberr_t err; -/* The following accessor functions should really be inside MySQL code! */ + offset = prebuilt->autoinc_offset; + increment = prebuilt->autoinc_increment; -/**************************************************************//** -Gets field offset for a field in a table. -@return offset */ -static inline -uint -get_field_offset( -/*=============*/ - TABLE* table, /*!< in: MySQL table object */ - Field* field) /*!< in: MySQL field object */ -{ - return((uint) (field->ptr - table->record[0])); -} + auto_inc = innobase_next_autoinc( + auto_inc, + 1, increment, offset, + col_max_value); -/**************************************************************//** -Checks if a field in a record is SQL NULL. Uses the record format -information in table to track the null bit in record. 
-@return 1 if NULL, 0 otherwise */ -static inline -uint -field_in_record_is_null( -/*====================*/ - TABLE* table, /*!< in: MySQL table object */ - Field* field, /*!< in: MySQL field object */ - char* record) /*!< in: a row in MySQL format */ -{ - int null_offset; + err = innobase_set_max_autoinc( + auto_inc); + + if (err != DB_SUCCESS) { + error = err; + } + } + } + break; + default: + break; + } + } - if (!field->null_ptr) { + innobase_srv_conc_exit_innodb(prebuilt->trx); - return(0); +report_error: + if (error == DB_TABLESPACE_DELETED) { + ib_senderrf( + trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); } - null_offset = (uint) ((char*) field->null_ptr - - (char*) table->record[0]); + error_result = convert_error_code_to_mysql(error, + prebuilt->table->flags, + user_thd); - if (record[null_offset] & field->null_bit) { +#ifdef WITH_WSREP - if (!error_result && - wsrep_thd_exec_mode(user_thd) == LOCAL_STATE && - wsrep_on(user_thd) && - !wsrep_consistency_check(user_thd) && - !wsrep_thd_skip_append_keys(user_thd)) - { - if (wsrep_append_keys(user_thd, false, record, NULL)) - { ++ if (!error_result ++ && wsrep_on(user_thd) ++ && wsrep_thd_exec_mode(user_thd) == LOCAL_STATE ++ && !wsrep_consistency_check(user_thd) ++ && !wsrep_thd_skip_append_keys(user_thd)) { ++ if (wsrep_append_keys(user_thd, false, record, NULL)) { + DBUG_PRINT("wsrep", ("row key failed")); + error_result = HA_ERR_INTERNAL_ERROR; + goto wsrep_error; + } + } +wsrep_error: +#endif /* WITH_WSREP */ - return(1); + if (error_result == HA_FTS_INVALID_DOCID) { + my_error(HA_FTS_INVALID_DOCID, MYF(0)); } - return(0); +func_exit: + innobase_active_small(); + + DBUG_RETURN(error_result); } -/*************************************************************//** -InnoDB uses this function to compare two data fields for which the data type -is such that we must use MySQL code to compare them. 
NOTE that the prototype -of this function is in rem0cmp.c in InnoDB source code! If you change this -function, remember to update the prototype there! -@return 1, 0, -1, if a is greater, equal, less than b, respectively */ -extern "C" UNIV_INTERN -int -innobase_mysql_cmp( -/*===============*/ - int mysql_type, /*!< in: MySQL type */ - uint charset_number, /*!< in: number of the charset */ - const unsigned char* a, /*!< in: data field */ - unsigned int a_length, /*!< in: data field length, - not UNIV_SQL_NULL */ - const unsigned char* b, /*!< in: data field */ - unsigned int b_length) /*!< in: data field length, - not UNIV_SQL_NULL */ +/**********************************************************************//** +Checks which fields have changed in a row and stores information +of them to an update vector. +@return DB_SUCCESS or error code */ +static +dberr_t +calc_row_difference( +/*================*/ + upd_t* uvect, /*!< in/out: update vector */ + uchar* old_row, /*!< in: old row in MySQL format */ + uchar* new_row, /*!< in: new row in MySQL format */ + TABLE* table, /*!< in: table in MySQL data + dictionary */ + uchar* upd_buff, /*!< in: buffer to use */ + ulint buff_len, /*!< in: buffer length */ + row_prebuilt_t* prebuilt, /*!< in: InnoDB prebuilt struct */ + THD* thd) /*!< in: user thread */ { - CHARSET_INFO* charset; - enum_field_types mysql_tp; - int ret; + uchar* original_upd_buff = upd_buff; + Field* field; + enum_field_types field_mysql_type; + uint n_fields; + ulint o_len; + ulint n_len; + ulint col_pack_len; + const byte* new_mysql_row_col; + const byte* o_ptr; + const byte* n_ptr; + byte* buf; + upd_field_t* ufield; + ulint col_type; + ulint n_changed = 0; + dfield_t dfield; + dict_index_t* clust_index; + uint sql_idx, innodb_idx= 0; + ibool changes_fts_column = FALSE; + ibool changes_fts_doc_col = FALSE; + trx_t* trx = thd_to_trx(thd); + doc_id_t doc_id = FTS_NULL_DOC_ID; - DBUG_ASSERT(a_length != UNIV_SQL_NULL); - DBUG_ASSERT(b_length != 
UNIV_SQL_NULL); + ut_ad(!srv_read_only_mode); - mysql_tp = (enum_field_types) mysql_type; + n_fields = table->s->fields; + clust_index = dict_table_get_first_index(prebuilt->table); - switch (mysql_tp) { + /* We use upd_buff to convert changed fields */ + buf = (byte*) upd_buff; - case MYSQL_TYPE_BIT: - case MYSQL_TYPE_STRING: - case MYSQL_TYPE_VAR_STRING: - case MYSQL_TYPE_TINY_BLOB: - case MYSQL_TYPE_MEDIUM_BLOB: - case MYSQL_TYPE_BLOB: - case MYSQL_TYPE_LONG_BLOB: - case MYSQL_TYPE_VARCHAR: - /* Use the charset number to pick the right charset struct for - the comparison. Since the MySQL function get_charset may be - slow before Bar removes the mutex operation there, we first - look at 2 common charsets directly. */ + for (sql_idx = 0; sql_idx < n_fields; sql_idx++) { + field = table->field[sql_idx]; + if (!field->stored_in_db) + continue; - if (charset_number == default_charset_info->number) { - charset = default_charset_info; - } else if (charset_number == my_charset_latin1.number) { - charset = &my_charset_latin1; - } else { - charset = get_charset(charset_number, MYF(MY_WME)); + o_ptr = (const byte*) old_row + get_field_offset(table, field); + n_ptr = (const byte*) new_row + get_field_offset(table, field); - if (charset == NULL) { - sql_print_error("InnoDB needs charset %lu for doing " - "a comparison, but MySQL cannot " - "find that charset.", - (ulong) charset_number); - ut_a(0); - } - } + /* Use new_mysql_row_col and col_pack_len save the values */ - /* Starting from 4.1.3, we use strnncollsp() in comparisons of - non-latin1_swedish_ci strings. NOTE that the collation order - changes then: 'b\0\0...' is ordered BEFORE 'b ...'. Users - having indexes on such data need to rebuild their tables! 
*/ + new_mysql_row_col = n_ptr; + col_pack_len = field->pack_length(); - ret = charset->coll->strnncollsp(charset, - a, a_length, - b, b_length, 0); - if (ret < 0) { - return(-1); - } else if (ret > 0) { - return(1); - } else { - return(0); - } - default: - ut_error; - } + o_len = col_pack_len; + n_len = col_pack_len; - return(0); -} -#ifdef WITH_WSREP -extern "C" UNIV_INTERN -int -wsrep_innobase_mysql_sort( -/*===============*/ - /* out: str contains sort string */ - int mysql_type, /* in: MySQL type */ - uint charset_number, /* in: number of the charset */ - unsigned char* str, /* in: data field */ - unsigned int str_length, /* in: data field length, - not UNIV_SQL_NULL */ - unsigned int buf_length) /* in: total str buffer length */ + /* We use o_ptr and n_ptr to dig up the actual data for + comparison. */ -{ - CHARSET_INFO* charset; - enum_field_types mysql_tp; - int ret_length = str_length; + field_mysql_type = field->type(); - DBUG_ASSERT(str_length != UNIV_SQL_NULL); + col_type = prebuilt->table->cols[innodb_idx].mtype; - mysql_tp = (enum_field_types) mysql_type; + switch (col_type) { - switch (mysql_tp) { + case DATA_BLOB: + o_ptr = row_mysql_read_blob_ref(&o_len, o_ptr, o_len); + n_ptr = row_mysql_read_blob_ref(&n_len, n_ptr, n_len); - case MYSQL_TYPE_BIT: - case MYSQL_TYPE_STRING: - case MYSQL_TYPE_VAR_STRING: - case MYSQL_TYPE_TINY_BLOB: - case MYSQL_TYPE_MEDIUM_BLOB: - case MYSQL_TYPE_BLOB: - case MYSQL_TYPE_LONG_BLOB: - case MYSQL_TYPE_VARCHAR: - { - uchar tmp_str[REC_VERSION_56_MAX_INDEX_COL_LEN]; - uint tmp_length = REC_VERSION_56_MAX_INDEX_COL_LEN; + break; - /* Use the charset number to pick the right charset struct for - the comparison. Since the MySQL function get_charset may be - slow before Bar removes the mutex operation there, we first - look at 2 common charsets directly. 
*/ + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_VARMYSQL: + if (field_mysql_type == MYSQL_TYPE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR where + the real payload data length is stored in + 1 or 2 bytes */ - if (charset_number == default_charset_info->number) { - charset = default_charset_info; - } else if (charset_number == my_charset_latin1.number) { - charset = &my_charset_latin1; - } else { - charset = get_charset(charset_number, MYF(MY_WME)); + o_ptr = row_mysql_read_true_varchar( + &o_len, o_ptr, + (ulint) + (((Field_varstring*) field)->length_bytes)); + + n_ptr = row_mysql_read_true_varchar( + &n_len, n_ptr, + (ulint) + (((Field_varstring*) field)->length_bytes)); + } + + break; + default: + ; + } - if (charset == NULL) { - sql_print_error("InnoDB needs charset %lu for doing " - "a comparison, but MySQL cannot " - "find that charset.", - (ulong) charset_number); - ut_a(0); + if (field_mysql_type == MYSQL_TYPE_LONGLONG + && prebuilt->table->fts + && innobase_strcasecmp( + field->field_name, FTS_DOC_ID_COL_NAME) == 0) { + doc_id = (doc_id_t) mach_read_from_n_little_endian( + n_ptr, 8); + if (doc_id == 0) { + return(DB_FTS_INVALID_DOCID); } } @@@ -13725,277 -9359,313 +13727,278 @@@ ha_innobase::get_parent_foreign_key_lis trx_search_latch_release_if_reserved(prebuilt->trx); - ib_table = prebuilt->table; - - if (flag & HA_STATUS_TIME) { - if (called_from_analyze || innobase_stats_on_metadata) { - /* In sql_show we call with this flag: update - then statistics so that they are up-to-date */ - - prebuilt->trx->op_info = "updating table statistics"; + mutex_enter(&(dict_sys->mutex)); - DEBUG_SYNC_C("info_before_stats_update"); + for (dict_foreign_set::iterator it + = prebuilt->table->referenced_set.begin(); + it != prebuilt->table->referenced_set.end(); + ++it) { - dict_update_statistics( - ib_table, - FALSE, /* update even if initialized */ - FALSE /* update even if not changed too much */); + foreign = *it; - prebuilt->trx->op_info = "returning 
various info to MySQL"; + pf_key_info = get_foreign_key_info(thd, foreign); + if (pf_key_info) { + f_key_list->push_back(pf_key_info); } - } - if (flag & HA_STATUS_VARIABLE) { - - ulint page_size; - - dict_table_stats_lock(ib_table, RW_S_LATCH); + mutex_exit(&(dict_sys->mutex)); - n_rows = ib_table->stat_n_rows; + prebuilt->trx->op_info = ""; - /* Because we do not protect stat_n_rows by any mutex in a - delete, it is theoretically possible that the value can be - smaller than zero! TODO: fix this race. + return(0); +} - The MySQL optimizer seems to assume in a left join that n_rows - is an accurate estimate if it is zero. Of course, it is not, - since we do not have any locks on the rows yet at this phase. - Since SHOW TABLE STATUS seems to call this function with the - HA_STATUS_TIME flag set, while the left join optimizer does not - set that flag, we add one to a zero value if the flag is not - set. That way SHOW TABLE STATUS will show the best estimate, - while the optimizer never sees the table empty. */ +/*****************************************************************//** +Checks if ALTER TABLE may change the storage engine of the table. +Changing storage engines is not allowed for tables for which there +are foreign key constraints (parent or child tables). +@return TRUE if can switch engines */ +UNIV_INTERN +bool +ha_innobase::can_switch_engines(void) +/*=================================*/ +{ + bool can_switch; - if (n_rows < 0) { - n_rows = 0; - } + DBUG_ENTER("ha_innobase::can_switch_engines"); + update_thd(); - if (n_rows == 0 && !(flag & HA_STATUS_TIME)) { - n_rows++; - } + prebuilt->trx->op_info = + "determining if there are foreign key constraints"; + row_mysql_freeze_data_dictionary(prebuilt->trx); - /* Fix bug#40386: Not flushing query cache after truncate. - n_rows can not be 0 unless the table is empty, set to 1 - instead. The original problem of bug#29507 is actually - fixed in the server code. 
*/ - if (thd_sql_command(user_thd) == SQLCOM_TRUNCATE) { + can_switch = prebuilt->table->referenced_set.empty() + && prebuilt->table->foreign_set.empty(); - n_rows = 1; + row_mysql_unfreeze_data_dictionary(prebuilt->trx); + prebuilt->trx->op_info = ""; - /* We need to reset the prebuilt value too, otherwise - checks for values greater than the last value written - to the table will fail and the autoinc counter will - not be updated. This will force write_row() into - attempting an update of the table's AUTOINC counter. */ + DBUG_RETURN(can_switch); +} - prebuilt->autoinc_last_value = 0; - } +/*******************************************************************//** +Checks if a table is referenced by a foreign key. The MySQL manual states that +a REPLACE is either equivalent to an INSERT, or DELETE(s) + INSERT. Only a +delete is then allowed internally to resolve a duplicate key conflict in +REPLACE, not an update. +@return > 0 if referenced by a FOREIGN KEY */ +UNIV_INTERN +uint +ha_innobase::referenced_by_foreign_key(void) +/*========================================*/ +{ + if (dict_table_is_referenced_by_foreign_key(prebuilt->table)) { - page_size = dict_table_zip_size(ib_table); - if (page_size == 0) { - page_size = UNIV_PAGE_SIZE; - } + return(1); + } - stats.records = (ha_rows)n_rows; - stats.deleted = 0; - stats.data_file_length - = ((ulonglong) ib_table->stat_clustered_index_size) - * page_size; - stats.index_file_length = - ((ulonglong) ib_table->stat_sum_of_other_index_sizes) - * page_size; + return(0); +} - dict_table_stats_unlock(ib_table, RW_S_LATCH); +/*******************************************************************//** +Frees the foreign key create info for a table stored in InnoDB, if it is +non-NULL. 
*/ +UNIV_INTERN +void +ha_innobase::free_foreign_key_create_info( +/*======================================*/ + char* str) /*!< in, own: create info string to free */ +{ + if (str) { + my_free(str); + } +} - /* Since fsp_get_available_space_in_free_extents() is - acquiring latches inside InnoDB, we do not call it if we - are asked by MySQL to avoid locking. Another reason to - avoid the call is that it uses quite a lot of CPU. - See Bug#38185. */ - if (flag & HA_STATUS_NO_LOCK - || !(flag & HA_STATUS_VARIABLE_EXTRA)) { - /* We do not update delete_length if no - locking is requested so the "old" value can - remain. delete_length is initialized to 0 in - the ha_statistics' constructor. Also we only - need delete_length to be set when - HA_STATUS_VARIABLE_EXTRA is set */ - } else if (UNIV_UNLIKELY - (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE)) { - /* Avoid accessing the tablespace if - innodb_crash_recovery is set to a high value. */ - stats.delete_length = 0; - } else { - ullint avail_space; +/*******************************************************************//** +Tells something additional to the handler about how to do things. +@return 0 or error number */ +UNIV_INTERN +int +ha_innobase::extra( +/*===============*/ + enum ha_extra_function operation) + /*!< in: HA_EXTRA_FLUSH or some other flag */ +{ + check_trx_exists(ha_thd()); - avail_space = fsp_get_available_space_in_free_extents( - ib_table->space); + /* Warning: since it is not sure that MySQL calls external_lock + before calling this function, the trx field in prebuilt can be + obsolete! 
*/ - if (avail_space == ULLINT_UNDEFINED) { - THD* thd; + switch (operation) { + case HA_EXTRA_FLUSH: + if (prebuilt->blob_heap) { + row_mysql_prebuilt_free_blob_heap(prebuilt); + } + break; + case HA_EXTRA_RESET_STATE: + reset_template(); + thd_to_trx(ha_thd())->duplicates = 0; + break; + case HA_EXTRA_NO_KEYREAD: + prebuilt->read_just_key = 0; + break; + case HA_EXTRA_KEYREAD: + prebuilt->read_just_key = 1; + break; + case HA_EXTRA_KEYREAD_PRESERVE_FIELDS: + prebuilt->keep_other_fields_on_keyread = 1; + break; - thd = ha_thd(); + /* IMPORTANT: prebuilt->trx can be obsolete in + this method, because it is not sure that MySQL + calls external_lock before this method with the + parameters below. We must not invoke update_thd() + either, because the calling threads may change. + CAREFUL HERE, OR MEMORY CORRUPTION MAY OCCUR! */ + case HA_EXTRA_INSERT_WITH_UPDATE: + thd_to_trx(ha_thd())->duplicates |= TRX_DUP_IGNORE; + break; + case HA_EXTRA_NO_IGNORE_DUP_KEY: + thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_IGNORE; + break; + case HA_EXTRA_WRITE_CAN_REPLACE: + thd_to_trx(ha_thd())->duplicates |= TRX_DUP_REPLACE; + break; + case HA_EXTRA_WRITE_CANNOT_REPLACE: + thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_REPLACE; + break; + default:/* Do nothing */ + ; + } - push_warning_printf( - thd, - MYSQL_ERROR::WARN_LEVEL_WARN, - ER_CANT_GET_STAT, - "InnoDB: Trying to get the free " - "space for table %s but its " - "tablespace has been discarded or " - "the .ibd file is missing. 
Setting " - "the free space to zero.", - ib_table->name); + return(0); +} - stats.delete_length = 0; - } else { - stats.delete_length = avail_space * 1024; - } - } +/******************************************************************//** +*/ +UNIV_INTERN +int +ha_innobase::reset() +/*================*/ +{ + if (prebuilt->blob_heap) { + row_mysql_prebuilt_free_blob_heap(prebuilt); + } - stats.check_time = 0; - stats.mrr_length_per_rec= ref_length + 8; // 8 = max(sizeof(void *)); + reset_template(); + ds_mrr.dsmrr_close(); + /* TODO: This should really be reset in reset_template() but for now + it's safer to do it explicitly here. */ - if (stats.records == 0) { - stats.mean_rec_length = 0; - } else { - stats.mean_rec_length = (ulong) (stats.data_file_length / stats.records); - } - } + /* This is a statement level counter. */ + prebuilt->autoinc_last_value = 0; - if (flag & HA_STATUS_CONST) { - ulong i; - /* Verify the number of index in InnoDB and MySQL - matches up. If prebuilt->clust_index_was_generated - holds, InnoDB defines GEN_CLUST_INDEX internally */ - ulint num_innodb_index = UT_LIST_GET_LEN(ib_table->indexes) - - prebuilt->clust_index_was_generated; + return(0); +} - if (table->s->keys != num_innodb_index) { - sql_print_error("Table %s contains %lu " - "indexes inside InnoDB, which " - "is different from the number of " - "indexes %u defined in the MySQL ", - ib_table->name, num_innodb_index, - table->s->keys); - } +/******************************************************************//** +MySQL calls this function at the start of each SQL statement inside LOCK +TABLES. Inside LOCK TABLES the ::external_lock method does not work to +mark SQL statement borders. Note also a special case: if a temporary table +is created inside LOCK TABLES, MySQL has not called external_lock() at all +on that table. +MySQL-5.0 also calls this before each statement in an execution of a stored +procedure. 
To make the execution more deterministic for binlogging, MySQL-5.0 +locks all tables involved in a stored procedure with full explicit table +locks (thd_in_lock_tables(thd) holds in store_lock()) before executing the +procedure. +@return 0 or error code */ +UNIV_INTERN +int +ha_innobase::start_stmt( +/*====================*/ + THD* thd, /*!< in: handle to the user thread */ + thr_lock_type lock_type) +{ + trx_t* trx; + DBUG_ENTER("ha_innobase::start_stmt"); - dict_table_stats_lock(ib_table, RW_S_LATCH); + update_thd(thd); - for (i = 0; i < table->s->keys; i++) { - ulong j; - rec_per_key = 1; - /* We could get index quickly through internal - index mapping with the index translation table. - The identity of index (match up index name with - that of table->key_info[i]) is already verified in - innobase_get_index(). */ - index = innobase_get_index(i); + trx = prebuilt->trx; - if (index == NULL) { - sql_print_error("Table %s contains fewer " - "indexes inside InnoDB than " - "are defined in the MySQL " - ".frm file. Have you mixed up " - ".frm files from different " - "installations? See " - REFMAN - "innodb-troubleshooting.html\n", - ib_table->name); - break; - } + /* Here we release the search latch and the InnoDB thread FIFO ticket + if they were reserved. They should have been released already at the + end of the previous statement, but because inside LOCK TABLES the + lock count method does not work to mark the end of a SELECT statement, + that may not be the case. We MUST release the search latch before an + INSERT, for example. */ - for (j = 0; j < table->key_info[i].key_parts; j++) { + trx_search_latch_release_if_reserved(trx); - if (j + 1 > index->n_uniq) { - sql_print_error( -"Index %s of %s has %lu columns unique inside InnoDB, but MySQL is asking " -"statistics for %lu columns. Have you mixed up .frm files from different " -"installations? 
" -"See " REFMAN "innodb-troubleshooting.html\n", - index->name, - ib_table->name, - (unsigned long) - index->n_uniq, j + 1); - break; - } + innobase_srv_conc_force_exit_innodb(trx); - rec_per_key = innodb_rec_per_key( - index, j, stats.records); + /* Reset the AUTOINC statement level counter for multi-row INSERTs. */ + trx->n_autoinc_rows = 0; - /* Since MySQL seems to favor table scans - too much over index searches, we pretend - index selectivity is 2 times better than - our estimate: */ + prebuilt->sql_stat_start = TRUE; + prebuilt->hint_need_to_fetch_extra_cols = 0; + reset_template(); - rec_per_key = rec_per_key / 2; + if (dict_table_is_temporary(prebuilt->table) + && prebuilt->mysql_has_locked + && prebuilt->select_lock_type == LOCK_NONE) { + dberr_t error; - if (rec_per_key == 0) { - rec_per_key = 1; - } + switch (thd_sql_command(thd)) { + case SQLCOM_INSERT: + case SQLCOM_UPDATE: + case SQLCOM_DELETE: ++ case SQLCOM_REPLACE: + init_table_handle_for_HANDLER(); + prebuilt->select_lock_type = LOCK_X; + prebuilt->stored_select_lock_type = LOCK_X; + error = row_lock_table_for_mysql(prebuilt, NULL, 1); - table->key_info[i].rec_per_key[j]= - rec_per_key >= ~(ulong) 0 ? 
~(ulong) 0 : - (ulong) rec_per_key; + if (error != DB_SUCCESS) { + int st = convert_error_code_to_mysql( + error, 0, thd); + DBUG_RETURN(st); } - - KEY *key_info= table->key_info+i; - key_part_map ext_key_part_map= - key_info->ext_key_part_map; - - if (key_info->key_parts != key_info->ext_key_parts) { - - KEY *pk_key_info= key_info+ - table->s->primary_key; - uint k = key_info->key_parts; - ha_rows k_rec_per_key = rec_per_key; - uint pk_parts = pk_key_info->key_parts; - - index= innobase_get_index( - table->s->primary_key); - - n_rows= ib_table->stat_n_rows; - - for (j = 0; j < pk_parts; j++) { - - if (ext_key_part_map & 1<<j) { - - rec_per_key = - innodb_rec_per_key(index, - j, stats.records); - - if (rec_per_key == 0) { - rec_per_key = 1; - } - else if (rec_per_key > 1) { - rec_per_key = - (ha_rows) - (k_rec_per_key * - (double)rec_per_key / - n_rows); - } - - key_info->rec_per_key[k++]= - rec_per_key >= ~(ulong) 0 ? - ~(ulong) 0 : - (ulong) rec_per_key; - - } - } - } + break; } + } - dict_table_stats_unlock(ib_table, RW_S_LATCH); - - my_snprintf(path, sizeof(path), "%s/%s%s", - mysql_data_home, - table->s->normalized_path.str, - reg_ext); + if (!prebuilt->mysql_has_locked) { + /* This handle is for a temporary table created inside + this same LOCK TABLES; since MySQL does NOT call external_lock + in this case, we must use x-row locks inside InnoDB to be + prepared for an update of a row */ - unpack_filename(path,path); + prebuilt->select_lock_type = LOCK_X; - /* Note that we do not know the access time of the table, - nor the CHECK TABLE time, nor the UPDATE or INSERT time. */ + } else if (trx->isolation_level != TRX_ISO_SERIALIZABLE + && thd_sql_command(thd) == SQLCOM_SELECT + && lock_type == TL_READ) { - if (os_file_get_status(path,&stat_info)) { - stats.create_time = (ulong) stat_info.ctime; - } - } + /* For other than temporary tables, we obtain + no lock for consistent read (plain SELECT). 
*/ - if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { + prebuilt->select_lock_type = LOCK_NONE; + } else { + /* Not a consistent read: restore the + select_lock_type value. The value of + stored_select_lock_type was decided in: + 1) ::store_lock(), + 2) ::external_lock(), + 3) ::init_table_handle_for_HANDLER(), and + 4) ::transactional_table_lock(). */ - goto func_exit; + ut_a(prebuilt->stored_select_lock_type != LOCK_NONE_UNSET); + prebuilt->select_lock_type = prebuilt->stored_select_lock_type; } - if (flag & HA_STATUS_ERRKEY) { - const dict_index_t* err_index; - - ut_a(prebuilt->trx); - ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N); - - err_index = trx_get_error_info(prebuilt->trx); + *trx->detailed_error = 0; - if (err_index) { - errkey = innobase_get_mysql_key_number_for_index( - share, table, ib_table, err_index); - } else { - errkey = (unsigned int) prebuilt->trx->error_key_num; - } - } + innobase_register_trx(ht, thd, trx); - if ((flag & HA_STATUS_AUTO) && table->found_next_number_field) { - stats.auto_increment_value = innobase_peek_autoinc(); + if (!trx_is_started(trx)) { + ++trx->will_lock; } -func_exit: - prebuilt->trx->op_info = (char*)""; - DBUG_RETURN(0); } @@@ -17689,22 -12815,15 +17692,24 @@@ wsrep_innobase_kill_one_trx wsrep_thd_thread_id(thd), victim_trx->id); - WSREP_DEBUG("Aborting query: %s", - (thd && wsrep_thd_query(thd)) ? wsrep_thd_query(thd) : "void"); + WSREP_DEBUG("Aborting query: %s conf %d trx: %lu", + (thd && wsrep_thd_query(thd)) ? 
wsrep_thd_query(thd) : "void", + wsrep_thd_conflict_state(thd), + wsrep_thd_ws_handle(thd)->trx_id); wsrep_thd_LOCK(thd); + DBUG_EXECUTE_IF("sync.wsrep_after_BF_victim_lock", + { + const char act[]= + "now " + "wait_for signal.wsrep_after_BF_victim_lock"; + DBUG_ASSERT(!debug_sync_set_action(bf_thd, + STRING_WITH_LEN(act))); + };); + if (wsrep_thd_query_state(thd) == QUERY_EXITING) { - WSREP_DEBUG("kill trx EXITING for %llu", victim_trx->id); + WSREP_DEBUG("kill trx EXITING for %lu", victim_trx->id); wsrep_thd_UNLOCK(thd); DBUG_RETURN(0); } @@@ -17751,15 -12870,13 +17756,15 @@@ } else { rcode = wsrep->abort_pre_commit( wsrep, bf_seqno, - (wsrep_trx_id_t)victim_trx->id + (wsrep_trx_id_t)wsrep_thd_ws_handle(thd)->trx_id ); + switch (rcode) { case WSREP_WARNING: - WSREP_DEBUG("cancel commit warning: %llu", + WSREP_DEBUG("cancel commit warning: %lu", victim_trx->id); wsrep_thd_UNLOCK(thd); + wsrep_thd_awake(thd, signal); DBUG_RETURN(1); break; case WSREP_OK: @@@ -17871,40 -12988,34 +17876,42 @@@ break; } default: - WSREP_WARN("bad wsrep query state: %d", + WSREP_WARN("bad wsrep query state: %d", wsrep_thd_query_state(thd)); + wsrep_thd_UNLOCK(thd); break; } - wsrep_thd_UNLOCK(thd); - + DBUG_RETURN(0); } -static int -wsrep_abort_transaction(handlerton* hton, THD *bf_thd, THD *victim_thd, - my_bool signal) + +static +int +wsrep_abort_transaction( + handlerton* hton, + THD *bf_thd, + THD *victim_thd, + my_bool signal) { DBUG_ENTER("wsrep_innobase_abort_thd"); - trx_t* victim_trx = thd_to_trx(victim_thd); - trx_t* bf_trx = (bf_thd) ? thd_to_trx(bf_thd) : NULL; + + trx_t* victim_trx = thd_to_trx(victim_thd); + trx_t* bf_trx = (bf_thd) ? 
thd_to_trx(bf_thd) : NULL; - WSREP_DEBUG("abort transaction: BF: %s victim: %s", - wsrep_thd_query(bf_thd), - wsrep_thd_query(victim_thd)); + WSREP_DEBUG("abort transaction: BF: %s victim: %s victim conf: %d", + wsrep_thd_query(bf_thd), + wsrep_thd_query(victim_thd), + wsrep_thd_conflict_state(victim_thd)); - ut_ad(!mutex_own(&kernel_mutex)); - - if (victim_trx) - { - int rcode = wsrep_innobase_kill_one_trx( - bf_thd, bf_trx, victim_trx, signal, FALSE); + if (victim_trx) { + lock_mutex_enter(); + trx_mutex_enter(victim_trx); + victim_trx->abort_type = TRX_WSREP_ABORT; + int rcode = wsrep_innobase_kill_one_trx(bf_thd, bf_trx, + victim_trx, signal); + trx_mutex_exit(victim_trx); + lock_mutex_exit(); + victim_trx->abort_type = TRX_SERVER_ABORT; wsrep_srv_conc_cancel_wait(victim_trx); DBUG_RETURN(rcode); } else { diff --cc storage/innobase/os/os0file.cc index df096dcc6fd,00000000000..d4b8e82b0d8 mode 100644,000000..100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@@ -1,5785 -1,0 +1,5785 @@@ +/*********************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2009, Percona Inc. - Copyright (c) 2013, 2017, MariaDB Corporation. ++Copyright (c) 2013, 2018, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +***********************************************************************/ + +/**************************************************//** +@file os/os0file.cc +The interface to the operating system file i/o primitives + +Created 10/21/1995 Heikki Tuuri +*******************************************************/ + +#include "os0file.h" + +#ifdef UNIV_NONINL +#include "os0file.ic" +#endif + +#include "ut0mem.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "fil0fil.h" +#include "buf0buf.h" +#include "srv0mon.h" +#ifndef UNIV_HOTBACKUP +# include "os0sync.h" +# include "os0thread.h" +#else /* !UNIV_HOTBACKUP */ +# ifdef __WIN__ +/* Add includes for the _stat() call to compile on Windows */ +# include <sys/types.h> +# include <sys/stat.h> +# include <errno.h> +# endif /* __WIN__ */ +#endif /* !UNIV_HOTBACKUP */ + +#if defined(LINUX_NATIVE_AIO) +#include <libaio.h> +#endif + +/** Insert buffer segment id */ +static const ulint IO_IBUF_SEGMENT = 0; + +/** Log segment id */ +static const ulint IO_LOG_SEGMENT = 1; + +/* This specifies the file permissions InnoDB uses when it creates files in +Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to +my_umask */ + +#ifndef __WIN__ +/** Umask for creating files */ +UNIV_INTERN ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; +#else +/** Umask for creating files */ +UNIV_INTERN ulint os_innodb_umask = 0; +#endif /* __WIN__ */ + +#ifndef UNIV_HOTBACKUP +/* We use these mutexes to protect lseek + file i/o operation, if the +OS does not provide an atomic pread 
or pwrite, or similar */ +#define OS_FILE_N_SEEK_MUTEXES 16 +UNIV_INTERN os_ib_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES]; + +/* In simulated aio, merge at most this many consecutive i/os */ +#define OS_AIO_MERGE_N_CONSECUTIVE 64 + +#ifdef WITH_INNODB_DISALLOW_WRITES +#define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event) +#else +#define WAIT_ALLOW_WRITES() do { } while (0) +#endif /* WITH_INNODB_DISALLOW_WRITES */ + +/********************************************************************** + +InnoDB AIO Implementation: +========================= + +We support native AIO for windows and linux. For rest of the platforms +we simulate AIO by special io-threads servicing the IO-requests. + +Simulated AIO: +============== + +In platforms where we 'simulate' AIO following is a rough explanation +of the high level design. +There are four io-threads (for ibuf, log, read, write). +All synchronous IO requests are serviced by the calling thread using +os_file_write/os_file_read. The Asynchronous requests are queued up +in an array (there are four such arrays) by the calling thread. +Later these requests are picked up by the io-thread and are serviced +synchronously. + +Windows native AIO: +================== + +If srv_use_native_aio is not set then windows follow the same +code as simulated AIO. If the flag is set then native AIO interface +is used. On windows, one of the limitation is that if a file is opened +for AIO no synchronous IO can be done on it. Therefore we have an +extra fifth array to queue up synchronous IO requests. +There are innodb_file_io_threads helper threads. These threads work +on the four arrays mentioned above in Simulated AIO. No thread is +required for the sync array. +If a synchronous IO request is made, it is first queued in the sync +array. Then the calling thread itself waits on the request, thus +making the call synchronous. 
+If an AIO request is made the calling thread not only queues it in the +array but also submits the requests. The helper thread then collects +the completed IO request and calls completion routine on it. + +Linux native AIO: +================= + +If we have libaio installed on the system and innodb_use_native_aio +is set to TRUE we follow the code path of native AIO, otherwise we +do simulated AIO. +There are innodb_file_io_threads helper threads. These threads work +on the four arrays mentioned above in Simulated AIO. +If a synchronous IO request is made, it is handled by calling +os_file_write/os_file_read. +If an AIO request is made the calling thread not only queues it in the +array but also submits the requests. The helper thread then collects +the completed IO request and calls completion routine on it. + +**********************************************************************/ + +/** Flag: enable debug printout for asynchronous i/o */ +UNIV_INTERN ibool os_aio_print_debug = FALSE; + +#ifdef UNIV_PFS_IO +/* Keys to register InnoDB I/O with performance schema */ +UNIV_INTERN mysql_pfs_key_t innodb_file_data_key; +UNIV_INTERN mysql_pfs_key_t innodb_file_log_key; +UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key; +#endif /* UNIV_PFS_IO */ + +/** The asynchronous i/o array slot structure */ +struct os_aio_slot_t{ + ibool is_read; /*!< TRUE if a read operation */ + ulint pos; /*!< index of the slot in the aio + array */ + ibool reserved; /*!< TRUE if this slot is reserved */ + time_t reservation_time;/*!< time when reserved */ + ulint len; /*!< length of the block to read or + write */ + byte* buf; /*!< buffer used in i/o */ + ulint type; /*!< OS_FILE_READ or OS_FILE_WRITE */ + os_offset_t offset; /*!< file offset in bytes */ + pfs_os_file_t file; /*!< file where to read or write */ + const char* name; /*!< file name or path */ + ibool io_already_done;/*!< used only in simulated aio: + TRUE if the physical i/o already + made and only the slot message + needs to be 
passed to the caller + of os_aio_simulated_handle */ + fil_node_t* message1; /*!< message which is given by the */ + void* message2; /*!< the requester of an aio operation + and which can be used to identify + which pending aio operation was + completed */ +#ifdef WIN_ASYNC_IO + HANDLE handle; /*!< handle object we need in the + OVERLAPPED struct */ + OVERLAPPED control; /*!< Windows control block for the + aio request */ +#elif defined(LINUX_NATIVE_AIO) + struct iocb control; /* Linux control block for aio */ + int n_bytes; /* bytes written/read. */ + int ret; /* AIO return code */ +#endif /* WIN_ASYNC_IO */ +}; + +/** The asynchronous i/o array structure */ +struct os_aio_array_t{ + os_ib_mutex_t mutex; /*!< the mutex protecting the aio array */ + os_event_t not_full; + /*!< The event which is set to the + signaled state when there is space in + the aio outside the ibuf segment; + os_event_set() and os_event_reset() + are protected by os_aio_array_t::mutex */ + os_event_t is_empty; + /*!< The event which is set to the + signaled state when there are no + pending i/os in this array; + os_event_set() and os_event_reset() + are protected by os_aio_array_t::mutex */ + ulint n_slots;/*!< Total number of slots in the aio + array. This must be divisible by + n_threads. */ + ulint n_segments; + /*!< Number of segments in the aio + array of pending aio requests. A + thread can wait separately for any one + of the segments. */ + ulint cur_seg;/*!< We reserve IO requests in round + robin fashion to different segments. + This points to the segment that is to + be used to service next IO request. */ + ulint n_reserved; + /*!< Number of reserved slots in the + aio array outside the ibuf segment */ + os_aio_slot_t* slots; /*!< Pointer to the slots in the array */ +#ifdef __WIN__ + HANDLE* handles; + /*!< Pointer to an array of OS native + event handles where we copied the + handles from slots, in the same + order. 
This can be used in + WaitForMultipleObjects; used only in + Windows */ +#endif /* __WIN__ */ + +#if defined(LINUX_NATIVE_AIO) + io_context_t* aio_ctx; + /* completion queue for IO. There is + one such queue per segment. Each thread + will work on one ctx exclusively. */ + struct io_event* aio_events; + /* The array to collect completed IOs. + There is one such event for each + possible pending IO. The size of the + array is equal to n_slots. */ +#endif /* LINUX_NATIV_AIO */ +}; + +#if defined(LINUX_NATIVE_AIO) +/** timeout for each io_getevents() call = 500ms. */ +#define OS_AIO_REAP_TIMEOUT (500000000UL) + +/** time to sleep, in microseconds if io_setup() returns EAGAIN. */ +#define OS_AIO_IO_SETUP_RETRY_SLEEP (500000UL) + +/** number of attempts before giving up on io_setup(). */ +#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS 5 +#endif + +/** Array of events used in simulated aio. */ +static os_event_t* os_aio_segment_wait_events; + +/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These +are NULL when the module has not yet been initialized. @{ */ +static os_aio_array_t* os_aio_read_array = NULL; /*!< Reads */ +static os_aio_array_t* os_aio_write_array = NULL; /*!< Writes */ +static os_aio_array_t* os_aio_ibuf_array = NULL; /*!< Insert buffer */ +static os_aio_array_t* os_aio_log_array = NULL; /*!< Redo log */ +static os_aio_array_t* os_aio_sync_array = NULL; /*!< Synchronous I/O */ +/* @} */ + +/** Number of asynchronous I/O segments. Set by os_aio_init(). 
*/ +static ulint os_aio_n_segments = ULINT_UNDEFINED; + +/** If the following is TRUE, read i/o handler threads try to +wait until a batch of new read requests have been posted */ +static ibool os_aio_recommend_sleep_for_read_threads = FALSE; +#endif /* !UNIV_HOTBACKUP */ + +UNIV_INTERN ulint os_n_file_reads = 0; +UNIV_INTERN ulint os_bytes_read_since_printout = 0; +UNIV_INTERN ulint os_n_file_writes = 0; +UNIV_INTERN ulint os_n_fsyncs = 0; +UNIV_INTERN ulint os_n_file_reads_old = 0; +UNIV_INTERN ulint os_n_file_writes_old = 0; +UNIV_INTERN ulint os_n_fsyncs_old = 0; +UNIV_INTERN time_t os_last_printout; + +UNIV_INTERN ibool os_has_said_disk_full = FALSE; + +#ifdef UNIV_DEBUG +# ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Validates the consistency the aio system some of the time. +@return TRUE if ok or the check was skipped */ +UNIV_INTERN +ibool +os_aio_validate_skip(void) +/*======================*/ +{ +/** Try os_aio_validate() every this many times */ +# define OS_AIO_VALIDATE_SKIP 13 + + /** The os_aio_validate() call skip counter. + Use a signed type because of the race condition below. */ + static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP; + + /* There is a race condition below, but it does not matter, + because this call is only for heuristic purposes. We want to + reduce the call frequency of the costly os_aio_validate() + check in debug builds. */ + if (--os_aio_validate_count > 0) { + return(TRUE); + } + + os_aio_validate_count = OS_AIO_VALIDATE_SKIP; + return(os_aio_validate()); +} +# endif /* !UNIV_HOTBACKUP */ +#endif /* UNIV_DEBUG */ + +#ifdef __WIN__ +/***********************************************************************//** +Gets the operating system version. Currently works only on Windows. +@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA, +OS_WIN7. 
*/ +UNIV_INTERN +ulint +os_get_os_version(void) +/*===================*/ +{ + OSVERSIONINFO os_info; + + os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); + + ut_a(GetVersionEx(&os_info)); + + if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) { + return(OS_WIN31); + } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) { + return(OS_WIN95); + } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) { + switch (os_info.dwMajorVersion) { + case 3: + case 4: + return(OS_WINNT); + case 5: + return (os_info.dwMinorVersion == 0) + ? OS_WIN2000 : OS_WINXP; + case 6: + return (os_info.dwMinorVersion == 0) + ? OS_WINVISTA : OS_WIN7; + default: + return(OS_WIN7); + } + } else { + ut_error; + return(0); + } +} +#endif /* __WIN__ */ + +/***********************************************************************//** +Retrieves the last error number if an error occurs in a file io function. +The number should be retrieved before any other OS calls (because they may +overwrite the error number). If the number is not known to this program, +the OS error number + 100 is returned. 
+@return error number, or OS error number + 100 */ +static +ulint +os_file_get_last_error_low( +/*=======================*/ + bool report_all_errors, /*!< in: TRUE if we want an error + message printed of all errors */ + bool on_error_silent) /*!< in: TRUE then don't print any + diagnostic to the log */ +{ +#ifdef __WIN__ + + ulint err = (ulint) GetLastError(); + if (err == ERROR_SUCCESS) { + return(0); + } + + if (report_all_errors + || (!on_error_silent + && err != ERROR_DISK_FULL + && err != ERROR_FILE_EXISTS)) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Operating system error number %lu" + " in a file operation.\n", (ulong) err); + + if (err == ERROR_PATH_NOT_FOUND) { + fprintf(stderr, + "InnoDB: The error means the system" + " cannot find the path specified.\n"); + + if (srv_is_being_started) { + fprintf(stderr, + "InnoDB: If you are installing InnoDB," + " remember that you must create\n" + "InnoDB: directories yourself, InnoDB" + " does not create them.\n"); + } + } else if (err == ERROR_ACCESS_DENIED) { + fprintf(stderr, + "InnoDB: The error means mysqld does not have" + " the access rights to\n" + "InnoDB: the directory. It may also be" + " you have created a subdirectory\n" + "InnoDB: of the same name as a data file.\n"); + } else if (err == ERROR_SHARING_VIOLATION + || err == ERROR_LOCK_VIOLATION) { + fprintf(stderr, + "InnoDB: The error means that another program" + " is using InnoDB's files.\n" + "InnoDB: This might be a backup or antivirus" + " software or another instance\n" + "InnoDB: of MySQL." 
+ " Please close it to get rid of this error.\n"); + } else if (err == ERROR_WORKING_SET_QUOTA + || err == ERROR_NO_SYSTEM_RESOURCES) { + fprintf(stderr, + "InnoDB: The error means that there are no" + " sufficient system resources or quota to" + " complete the operation.\n"); + } else if (err == ERROR_OPERATION_ABORTED) { + fprintf(stderr, + "InnoDB: The error means that the I/O" + " operation has been aborted\n" + "InnoDB: because of either a thread exit" + " or an application request.\n" + "InnoDB: Retry attempt is made.\n"); + } else { + fprintf(stderr, + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + REFMAN + "operating-system-error-codes.html\n"); + } + } + + fflush(stderr); + + if (err == ERROR_FILE_NOT_FOUND) { + return(OS_FILE_NOT_FOUND); + } else if (err == ERROR_DISK_FULL) { + return(OS_FILE_DISK_FULL); + } else if (err == ERROR_FILE_EXISTS) { + return(OS_FILE_ALREADY_EXISTS); + } else if (err == ERROR_SHARING_VIOLATION + || err == ERROR_LOCK_VIOLATION) { + return(OS_FILE_SHARING_VIOLATION); + } else if (err == ERROR_WORKING_SET_QUOTA + || err == ERROR_NO_SYSTEM_RESOURCES) { + return(OS_FILE_INSUFFICIENT_RESOURCE); + } else if (err == ERROR_OPERATION_ABORTED) { + return(OS_FILE_OPERATION_ABORTED); + } else if (err == ERROR_ACCESS_DENIED) { + return(OS_FILE_ACCESS_VIOLATION); + } else if (err == ERROR_BUFFER_OVERFLOW) { + return(OS_FILE_NAME_TOO_LONG); + } else { + return(OS_FILE_ERROR_MAX + err); + } +#else + int err = errno; + if (err == 0) { + return(0); + } + + if (report_all_errors + || (err != ENOSPC && err != EEXIST && !on_error_silent)) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Operating system error number %d" + " in a file operation.\n", err); + + if (err == ENOENT) { + fprintf(stderr, + "InnoDB: The error means the system" + " cannot find the path specified.\n"); + + if (srv_is_being_started) { + fprintf(stderr, + "InnoDB: If you are installing InnoDB," + " remember that you must 
create\n" + "InnoDB: directories yourself, InnoDB" + " does not create them.\n"); + } + } else if (err == EACCES) { + fprintf(stderr, + "InnoDB: The error means mysqld does not have" + " the access rights to\n" + "InnoDB: the directory.\n"); + } else { + if (strerror(err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d" + " means '%s'.\n", + err, strerror(err)); + } + + + fprintf(stderr, + "InnoDB: Some operating system" + " error numbers are described at\n" + "InnoDB: " + REFMAN + "operating-system-error-codes.html\n"); + } + } + + fflush(stderr); + + switch (err) { + case ENOSPC: + return(OS_FILE_DISK_FULL); + case ENOENT: + return(OS_FILE_NOT_FOUND); + case EEXIST: + return(OS_FILE_ALREADY_EXISTS); + case ENAMETOOLONG: + return(OS_FILE_NAME_TOO_LONG); + case EXDEV: + case ENOTDIR: + case EISDIR: + return(OS_FILE_PATH_ERROR); + case EAGAIN: + if (srv_use_native_aio) { + return(OS_FILE_AIO_RESOURCES_RESERVED); + } + break; + case EINTR: + if (srv_use_native_aio) { + return(OS_FILE_AIO_INTERRUPTED); + } + break; + case EACCES: + return(OS_FILE_ACCESS_VIOLATION); + } + return(OS_FILE_ERROR_MAX + err); +#endif +} + +/***********************************************************************//** +Retrieves the last error number if an error occurs in a file io function. +The number should be retrieved before any other OS calls (because they may +overwrite the error number). If the number is not known to this program, +the OS error number + 100 is returned. +@return error number, or OS error number + 100 */ +UNIV_INTERN +ulint +os_file_get_last_error( +/*===================*/ + bool report_all_errors) /*!< in: TRUE if we want an error + message printed of all errors */ +{ + return(os_file_get_last_error_low(report_all_errors, false)); +} + +/****************************************************************//** +Does error handling when a file operation fails. 
+Conditionally exits (calling exit(3)) based on should_exit value and the +error type, if should_exit is TRUE then on_error_silent is ignored. +@return TRUE if we should retry the operation */ +static +ibool +os_file_handle_error_cond_exit( +/*===========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool should_exit, /*!< in: call exit(3) if unknown error + and this parameter is TRUE */ + ibool on_error_silent)/*!< in: if TRUE then don't print + any message to the log iff it is + an unknown non-fatal error */ +{ + ulint err; + + err = os_file_get_last_error_low(false, on_error_silent); + + switch (err) { + case OS_FILE_DISK_FULL: + /* We only print a warning about disk full once */ + + if (os_has_said_disk_full) { + + return(FALSE); + } + + /* Disk full error is reported irrespective of the + on_error_silent setting. */ + + if (name) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Encountered a problem with" + " file %s\n", name); + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Disk is full. Try to clean the disk" + " to free space.\n"); + + os_has_said_disk_full = TRUE; + + fflush(stderr); + ut_error; + return(FALSE); + + case OS_FILE_AIO_RESOURCES_RESERVED: + case OS_FILE_AIO_INTERRUPTED: + + return(TRUE); + + case OS_FILE_PATH_ERROR: + case OS_FILE_ALREADY_EXISTS: + case OS_FILE_ACCESS_VIOLATION: + + return(FALSE); + + case OS_FILE_SHARING_VIOLATION: + + os_thread_sleep(10000000); /* 10 sec */ + return(TRUE); + + case OS_FILE_OPERATION_ABORTED: + case OS_FILE_INSUFFICIENT_RESOURCE: + + os_thread_sleep(100000); /* 100 ms */ + return(TRUE); + + default: + + /* If it is an operation that can crash on error then it + is better to ignore on_error_silent and print an error message + to the log. */ + + if (should_exit || !on_error_silent) { + ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS " + "error " ULINTPF ".%s", name ? 
name : "(unknown)", + operation, err, should_exit + ? " Cannot continue operation" : ""); + } + + if (should_exit) { + exit(1); + } + } + + return(FALSE); +} + +/****************************************************************//** +Does error handling when a file operation fails. +@return TRUE if we should retry the operation */ +static +ibool +os_file_handle_error( +/*=================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation) /*!< in: operation */ +{ + /* exit in case of unknown error */ + return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE)); +} + +/****************************************************************//** +Does error handling when a file operation fails. +@return TRUE if we should retry the operation */ +ibool +os_file_handle_error_no_exit( +/*=========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool on_error_silent)/*!< in: if TRUE then don't print + any message to the log. */ +{ + /* don't exit in case of unknown error */ + return(os_file_handle_error_cond_exit( + name, operation, FALSE, on_error_silent)); +} + +#undef USE_FILE_LOCK +#define USE_FILE_LOCK +#if defined(UNIV_HOTBACKUP) || defined(__WIN__) +/* InnoDB Hot Backup does not lock the data files. + * On Windows, mandatory locking is used. + */ +# undef USE_FILE_LOCK +#endif +#ifdef USE_FILE_LOCK +/****************************************************************//** +Obtain an exclusive lock on a file. 
+@return 0 on success */ +static +int +os_file_lock( +/*=========*/ + int fd, /*!< in: file descriptor */ + const char* name) /*!< in: file name */ +{ + struct flock lk; + + ut_ad(!srv_read_only_mode); + + lk.l_type = F_WRLCK; + lk.l_whence = SEEK_SET; + lk.l_start = lk.l_len = 0; + + if (fcntl(fd, F_SETLK, &lk) == -1) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to lock %s, error: %d", name, errno); + + if (errno == EAGAIN || errno == EACCES) { + ib_logf(IB_LOG_LEVEL_INFO, + "Check that you do not already have " + "another mysqld process using the " + "same InnoDB data or log files."); + } + + return(-1); + } + + return(0); +} +#endif /* USE_FILE_LOCK */ + +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Creates the seek mutexes used in positioned reads and writes. */ +UNIV_INTERN +void +os_io_init_simple(void) +/*===================*/ +{ + for (ulint i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) { + os_file_seek_mutexes[i] = os_mutex_create(); + } +} + +/** Create a temporary file. This function is like tmpfile(3), but +the temporary file is created in the given parameter path. If the path +is null then it will create the file in the mysql server configuration +parameter (--tmpdir). 
+@param[in] path location for creating temporary file +@return temporary file handle, or NULL on error */ +UNIV_INTERN +FILE* +os_file_create_tmpfile( + const char* path) +{ + FILE* file = NULL; + WAIT_ALLOW_WRITES(); + int fd = innobase_mysql_tmpfile(path); + + ut_ad(!srv_read_only_mode); + + if (fd >= 0) { + file = fdopen(fd, "w+b"); + } + + if (!file) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: unable to create temporary file;" + " errno: %d\n", errno); + if (fd >= 0) { + close(fd); + } + } + + return(file); +} +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************************//** +The os_file_opendir() function opens a directory stream corresponding to the +directory named by the dirname argument. The directory stream is positioned +at the first entry. In both Unix and Windows we automatically skip the '.' +and '..' items at the start of the directory listing. +@return directory stream, NULL if error */ +UNIV_INTERN +os_file_dir_t +os_file_opendir( +/*============*/ + const char* dirname, /*!< in: directory name; it must not + contain a trailing '\' or '/' */ + ibool error_is_fatal) /*!< in: TRUE if we should treat an + error as a fatal error; if we try to + open symlinks then we do not wish a + fatal error if it happens not to be + a directory */ +{ + os_file_dir_t dir; +#ifdef __WIN__ + LPWIN32_FIND_DATA lpFindFileData; + char path[OS_FILE_MAX_PATH + 3]; + + ut_a(strlen(dirname) < OS_FILE_MAX_PATH); + + strcpy(path, dirname); + strcpy(path + strlen(path), "\\*"); + + /* Note that in Windows opening the 'directory stream' also retrieves + the first entry in the directory. Since it is '.', that is no problem, + as we will skip over the '.' and '..' entries anyway. 
*/ + + lpFindFileData = static_cast<LPWIN32_FIND_DATA>( + ut_malloc(sizeof(WIN32_FIND_DATA))); + + dir = FindFirstFile((LPCTSTR) path, lpFindFileData); + + ut_free(lpFindFileData); + + if (dir == INVALID_HANDLE_VALUE) { + + if (error_is_fatal) { + os_file_handle_error(dirname, "opendir"); + } + + return(NULL); + } + + return(dir); +#else + dir = opendir(dirname); + + if (dir == NULL && error_is_fatal) { + os_file_handle_error(dirname, "opendir"); + } + + return(dir); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Closes a directory stream. +@return 0 if success, -1 if failure */ +UNIV_INTERN +int +os_file_closedir( +/*=============*/ + os_file_dir_t dir) /*!< in: directory stream */ +{ +#ifdef __WIN__ + BOOL ret; + + ret = FindClose(dir); + + if (!ret) { + os_file_handle_error_no_exit(NULL, "closedir", FALSE); + + return(-1); + } + + return(0); +#else + int ret; + + ret = closedir(dir); + + if (ret) { + os_file_handle_error_no_exit(NULL, "closedir", FALSE); + } + + return(ret); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +This function returns information of the next file in the directory. We jump +over the '.' and '..' entries in the directory. 
+@return 0 if ok, -1 if error, 1 if at the end of the directory */ +UNIV_INTERN +int +os_file_readdir_next_file( +/*======================*/ + const char* dirname,/*!< in: directory name or path */ + os_file_dir_t dir, /*!< in: directory stream */ + os_file_stat_t* info) /*!< in/out: buffer where the info is returned */ +{ +#ifdef __WIN__ + LPWIN32_FIND_DATA lpFindFileData; + BOOL ret; + + lpFindFileData = static_cast<LPWIN32_FIND_DATA>( + ut_malloc(sizeof(WIN32_FIND_DATA))); +next_file: + ret = FindNextFile(dir, lpFindFileData); + + if (ret) { + ut_a(strlen((char*) lpFindFileData->cFileName) + < OS_FILE_MAX_PATH); + + if (strcmp((char*) lpFindFileData->cFileName, ".") == 0 + || strcmp((char*) lpFindFileData->cFileName, "..") == 0) { + + goto next_file; + } + + strcpy(info->name, (char*) lpFindFileData->cFileName); + + info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow) + + (((ib_int64_t)(lpFindFileData->nFileSizeHigh)) + << 32); + + if (lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_REPARSE_POINT) { + /* TODO: test Windows symlinks */ + /* TODO: MySQL has apparently its own symlink + implementation in Windows, dbname.sym can + redirect a database directory: + REFMAN "windows-symbolic-links.html" */ + info->type = OS_FILE_TYPE_LINK; + } else if (lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_DIRECTORY) { + info->type = OS_FILE_TYPE_DIR; + } else { + /* It is probably safest to assume that all other + file types are normal. Better to check them rather + than blindly skip them. 
*/ + + info->type = OS_FILE_TYPE_FILE; + } + } + + ut_free(lpFindFileData); + + if (ret) { + return(0); + } else if (GetLastError() == ERROR_NO_MORE_FILES) { + + return(1); + } else { + os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE); + return(-1); + } +#else + struct dirent* ent; + char* full_path; + int ret; + struct stat statinfo; +#ifdef HAVE_READDIR_R + char dirent_buf[sizeof(struct dirent) + + _POSIX_PATH_MAX + 100]; + /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as + the max file name len; but in most standards, the + length is NAME_MAX; we add 100 to be even safer */ +#endif + +next_file: + +#ifdef HAVE_READDIR_R + ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent); + + if (ret != 0 +#ifdef UNIV_AIX + /* On AIX, only if we got non-NULL 'ent' (result) value and + a non-zero 'ret' (return) value, it indicates a failed + readdir_r() call. An NULL 'ent' with an non-zero 'ret' + would indicate the "end of the directory" is reached. */ + && ent != NULL +#endif + ) { + fprintf(stderr, + "InnoDB: cannot read directory %s, error %lu\n", + dirname, (ulong) ret); + + return(-1); + } + + if (ent == NULL) { + /* End of directory */ + + return(1); + } + + ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1); +#else + ent = readdir(dir); + + if (ent == NULL) { + + return(1); + } +#endif + ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH); + + if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) { + + goto next_file; + } + + strcpy(info->name, ent->d_name); + + full_path = static_cast<char*>( + ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10)); + + sprintf(full_path, "%s/%s", dirname, ent->d_name); + + ret = stat(full_path, &statinfo); + + if (ret) { + + if (errno == ENOENT) { + /* readdir() returned a file that does not exist, + it must have been deleted in the meantime. Do what + would have happened if the file was deleted before + readdir() - ignore and go to the next entry. 
+ If this is the last entry then info->name will still + contain the name of the deleted file when this + function returns, but this is not an issue since the + caller shouldn't be looking at info when end of + directory is returned. */ + + ut_free(full_path); + + goto next_file; + } + + os_file_handle_error_no_exit(full_path, "stat", FALSE); + + ut_free(full_path); + + return(-1); + } + + info->size = (ib_int64_t) statinfo.st_size; + + if (S_ISDIR(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_DIR; + } else if (S_ISLNK(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_LINK; + } else if (S_ISREG(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_FILE; + } else { + info->type = OS_FILE_TYPE_UNKNOWN; + } + + ut_free(full_path); + + return(0); +#endif +} + +/*****************************************************************//** +This function attempts to create a directory named pathname. The new +directory gets default permissions. On Unix the permissions are +(0770 & ~umask). If the directory exists already, nothing is done and +the call succeeds, unless the fail_if_exists arguments is true. +If another error occurs, such as a permission error, this does not crash, +but reports the error and returns FALSE. +@return TRUE if call succeeds, FALSE on error */ +UNIV_INTERN +ibool +os_file_create_directory( +/*=====================*/ + const char* pathname, /*!< in: directory name as + null-terminated string */ + ibool fail_if_exists) /*!< in: if TRUE, pre-existing directory + is treated as an error. 
*/ +{ +#ifdef __WIN__ + BOOL rcode; + + rcode = CreateDirectory((LPCTSTR) pathname, NULL); + if (!(rcode != 0 + || (GetLastError() == ERROR_ALREADY_EXISTS + && !fail_if_exists))) { + + os_file_handle_error_no_exit( + pathname, "CreateDirectory", FALSE); + + return(FALSE); + } + + return(TRUE); +#else + int rcode; + WAIT_ALLOW_WRITES(); + + rcode = mkdir(pathname, 0770); + + if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { + /* failure */ + os_file_handle_error_no_exit(pathname, "mkdir", FALSE); + + return(FALSE); + } + + return (TRUE); +#endif /* __WIN__ */ +} + +/****************************************************************//** +NOTE! Use the corresponding macro os_file_create_simple(), not directly +this function! +A simple function to open or create a file. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +os_file_t +os_file_create_simple_func( +/*=======================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY or + OS_FILE_READ_WRITE */ + ibool* success)/*!< out: TRUE if succeed, FALSE if error */ +{ + os_file_t file; + ibool retry; + + *success = FALSE; +#ifdef __WIN__ + DWORD access; + DWORD create_flag; + DWORD attributes = 0; + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + + if (create_mode == OS_FILE_OPEN) { + + create_flag = OPEN_EXISTING; + + } else if (srv_read_only_mode) { + + create_flag = OPEN_EXISTING; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = CREATE_NEW; + + } else if (create_mode == OS_FILE_CREATE_PATH) { + + ut_a(!srv_read_only_mode); + + /* Create subdirs along the path if needed */ + *success = os_file_create_subdirs_if_needed(name); + + if (!*success) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to create subdirectories '%s'", + 
name); + + return((os_file_t) -1); + } + + create_flag = CREATE_NEW; + create_mode = OS_FILE_CREATE; + + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); + } + + if (access_type == OS_FILE_READ_ONLY) { + access = GENERIC_READ; + } else if (srv_read_only_mode) { + + ib_logf(IB_LOG_LEVEL_INFO, + "read only mode set. Unable to " + "open file '%s' in RW mode, trying RO mode", name); + + access = GENERIC_READ; + + } else if (access_type == OS_FILE_READ_WRITE) { + access = GENERIC_READ | GENERIC_WRITE; + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file access type (%lu) for file '%s'", + access_type, name); + + return((os_file_t) -1); + } + + do { + /* Use default security attributes and no template file. */ + + file = CreateFile( + (LPCTSTR) name, access, FILE_SHARE_READ, NULL, + create_flag, attributes, NULL); + + if (file == INVALID_HANDLE_VALUE) { + + *success = FALSE; + + retry = os_file_handle_error( + name, create_mode == OS_FILE_OPEN ? 
+ "open" : "create"); + + } else { + *success = TRUE; + retry = false; + } + + } while (retry); + +#else /* __WIN__ */ + int create_flag; + if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) + WAIT_ALLOW_WRITES(); + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + + if (create_mode == OS_FILE_OPEN) { + + if (access_type == OS_FILE_READ_ONLY) { + create_flag = O_RDONLY; + } else if (srv_read_only_mode) { + create_flag = O_RDONLY; + } else { + create_flag = O_RDWR; + } + + } else if (srv_read_only_mode) { + + create_flag = O_RDONLY; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = O_RDWR | O_CREAT | O_EXCL; + + } else if (create_mode == OS_FILE_CREATE_PATH) { + + /* Create subdirs along the path if needed */ + + *success = os_file_create_subdirs_if_needed(name); + + if (!*success) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to create subdirectories '%s'", + name); + + return((os_file_t) -1); + } + + create_flag = O_RDWR | O_CREAT | O_EXCL; + create_mode = OS_FILE_CREATE; + } else { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); + } + + do { - file = ::open(name, create_flag, os_innodb_umask); ++ file = ::open(name, create_flag | O_CLOEXEC, os_innodb_umask); + + if (file == -1) { + *success = FALSE; + + retry = os_file_handle_error( + name, + create_mode == OS_FILE_OPEN + ? "open" : "create"); + } else { + *success = TRUE; + retry = false; + } + + } while (retry); + +#ifdef USE_FILE_LOCK + if (!srv_read_only_mode + && *success + && access_type == OS_FILE_READ_WRITE + && os_file_lock(file, name)) { + + *success = FALSE; + close(file); + file = -1; + } +#endif /* USE_FILE_LOCK */ + +#endif /* __WIN__ */ + + return(file); +} + +/****************************************************************//** +NOTE! Use the corresponding macro +os_file_create_simple_no_error_handling(), not directly this function! 
+A simple function to open or create a file. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +pfs_os_file_t +os_file_create_simple_no_error_handling_func( +/*=========================================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY, + OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file */ + ibool* success)/*!< out: TRUE if succeed, FALSE if error */ +{ + pfs_os_file_t file; + + *success = FALSE; +#ifdef __WIN__ + DWORD access; + DWORD create_flag; + DWORD attributes = 0; + DWORD share_mode = FILE_SHARE_READ; + ut_a(name); + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + + if (create_mode == OS_FILE_OPEN) { + create_flag = OPEN_EXISTING; + } else if (srv_read_only_mode) { + create_flag = OPEN_EXISTING; + } else if (create_mode == OS_FILE_CREATE) { + create_flag = CREATE_NEW; + } else { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + file.m_file = (os_file_t)-1; + return(file); + } + + if (access_type == OS_FILE_READ_ONLY) { + access = GENERIC_READ; + } else if (srv_read_only_mode) { + access = GENERIC_READ; + } else if (access_type == OS_FILE_READ_WRITE) { + access = GENERIC_READ | GENERIC_WRITE; + } else if (access_type == OS_FILE_READ_ALLOW_DELETE) { + + ut_a(!srv_read_only_mode); + + access = GENERIC_READ; + + /*!< A backup program has to give mysqld the maximum + freedom to do what it likes with the file */ + + share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE; + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file access type (%lu) for file '%s'", + access_type, name); + file.m_file = (os_file_t)-1; + return(file); + } + + file.m_file = CreateFile((LPCTSTR) 
name, + access, + share_mode, + NULL, // Security attributes + create_flag, + attributes, + NULL); // No template file + + *success = (file.m_file != INVALID_HANDLE_VALUE); +#else /* __WIN__ */ + int create_flag; + const char* mode_str = NULL; + ut_a(name); + if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) + WAIT_ALLOW_WRITES(); + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + + if (create_mode == OS_FILE_OPEN) { + + mode_str = "OPEN"; + + if (access_type == OS_FILE_READ_ONLY) { + + create_flag = O_RDONLY; + + } else if (srv_read_only_mode) { + + create_flag = O_RDONLY; + + } else { + + ut_a(access_type == OS_FILE_READ_WRITE + || access_type == OS_FILE_READ_ALLOW_DELETE); + + create_flag = O_RDWR; + } + + } else if (srv_read_only_mode) { + + mode_str = "OPEN"; + + create_flag = O_RDONLY; + + } else if (create_mode == OS_FILE_CREATE) { + + mode_str = "CREATE"; + + create_flag = O_RDWR | O_CREAT | O_EXCL; + + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + file.m_file = -1; + return(file); + } + - file.m_file = ::open(name, create_flag, os_innodb_umask); ++ file.m_file = ::open(name, create_flag | O_CLOEXEC, os_innodb_umask); + + *success = file.m_file == -1 ? FALSE : TRUE; + + /* This function is always called for data files, we should disable + OS caching (O_DIRECT) here as we do in os_file_create_func(), so + we open the same file in the same mode, see man page of open(2). 
*/ + if (!srv_read_only_mode + && *success + && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT + || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) { + + os_file_set_nocache(file.m_file, name, mode_str); + } + +#ifdef USE_FILE_LOCK + if (!srv_read_only_mode + && *success + && access_type == OS_FILE_READ_WRITE + && os_file_lock(file.m_file, name)) { + + *success = FALSE; + close(file.m_file); + file.m_file = -1; + + } +#endif /* USE_FILE_LOCK */ + +#endif /* __WIN__ */ + + return(file); +} + +/****************************************************************//** +Tries to disable OS caching on an opened file descriptor. */ +UNIV_INTERN +void +os_file_set_nocache( +/*================*/ + int fd /*!< in: file descriptor to alter */ + MY_ATTRIBUTE((unused)), + const char* file_name /*!< in: used in the diagnostic + message */ + MY_ATTRIBUTE((unused)), + const char* operation_name MY_ATTRIBUTE((unused))) + /*!< in: "open" or "create"; used + in the diagnostic message */ +{ + /* some versions of Solaris may not have DIRECTIO_ON */ +#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) + if (directio(fd, DIRECTIO_ON) == -1) { + int errno_save = errno; + + ib_logf(IB_LOG_LEVEL_ERROR, + "Failed to set DIRECTIO_ON on file %s: %s: %s, " + "continuing anyway.", + file_name, operation_name, strerror(errno_save)); + } +#elif defined(O_DIRECT) + if (fcntl(fd, F_SETFL, O_DIRECT) == -1) { + int errno_save = errno; + static bool warning_message_printed = false; + if (errno_save == EINVAL) { + if (!warning_message_printed) { + warning_message_printed = true; +# ifdef UNIV_LINUX + ib_logf(IB_LOG_LEVEL_WARN, + "Failed to set O_DIRECT on file " + "%s: %s: %s, continuing anyway. 
" + "O_DIRECT is known to result " + "in 'Invalid argument' on Linux on " + "tmpfs, see MySQL Bug#26662.", + file_name, operation_name, + strerror(errno_save)); +# else /* UNIV_LINUX */ + goto short_warning; +# endif /* UNIV_LINUX */ + } + } else { +# ifndef UNIV_LINUX +short_warning: +# endif + ib_logf(IB_LOG_LEVEL_WARN, + "Failed to set O_DIRECT on file %s: %s: %s, " + "continuing anyway.", + file_name, operation_name, strerror(errno_save)); + } + } +#endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */ +} + +/****************************************************************//** +NOTE! Use the corresponding macro os_file_create(), not directly +this function! +Opens an existing file or creates a new. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +pfs_os_file_t +os_file_create_func( +/*================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous, + non-buffered i/o is desired, + OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. 
variables whether we really use + async i/o or unbuffered i/o: look in the + function source code for the exact rules */ + ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ + ibool* success)/*!< out: TRUE if succeed, FALSE if error */ +{ + pfs_os_file_t file; + ibool retry; + ibool on_error_no_exit; + ibool on_error_silent; +#ifdef __WIN__ + DBUG_EXECUTE_IF( + "ib_create_table_fail_disk_full", + *success = FALSE; + SetLastError(ERROR_DISK_FULL); + file.m_file = (os_file_t)-1; + return(file); + ); +#else /* __WIN__ */ + DBUG_EXECUTE_IF( + "ib_create_table_fail_disk_full", + *success = FALSE; + errno = ENOSPC; + file.m_file = -1; + return(file); + ); +#endif /* __WIN__ */ + +#ifdef __WIN__ + DWORD create_flag; + DWORD share_mode = FILE_SHARE_READ; + + on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT + ? TRUE : FALSE; + + on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT + ? TRUE : FALSE; + + create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT; + create_mode &= ~OS_FILE_ON_ERROR_SILENT; + + if (create_mode == OS_FILE_OPEN_RAW) { + + ut_a(!srv_read_only_mode); + + create_flag = OPEN_EXISTING; + + /* On Windows Physical devices require admin privileges and + have to have the write-share mode set. See the remarks + section for the CreateFile() function documentation in MSDN. 
*/ + + share_mode |= FILE_SHARE_WRITE; + + } else if (create_mode == OS_FILE_OPEN + || create_mode == OS_FILE_OPEN_RETRY) { + + create_flag = OPEN_EXISTING; + + } else if (srv_read_only_mode) { + + create_flag = OPEN_EXISTING; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = CREATE_NEW; + + } else if (create_mode == OS_FILE_OVERWRITE) { + + create_flag = CREATE_ALWAYS; + + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + file.m_file = (os_file_t)-1; + return(file); + } + + DWORD attributes = 0; + +#ifdef UNIV_HOTBACKUP + attributes |= FILE_FLAG_NO_BUFFERING; +#else + if (purpose == OS_FILE_AIO) { + +#ifdef WIN_ASYNC_IO + /* If specified, use asynchronous (overlapped) io and no + buffering of writes in the OS */ + + if (srv_use_native_aio) { + attributes |= FILE_FLAG_OVERLAPPED; + } +#endif /* WIN_ASYNC_IO */ + + } else if (purpose == OS_FILE_NORMAL) { + /* Use default setting. */ + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown purpose flag (%lu) while opening file '%s'", + purpose, name); + file.m_file = (os_file_t)-1; + return(file); + } + +#ifdef UNIV_NON_BUFFERED_IO + // TODO: Create a bug, this looks wrong. The flush log + // parameter is dynamic. + if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) { + + /* Do not use unbuffered i/o for the log files because + value 2 denotes that we do not flush the log at every + commit, but only once per second */ + + } else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) { + + attributes |= FILE_FLAG_NO_BUFFERING; + } +#endif /* UNIV_NON_BUFFERED_IO */ + +#endif /* UNIV_HOTBACKUP */ + DWORD access = GENERIC_READ; + + if (!srv_read_only_mode) { + access |= GENERIC_WRITE; + } + + do { + /* Use default security attributes and no template file. 
*/ + file.m_file = CreateFile( + (LPCTSTR) name, access, share_mode, NULL, + create_flag, attributes, NULL); + + if (file.m_file == INVALID_HANDLE_VALUE) { + const char* operation; + + operation = (create_mode == OS_FILE_CREATE + && !srv_read_only_mode) + ? "create" : "open"; + + *success = FALSE; + + if (on_error_no_exit) { + retry = os_file_handle_error_no_exit( + name, operation, on_error_silent); + } else { + retry = os_file_handle_error(name, operation); + } + } else { + *success = TRUE; + retry = FALSE; + } + + } while (retry); + +#else /* __WIN__ */ + int create_flag; + const char* mode_str = NULL; + if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) + WAIT_ALLOW_WRITES(); + + on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT + ? TRUE : FALSE; + on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT + ? TRUE : FALSE; + + create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT; + create_mode &= ~OS_FILE_ON_ERROR_SILENT; + + if (create_mode == OS_FILE_OPEN + || create_mode == OS_FILE_OPEN_RAW + || create_mode == OS_FILE_OPEN_RETRY) { + + mode_str = "OPEN"; + + create_flag = srv_read_only_mode ? 
O_RDONLY : O_RDWR; + + } else if (srv_read_only_mode) { + + mode_str = "OPEN"; + + create_flag = O_RDONLY; + + } else if (create_mode == OS_FILE_CREATE) { + + mode_str = "CREATE"; + create_flag = O_RDWR | O_CREAT | O_EXCL; + + } else if (create_mode == OS_FILE_OVERWRITE) { + + mode_str = "OVERWRITE"; + create_flag = O_RDWR | O_CREAT | O_TRUNC; + + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + file.m_file = -1; + return(file); + } + + ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE); + ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL); + +#ifdef O_SYNC + /* We let O_SYNC only affect log files; note that we map O_DSYNC to + O_SYNC because the datasync options seemed to corrupt files in 2001 + in both Linux and Solaris */ + + if (!srv_read_only_mode + && type == OS_LOG_FILE + && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { + + create_flag |= O_SYNC; + } +#endif /* O_SYNC */ + + do { - file.m_file = ::open(name, create_flag, os_innodb_umask); ++ file.m_file = ::open(name, create_flag | O_CLOEXEC, os_innodb_umask); + + if (file.m_file == -1) { + const char* operation; + + operation = (create_mode == OS_FILE_CREATE + && !srv_read_only_mode) + ? 
"create" : "open"; + + *success = FALSE; + + if (on_error_no_exit) { + retry = os_file_handle_error_no_exit( + name, operation, on_error_silent); + } else { + retry = os_file_handle_error(name, operation); + } + } else { + *success = TRUE; + retry = false; + } + + } while (retry); + + /* We disable OS caching (O_DIRECT) only on data files */ + + if (!srv_read_only_mode + && *success + && type != OS_LOG_FILE + && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT + || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) { + + os_file_set_nocache(file.m_file, name, mode_str); + } + +#ifdef USE_FILE_LOCK + if (!srv_read_only_mode + && *success + && create_mode != OS_FILE_OPEN_RAW + && os_file_lock(file.m_file, name)) { + + if (create_mode == OS_FILE_OPEN_RETRY) { + + ut_a(!srv_read_only_mode); + + ib_logf(IB_LOG_LEVEL_INFO, + "Retrying to lock the first data file"); + + for (int i = 0; i < 100; i++) { + os_thread_sleep(1000000); + + if (!os_file_lock(file.m_file, name)) { + *success = TRUE; + return(file); + } + } + + ib_logf(IB_LOG_LEVEL_INFO, + "Unable to open the first data file"); + } + + *success = FALSE; + close(file.m_file); + file.m_file = -1; + } +#endif /* USE_FILE_LOCK */ + +#endif /* __WIN__ */ + + return(file); +} + +/***********************************************************************//** +Deletes a file if it exists. The file has to be closed before calling this. 
+@return TRUE if success */ +UNIV_INTERN +bool +os_file_delete_if_exists_func( +/*==========================*/ + const char* name) /*!< in: file path as a null-terminated + string */ +{ +#ifdef __WIN__ + bool ret; + ulint count = 0; +loop: + /* In Windows, deleting an .ibd file may fail if mysqlbackup is copying + it */ + + ret = DeleteFile((LPCTSTR) name); + + if (ret) { + return(true); + } + + DWORD lasterr = GetLastError(); + if (lasterr == ERROR_FILE_NOT_FOUND + || lasterr == ERROR_PATH_NOT_FOUND) { + /* the file does not exist, this not an error */ + + return(true); + } + + count++; + + if (count > 100 && 0 == (count % 10)) { + os_file_get_last_error(true); /* print error information */ + + ib_logf(IB_LOG_LEVEL_WARN, "Delete of file %s failed.", name); + } + + os_thread_sleep(500000); /* sleep for 0.5 second */ + + if (count > 2000) { + + return(false); + } + + goto loop; +#else + int ret; + WAIT_ALLOW_WRITES(); + + ret = unlink(name); + + if (ret != 0 && errno != ENOENT) { + os_file_handle_error_no_exit(name, "delete", FALSE); + + return(false); + } + + return(true); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Deletes a file. The file has to be closed before calling this. 
+@return TRUE if success */ +UNIV_INTERN +bool +os_file_delete_func( +/*================*/ + const char* name) /*!< in: file path as a null-terminated + string */ +{ +#ifdef __WIN__ + BOOL ret; + ulint count = 0; +loop: + /* In Windows, deleting an .ibd file may fail if mysqlbackup is copying + it */ + + ret = DeleteFile((LPCTSTR) name); + + if (ret) { + return(true); + } + + if (GetLastError() == ERROR_FILE_NOT_FOUND) { + /* If the file does not exist, we classify this as a 'mild' + error and return */ + + return(false); + } + + count++; + + if (count > 100 && 0 == (count % 10)) { + os_file_get_last_error(true); /* print error information */ + + fprintf(stderr, + "InnoDB: Warning: cannot delete file %s\n" + "InnoDB: Are you running mysqlbackup" + " to back up the file?\n", name); + } + + os_thread_sleep(1000000); /* sleep for a second */ + + if (count > 2000) { + + return(false); + } + + goto loop; +#else + int ret; + WAIT_ALLOW_WRITES(); + + ret = unlink(name); + + if (ret != 0) { + os_file_handle_error_no_exit(name, "delete", FALSE); + + return(false); + } + + return(true); +#endif +} + +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_rename(), not directly this function! +Renames a file (can also move it to another directory). It is safest that the +file is closed before calling this function. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_rename_func( +/*================*/ + const char* oldpath,/*!< in: old file path as a null-terminated + string */ + const char* newpath)/*!< in: new file path */ +{ +#ifdef UNIV_DEBUG + os_file_type_t type; + ibool exists; + + /* New path must not exist. */ + ut_ad(os_file_status(newpath, &exists, &type)); + ut_ad(!exists); + + /* Old path must exist. 
*/ + ut_ad(os_file_status(oldpath, &exists, &type)); + ut_ad(exists); +#endif /* UNIV_DEBUG */ + +#ifdef __WIN__ + BOOL ret; + + ret = MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath); + + if (ret) { + return(TRUE); + } + + os_file_handle_error_no_exit(oldpath, "rename", FALSE); + + return(FALSE); +#else + int ret; + WAIT_ALLOW_WRITES(); + + ret = rename(oldpath, newpath); + + if (ret != 0) { + os_file_handle_error_no_exit(oldpath, "rename", FALSE); + + return(FALSE); + } + + return(TRUE); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_close(), not directly this function! +Closes a file handle. In case of error, error number can be retrieved with +os_file_get_last_error. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_close_func( +/*===============*/ + os_file_t file) /*!< in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ret = CloseHandle(file); + + if (ret) { + return(TRUE); + } + + os_file_handle_error(NULL, "close"); + + return(FALSE); +#else + int ret; + + ret = close(file); + + if (ret == -1) { + os_file_handle_error(NULL, "close"); + + return(FALSE); + } + + return(TRUE); +#endif /* __WIN__ */ +} + +#ifdef UNIV_HOTBACKUP +/***********************************************************************//** +Closes a file handle. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_close_no_error_handling( +/*============================*/ + os_file_t file) /*!< in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ret = CloseHandle(file); + + if (ret) { + return(TRUE); + } + + return(FALSE); +#else + int ret; + + ret = close(file); + + if (ret == -1) { + + return(FALSE); + } + + return(TRUE); +#endif /* __WIN__ */ +} +#endif /* UNIV_HOTBACKUP */ + +/***********************************************************************//** +Gets a file size. 
+@return file size, or (os_offset_t) -1 on failure */ +UNIV_INTERN +os_offset_t +os_file_get_size( +/*=============*/ + pfs_os_file_t file) /*!< in: handle to a file */ +{ +#ifdef __WIN__ + os_offset_t offset; + DWORD high; + DWORD low; + + low = GetFileSize(file.m_file, &high); + + if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) { + return((os_offset_t) -1); + } + + offset = (os_offset_t) low | ((os_offset_t) high << 32); + + return(offset); +#else + return((os_offset_t) lseek(file.m_file, 0, SEEK_END)); + +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Write the specified number of zeros to a newly created file. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_size( +/*=============*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + pfs_os_file_t file, /*!< in: handle to a file */ + os_offset_t size) /*!< in: file size */ +{ + ibool ret; + byte* buf; + byte* buf2; + ulint buf_size; + +#ifdef HAVE_POSIX_FALLOCATE + if (srv_use_posix_fallocate) { + int err; + do { + err = posix_fallocate(file.m_file, 0, size); + } while (err == EINTR + && srv_shutdown_state == SRV_SHUTDOWN_NONE); + + if (err) { + ib_logf(IB_LOG_LEVEL_ERROR, + "preallocating " INT64PF " bytes for" + "file %s failed with error %d", + size, name, err); + } + return(!err); + } +#endif + +#ifdef _WIN32 + /* Write 1 page of zeroes at the desired end. */ + buf_size = UNIV_PAGE_SIZE; + os_offset_t current_size = size - buf_size; +#else + /* Write up to 1 megabyte at a time. 
*/ + buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE)) + * UNIV_PAGE_SIZE; + os_offset_t current_size = 0; +#endif + buf2 = static_cast<byte*>(calloc(1, buf_size + UNIV_PAGE_SIZE)); + + if (!buf2) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot allocate " ULINTPF " bytes to extend file\n", + buf_size + UNIV_PAGE_SIZE); + return(FALSE); + } + + /* Align the buffer for possible raw i/o */ + buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE)); + + do { + ulint n_bytes; + + if (size - current_size < (os_offset_t) buf_size) { + n_bytes = (ulint) (size - current_size); + } else { + n_bytes = buf_size; + } + + ret = os_file_write(name, file, buf, current_size, n_bytes); + if (!ret) { + break; + } + + current_size += n_bytes; + } while (current_size < size); + + free(buf2); + + return(ret && os_file_flush(file)); +} + +/***********************************************************************//** +Truncates a file at its current position. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_eof( +/*============*/ + FILE* file) /*!< in: file to be truncated */ +{ +#ifdef __WIN__ + HANDLE h = (HANDLE) _get_osfhandle(fileno(file)); + return(SetEndOfFile(h)); +#else /* __WIN__ */ + WAIT_ALLOW_WRITES(); + return(!ftruncate(fileno(file), ftell(file))); +#endif /* __WIN__ */ +} + +#ifndef __WIN__ +/***********************************************************************//** +Wrapper to fsync(2) that retries the call on some errors. +Returns the value 0 if successful; otherwise the value -1 is returned and +the global variable errno is set to indicate the error. 
+@return 0 if success, -1 otherwise */ + +static +int +os_file_fsync( +/*==========*/ + os_file_t file) /*!< in: handle to a file */ +{ + int ret; + int failures; + ibool retry; + + failures = 0; + + do { + ret = fsync(file); + + os_n_fsyncs++; + + if (ret == -1 && errno == ENOLCK) { + + if (failures % 100 == 0) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: fsync(): " + "No locks available; retrying\n"); + } + + os_thread_sleep(200000 /* 0.2 sec */); + + failures++; + + retry = TRUE; + } else { + + retry = FALSE; + } + } while (retry); + + return(ret); +} +#endif /* !__WIN__ */ + +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_flush(), not directly this function! +Flushes the write buffers of a given file to the disk. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_flush_func( +/*===============*/ + os_file_t file) /*!< in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + os_n_fsyncs++; + + ret = FlushFileBuffers(file); + + if (ret) { + return(TRUE); + } + + /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is + actually a raw device, we choose to ignore that error if we are using + raw disks */ + + if (srv_start_raw_disk_in_use && GetLastError() + == ERROR_INVALID_FUNCTION) { + return(TRUE); + } + + os_file_handle_error(NULL, "flush"); + + /* It is a fatal error if a file flush does not succeed, because then + the database can get corrupt on disk */ + ut_error; + + return(FALSE); +#else + int ret; + WAIT_ALLOW_WRITES(); + +#if defined(HAVE_DARWIN_THREADS) +# ifndef F_FULLFSYNC + /* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */ +# define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */ +# elif F_FULLFSYNC != 51 +# error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3" +# endif + /* Apple has disabled fsync() for internal disk drives in OS X. 
That + caused corruption for a user when he tested a power outage. Let us in + OS X use a nonstandard flush method recommended by an Apple + engineer. */ + + if (!srv_have_fullfsync) { + /* If we are not on an operating system that supports this, + then fall back to a plain fsync. */ + + ret = os_file_fsync(file); + } else { + ret = fcntl(file, F_FULLFSYNC, NULL); + + if (ret) { + /* If we are not on a file system that supports this, + then fall back to a plain fsync. */ + ret = os_file_fsync(file); + } + } +#else + ret = os_file_fsync(file); +#endif + + if (ret == 0) { + return(TRUE); + } + + /* Since Linux returns EINVAL if the 'file' is actually a raw device, + we choose to ignore that error if we are using raw disks */ + + if (srv_start_raw_disk_in_use && errno == EINVAL) { + + return(TRUE); + } + + ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed"); + + os_file_handle_error(NULL, "flush"); + + /* It is a fatal error if a file flush does not succeed, because then + the database can get corrupt on disk */ + ut_error; + + return(FALSE); +#endif +} + +#ifndef __WIN__ +/*******************************************************************//** +Does a synchronous read operation in Posix. 
+@return number of bytes read, -1 if error */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +ssize_t +os_file_pread( +/*==========*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + ulint n, /*!< in: number of bytes to read */ + os_offset_t offset) /*!< in: file offset from where to read */ +{ + off_t offs; + + ut_ad(n); + + /* If off_t is > 4 bytes in size, then we assume we can pass a + 64-bit address */ + offs = (off_t) offset; + + if (sizeof(off_t) <= 4) { + if (offset != (os_offset_t) offs) { + ib_logf(IB_LOG_LEVEL_ERROR, + "File read at offset > 4 GB"); + } + } + + os_n_file_reads++; + + const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS); + +#ifdef HAVE_PREAD + MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor); + ssize_t n_bytes = pread(file, buf, n, offs); + MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor); + return(n_bytes); +#else + { + off_t ret_offset; + ssize_t ret; +#ifndef UNIV_HOTBACKUP + ulint i; +#endif /* !UNIV_HOTBACKUP */ + + MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor); +#ifndef UNIV_HOTBACKUP + /* Protect the seek / read operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); +#endif /* !UNIV_HOTBACKUP */ + + ret_offset = lseek(file, offs, SEEK_SET); + + if (ret_offset < 0) { + ret = -1; + } else { + ret = read(file, buf, (ssize_t) n); + } + +#ifndef UNIV_HOTBACKUP + os_mutex_exit(os_file_seek_mutexes[i]); +#endif /* !UNIV_HOTBACKUP */ + + MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor); + return(ret); + } +#endif +} + +/*******************************************************************//** +Does a synchronous write operation in Posix. 
+@return number of bytes written, -1 if error */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +ssize_t +os_file_pwrite( +/*===========*/ + os_file_t file, /*!< in: handle to a file */ + const void* buf, /*!< in: buffer from where to write */ + ulint n, /*!< in: number of bytes to write */ + os_offset_t offset) /*!< in: file offset where to write */ +{ + ssize_t ret; + off_t offs; + + ut_ad(n); + ut_ad(!srv_read_only_mode); + + /* If off_t is > 4 bytes in size, then we assume we can pass a + 64-bit address */ + offs = (off_t) offset; + + if (sizeof(off_t) <= 4) { + if (offset != (os_offset_t) offs) { + ib_logf(IB_LOG_LEVEL_ERROR, + "File write at offset > 4 GB."); + } + } + + os_n_file_writes++; + + const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES); +#ifdef HAVE_PWRITE + MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor); + ret = pwrite(file, buf, (ssize_t) n, offs); + MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor); + + return(ret); +#else + { + off_t ret_offset; +# ifndef UNIV_HOTBACKUP + ulint i; +# endif /* !UNIV_HOTBACKUP */ + + MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor); + +# ifndef UNIV_HOTBACKUP + /* Protect the seek / write operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); +# endif /* UNIV_HOTBACKUP */ + + ret_offset = lseek(file, offs, SEEK_SET); + + if (ret_offset < 0) { + ret = -1; + + goto func_exit; + } + + ret = write(file, buf, (ssize_t) n); + +func_exit: +# ifndef UNIV_HOTBACKUP + os_mutex_exit(os_file_seek_mutexes[i]); +# endif /* !UNIV_HOTBACKUP */ + + MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor); + return(ret); + } +#endif /* HAVE_PWRITE */ +} +#endif + +/*******************************************************************//** +NOTE! Use the corresponding macro os_file_read(), not directly this +function! +Requests a synchronous positioned read operation. 
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_read_func(
+/*==============*/
+	os_file_t	file,	/*!< in: handle to a file */
+	void*		buf,	/*!< in: buffer where to read */
+	os_offset_t	offset,	/*!< in: file offset where to read */
+	ulint		n)	/*!< in: number of bytes to read */
+{
+#ifdef __WIN__
+	BOOL		ret;
+	DWORD		len;
+	DWORD		ret2;
+	DWORD		low;
+	DWORD		high;
+	ibool		retry;
+#ifndef UNIV_HOTBACKUP
+	ulint		i;
+#endif /* !UNIV_HOTBACKUP */
+
+	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
+	no more than 32 bits. */
+	ut_a((n & 0xFFFFFFFFUL) == n);
+
+	os_n_file_reads++;
+	os_bytes_read_since_printout += n;
+	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
+
+try_again:
+	ut_ad(buf);
+	ut_ad(n > 0);
+
+	/* SetFilePointer() takes the 64-bit offset as two 32-bit halves. */
+	low = (DWORD) offset & 0xFFFFFFFF;
+	high = (DWORD) (offset >> 32);
+
+	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
+
+#ifndef UNIV_HOTBACKUP
+	/* Protect the seek / read operation with a mutex */
+	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+	os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+	ret2 = SetFilePointer(
+		file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
+
+	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
+
+#ifndef UNIV_HOTBACKUP
+		os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+		MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
+		goto error_handling;
+	}
+
+	ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
+
+#ifndef UNIV_HOTBACKUP
+	os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
+
+	if (ret && len == n) {
+		return(TRUE);
+	}
+#else /* __WIN__ */
+	ibool	retry;
+	ssize_t	ret;
+
+	os_bytes_read_since_printout += n;
+
+try_again:
+	ret = os_file_pread(file, buf, n, offset);
+
+	/* Anything other than a full read of n bytes is logged and then
+	falls through to the common error handling below. */
+	if ((ulint) ret == n) {
+		return(TRUE);
+	} else if (ret == -1) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Error in system call pread(). The operating"
+			" system error number is %lu.",(ulint) errno);
+	} else {
+		/* Partial read occurred */
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Tried to read " ULINTPF " bytes at offset "
+			UINT64PF ". Was only able to read %ld.",
+			n, offset, (lint) ret);
+	}
+#endif /* __WIN__ */
+#ifdef __WIN__
+error_handling:
+#endif
+	/* Shared by both platforms: retry if the handler asks for it,
+	otherwise report the failure and abort via ut_error. */
+	retry = os_file_handle_error(NULL, "read");
+
+	if (retry) {
+		goto try_again;
+	}
+
+	fprintf(stderr,
+		"InnoDB: Fatal error: cannot read from file."
+		" OS error number %lu.\n",
+#ifdef __WIN__
+		(ulong) GetLastError()
+#else
+		(ulong) errno
+#endif /* __WIN__ */
+		);
+	fflush(stderr);
+
+	ut_error;
+
+	return(FALSE);
+}
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_read_no_error_handling(),
+not directly this function!
+Requests a synchronous positioned read operation. This function does not do
+any error handling. In case of error it returns FALSE.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_read_no_error_handling_func(
+/*================================*/
+	os_file_t	file,	/*!< in: handle to a file */
+	void*		buf,	/*!< in: buffer where to read */
+	os_offset_t	offset,	/*!< in: file offset where to read */
+	ulint		n)	/*!< in: number of bytes to read */
+{
+#ifdef __WIN__
+	BOOL		ret;
+	DWORD		len;
+	DWORD		ret2;
+	DWORD		low;
+	DWORD		high;
+	ibool		retry;
+#ifndef UNIV_HOTBACKUP
+	ulint		i;
+#endif /* !UNIV_HOTBACKUP */
+
+	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
+	no more than 32 bits. */
+	ut_a((n & 0xFFFFFFFFUL) == n);
+
+	os_n_file_reads++;
+	os_bytes_read_since_printout += n;
+	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS);
+
+try_again:
+	ut_ad(buf);
+	ut_ad(n > 0);
+
+	/* SetFilePointer() takes the 64-bit offset as two 32-bit halves. */
+	low = (DWORD) offset & 0xFFFFFFFF;
+	high = (DWORD) (offset >> 32);
+
+	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor);
+
+#ifndef UNIV_HOTBACKUP
+	/* Protect the seek / read operation with a mutex */
+	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+	os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+	ret2 = SetFilePointer(
+		file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
+
+	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
+
+#ifndef UNIV_HOTBACKUP
+		os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+		MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
+		goto error_handling;
+	}
+
+	ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
+
+#ifndef UNIV_HOTBACKUP
+	os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor);
+
+	if (ret && len == n) {
+		return(TRUE);
+	}
+#else /* __WIN__ */
+	ibool	retry;
+	ssize_t	ret;
+
+	os_bytes_read_since_printout += n;
+
+try_again:
+	ret = os_file_pread(file, buf, n, offset);
+
+	if ((ulint) ret == n) {
+		return(TRUE);
+	} else if (ret == -1) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Error in system call pread(). The operating"
+			" system error number is %lu.",(ulint) errno);
+	} else {
+		/* Partial read occurred */
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Tried to read " ULINTPF " bytes at offset "
+			UINT64PF ". Was only able to read %ld.",
+			n, offset, (lint) ret);
+	}
+#endif /* __WIN__ */
+#ifdef __WIN__
+error_handling:
+#endif
+	/* Unlike os_file_read_func(), the non-exiting handler is used and
+	a failure is returned to the caller instead of aborting. */
+	retry = os_file_handle_error_no_exit(NULL, "read", FALSE);
+
+	if (retry) {
+		goto try_again;
+	}
+
+	return(FALSE);
+}
+
+/*******************************************************************//**
+Rewind file to its start, read at most size - 1 bytes from it to str, and
+NUL-terminate str. All errors are silently ignored. This function is
+mostly meant to be used with temporary files. */
+UNIV_INTERN
+void
+os_file_read_string(
+/*================*/
+	FILE*	file,	/*!< in: file to read from */
+	char*	str,	/*!< in: buffer where to read */
+	ulint	size)	/*!< in: size of buffer */
+{
+	size_t	flen;
+
+	/* With an empty buffer there is no room even for the NUL. */
+	if (size == 0) {
+		return;
+	}
+
+	rewind(file);
+	flen = fread(str, 1, size - 1, file);
+	str[flen] = '\0';
+}
+
+/*******************************************************************//**
+NOTE! Use the corresponding macro os_file_write(), not directly
+this function!
+Requests a synchronous write operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_write_func(
+/*===============*/
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	os_file_t	file,	/*!< in: handle to a file */
+	const void*	buf,	/*!< in: buffer from which to write */
+	os_offset_t	offset,	/*!< in: file offset where to write */
+	ulint		n)	/*!< in: number of bytes to write */
+{
+	ut_ad(!srv_read_only_mode);
+#ifdef __WIN__
+	BOOL		ret;
+	DWORD		len;
+	DWORD		ret2;
+	DWORD		low;
+	DWORD		high;
+	ulint		n_retries	= 0;
+	ulint		err;
+	DWORD		saved_error = 0;
+#ifndef UNIV_HOTBACKUP
+	ulint		i;
+#endif /* !UNIV_HOTBACKUP */
+
+	/* On 64-bit Windows, ulint is 64 bits. But offset and n should be
+	no more than 32 bits. */
+	ut_a((n & 0xFFFFFFFFUL) == n);
+
+	os_n_file_writes++;
+
+	ut_ad(buf);
+	ut_ad(n > 0);
+	const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES);
+retry:
+	/* SetFilePointer() takes the 64-bit offset as two 32-bit halves. */
+	low = (DWORD) offset & 0xFFFFFFFF;
+	high = (DWORD) (offset >> 32);
+
+	MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+
+#ifndef UNIV_HOTBACKUP
+	/* Protect the seek / write operation with a mutex */
+	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+	os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+	ret2 = SetFilePointer(
+		file, low, reinterpret_cast<PLONG>(&high), FILE_BEGIN);
+
+	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
+
+#ifndef UNIV_HOTBACKUP
+		os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+		MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+			" InnoDB: Error: File pointer positioning to"
+			" file %s failed at\n"
+			"InnoDB: offset %llu. Operating system"
+			" error number %lu.\n"
+			"InnoDB: Some operating system error numbers"
+			" are described at\n"
+			"InnoDB: "
+			REFMAN "operating-system-error-codes.html\n",
+			name, offset, (ulong) GetLastError());
+
+		return(FALSE);
+	}
+
+	ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
+
+#ifndef UNIV_HOTBACKUP
+	os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+	MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor);
+
+	if (ret && len == n) {
+
+		return(TRUE);
+	}
+
+	/* If some background file system backup tool is running, then, at
+	least in Windows 2000, we may get here a specific error. Let us
+	retry the operation 100 times, with 1 second waits. */
+
+	if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
+
+		os_thread_sleep(1000000);
+
+		n_retries++;
+
+		goto retry;
+	}
+
+	/* The detailed diagnostics are printed only once; the flag
+	os_has_said_disk_full is set at the end of this branch. */
+	if (!os_has_said_disk_full) {
+		char *winmsg = NULL;
+
+		saved_error = GetLastError();
+		err = (ulint) saved_error;
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+			" InnoDB: Error: Write to file %s failed"
+			" at offset %llu.\n"
+			"InnoDB: %lu bytes should have been written,"
+			" only %lu were written.\n"
+			"InnoDB: Operating system error number %lu.\n"
+			"InnoDB: Check that your OS and file system"
+			" support files of this size.\n"
+			"InnoDB: Check also that the disk is not full"
+			" or a disk quota exceeded.\n",
+			name, offset,
+			(ulong) n, (ulong) len, (ulong) err);
+
+		/* Ask Windows to prepare a standard message for a
+		GetLastError() */
+
+		FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER |
+			      FORMAT_MESSAGE_FROM_SYSTEM |
+			      FORMAT_MESSAGE_IGNORE_INSERTS,
+			      NULL, saved_error,
+			      MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+			      (LPSTR)&winmsg, 0, NULL);
+
+		if (winmsg) {
+			fprintf(stderr,
+				"InnoDB: FormatMessage: Error number %lu means '%s'.\n",
+				(ulong) saved_error, winmsg);
+			LocalFree(winmsg);
+		}
+
+		if (strerror((int) err) != NULL) {
+			fprintf(stderr,
+				"InnoDB: Error number %lu means '%s'.\n",
+				(ulong) err, strerror((int) err));
+		}
+
+		fprintf(stderr,
+			"InnoDB: Some operating system error numbers"
+			" are described at\n"
+			"InnoDB: "
+			REFMAN "operating-system-error-codes.html\n");
+
+		os_has_said_disk_full = TRUE;
+	}
+
+	return(FALSE);
+#else
+	ssize_t	ret;
+	WAIT_ALLOW_WRITES();
+
+	ret = os_file_pwrite(file, buf, n, offset);
+
+	if ((ulint) ret == n) {
+
+		return(TRUE);
+	}
+
+	/* The detailed diagnostics are printed only once; the flag
+	os_has_said_disk_full is set at the end of this branch. */
+	if (!os_has_said_disk_full) {
+		ut_print_timestamp(stderr);
+
+		if(ret == -1) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Failure of system call pwrite(). Operating"
+				" system error number is %lu.",
+				(ulint) errno);
+		} else {
+			/* Partial write: fewer than n bytes were written. */
+			fprintf(stderr,
+				" InnoDB: Error: Write to file %s failed"
+				" at offset " UINT64PF ".\n"
+				"InnoDB: %lu bytes should have been written,"
+				" only %ld were written.\n"
+				"InnoDB: Operating system error number %lu.\n"
+				"InnoDB: Check that your OS and file system"
+				" support files of this size.\n"
+				"InnoDB: Check also that the disk is not full"
+				" or a disk quota exceeded.\n",
+				name, offset, n, (lint) ret,
+				(ulint) errno);
+		}
+
+		if (strerror(errno) != NULL) {
+			fprintf(stderr,
+				"InnoDB: Error number %d means '%s'.\n",
+				errno, strerror(errno));
+		}
+
+		fprintf(stderr,
+			"InnoDB: Some operating system error numbers"
+			" are described at\n"
+			"InnoDB: "
+			REFMAN "operating-system-error-codes.html\n");
+
+		os_has_said_disk_full = TRUE;
+	}
+
+	return(FALSE);
+#endif
+}
+
+/*******************************************************************//**
+Check the existence and type of the given file. A path for which stat fails
+with ENOENT, ENOTDIR or ENAMETOOLONG is reported as not existing, and the
+call itself still counts as successful; *type is set only when the file
+exists.
+@return TRUE if call succeeded */
+UNIV_INTERN
+ibool
+os_file_status(
+/*===========*/
+	const char*	path,	/*!< in: pathname of the file */
+	ibool*		exists,	/*!< out: TRUE if file exists */
+	os_file_type_t* type)	/*!< out: type of the file (if it exists) */
+{
+#ifdef __WIN__
+	int		ret;
+	struct _stat64	statinfo;
+
+	ret = _stat64(path, &statinfo);
+	if (ret && (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG)) {
+		/* file does not exist */
+		*exists = FALSE;
+		return(TRUE);
+	} else if (ret) {
+		/* file exists, but stat call failed */
+
+		os_file_handle_error_no_exit(path, "stat", FALSE);
+
+		return(FALSE);
+	}
+
+	if (_S_IFDIR & statinfo.st_mode) {
+		*type = OS_FILE_TYPE_DIR;
+	} else if (_S_IFREG & statinfo.st_mode) {
+		*type = OS_FILE_TYPE_FILE;
+	} else {
+		*type = OS_FILE_TYPE_UNKNOWN;
+	}
+
+	*exists = TRUE;
+
+	return(TRUE);
+#else
+	int		ret;
+	struct stat	statinfo;
+
+	ret = stat(path, &statinfo);
+	if (ret && (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG)) {
+		/* file does not exist */
+		*exists = FALSE;
+		return(TRUE);
+	} else if (ret) {
+		/* file exists, but stat call failed */
+
+		os_file_handle_error_no_exit(path, "stat", FALSE);
+
+		return(FALSE);
+	}
+
+	if (S_ISDIR(statinfo.st_mode)) {
+		*type = OS_FILE_TYPE_DIR;
+	} else if (S_ISLNK(statinfo.st_mode)) {
+		*type = OS_FILE_TYPE_LINK;
+	} else if (S_ISREG(statinfo.st_mode)) {
+		*type = OS_FILE_TYPE_FILE;
+	} else {
+		*type = OS_FILE_TYPE_UNKNOWN;
+	}
+
+	*exists = TRUE;
+
+	return(TRUE);
+#endif
+}
+
+/*******************************************************************//**
+This function returns information about the specified file
+@return DB_SUCCESS if all OK, DB_NOT_FOUND if stat fails with ENOENT or
+ENOTDIR, DB_FAIL on any other stat failure */
+UNIV_INTERN
+dberr_t
+os_file_get_status(
+/*===============*/
+	const char*	path,		/*!< in: pathname of the file */
+	os_file_stat_t* stat_info,	/*!< information of a file in a
+					directory */
+	bool		check_rw_perm)	/*!< in: for testing whether the
+					file can be opened in RW mode */
+{
+	int		ret;
+
+#ifdef __WIN__
+	struct _stat64	statinfo;
+
+	ret = _stat64(path, &statinfo);
+
+	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
+		/* file does not exist */
+
+		return(DB_NOT_FOUND);
+
+	} else if (ret) {
+		/* file exists, but stat call failed */
+
+		os_file_handle_error_no_exit(path, "stat", FALSE);
+
+		return(DB_FAIL);
+
+	} else if (_S_IFDIR & statinfo.st_mode) {
+		stat_info->type = OS_FILE_TYPE_DIR;
+	} else if (_S_IFREG & statinfo.st_mode) {
+
+		DWORD	access = GENERIC_READ;
+
+		if (!srv_read_only_mode) {
+			access |= GENERIC_WRITE;
+		}
+
+		stat_info->type = OS_FILE_TYPE_FILE;
+
+		/* Check if we can open it with the access mode computed
+		above (read-write unless srv_read_only_mode is set). */
+
+		if (check_rw_perm) {
+			HANDLE	fh;
+
+			fh = CreateFile(
+				(LPCTSTR) path,		// File to open
+				access,
+				0,			// No sharing
+				NULL,			// Default security
+				OPEN_EXISTING,		// Existing file only
+				FILE_ATTRIBUTE_NORMAL,	// Normal file
+				NULL);			// No attr. template
+
+			if (fh == INVALID_HANDLE_VALUE) {
+				stat_info->rw_perm = false;
+			} else {
+				stat_info->rw_perm = true;
+				CloseHandle(fh);
+			}
+		}
+	} else {
+		stat_info->type = OS_FILE_TYPE_UNKNOWN;
+	}
+#else
+	struct stat	statinfo;
+
+	ret = stat(path, &statinfo);
+
+	if (ret && (errno == ENOENT || errno == ENOTDIR)) {
+		/* file does not exist */
+
+		return(DB_NOT_FOUND);
+
+	} else if (ret) {
+		/* file exists, but stat call failed */
+
+		os_file_handle_error_no_exit(path, "stat", FALSE);
+
+		return(DB_FAIL);
+
+	}
+
+	switch (statinfo.st_mode & S_IFMT) {
+	case S_IFDIR:
+		stat_info->type = OS_FILE_TYPE_DIR;
+		break;
+	case S_IFLNK:
+		stat_info->type = OS_FILE_TYPE_LINK;
+		break;
+	case S_IFBLK:
+		/* Handle block device as regular file. */
+	case S_IFCHR:
+		/* Handle character device as regular file. */
+	case S_IFREG:
+		stat_info->type = OS_FILE_TYPE_FILE;
+		break;
+	default:
+		stat_info->type = OS_FILE_TYPE_UNKNOWN;
+	}
+
+
+	/* Probe the permissions by actually opening the file; rw_perm is
+	left untouched unless check_rw_perm is set and the type is FILE. */
+	if (check_rw_perm && stat_info->type == OS_FILE_TYPE_FILE) {
+
+		int	fh;
+		int	access;
+
+		access = !srv_read_only_mode ? O_RDWR : O_RDONLY;
+
-		fh = ::open(path, access, os_innodb_umask);
++		fh = ::open(path, access | O_CLOEXEC, os_innodb_umask);
+
+		if (fh == -1) {
+			stat_info->rw_perm = false;
+		} else {
+			stat_info->rw_perm = true;
+			close(fh);
+		}
+	}
+
+#endif /* __WIN__ */
+
+	/* Common to both platforms: copy the times and size from stat. */
+	stat_info->ctime = statinfo.st_ctime;
+	stat_info->atime = statinfo.st_atime;
+	stat_info->mtime = statinfo.st_mtime;
+	stat_info->size = statinfo.st_size;
+
+	return(DB_SUCCESS);
+}
+
+/* path name separator character */
+#ifdef __WIN__
+# define OS_FILE_PATH_SEPARATOR '\\'
+#else
+# define OS_FILE_PATH_SEPARATOR '/'
+#endif
+
+/****************************************************************//**
+This function returns a new path name after replacing the basename
+in an old path with a new basename. The old_path is a full path
+name including the extension. The tablename is in the normal
+form "databasename/tablename".
The new base name is found after +the forward slash. Both input strings are null terminated. + +This function allocates memory to be returned. It is the callers +responsibility to free the return value after it is no longer needed. + +@return own: new full pathname */ +UNIV_INTERN +char* +os_file_make_new_pathname( +/*======================*/ + const char* old_path, /*!< in: pathname */ + const char* tablename) /*!< in: contains new base name */ +{ + ulint dir_len; + char* last_slash; + char* base_name; + char* new_path; + ulint new_path_len; + + /* Split the tablename into its database and table name components. + They are separated by a '/'. */ + last_slash = strrchr((char*) tablename, '/'); + base_name = last_slash ? last_slash + 1 : (char*) tablename; + + /* Find the offset of the last slash. We will strip off the + old basename.ibd which starts after that slash. */ + last_slash = strrchr((char*) old_path, OS_FILE_PATH_SEPARATOR); + dir_len = last_slash ? last_slash - old_path : strlen(old_path); + + /* allocate a new path and move the old directory path to it. */ + new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd"; + new_path = static_cast<char*>(mem_alloc(new_path_len)); + memcpy(new_path, old_path, dir_len); + + ut_snprintf(new_path + dir_len, + new_path_len - dir_len, + "%c%s.ibd", + OS_FILE_PATH_SEPARATOR, + base_name); + + return(new_path); +} + +/****************************************************************//** +This function returns a remote path name by combining a data directory +path provided in a DATA DIRECTORY clause with the tablename which is +in the form 'database/tablename'. It strips the file basename (which +is the tablename) found after the last directory in the path provided. +The full filepath created will include the database name as a directory +under the path provided. The filename is the tablename with the '.ibd' +extension. All input and output strings are null-terminated. + +This function allocates memory to be returned. 
It is the callers +responsibility to free the return value after it is no longer needed. + +@return own: A full pathname; data_dir_path/databasename/tablename.ibd */ +UNIV_INTERN +char* +os_file_make_remote_pathname( +/*=========================*/ + const char* data_dir_path, /*!< in: pathname */ + const char* tablename, /*!< in: tablename */ + const char* extention) /*!< in: file extention; ibd,cfg */ +{ + ulint data_dir_len; + char* last_slash; + char* new_path; + ulint new_path_len; + + ut_ad(extention && strlen(extention) == 3); + + /* Find the offset of the last slash. We will strip off the + old basename or tablename which starts after that slash. */ + last_slash = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR); + data_dir_len = last_slash ? last_slash - data_dir_path : strlen(data_dir_path); + + /* allocate a new path and move the old directory path to it. */ + new_path_len = data_dir_len + strlen(tablename) + + sizeof "/." + strlen(extention); + new_path = static_cast<char*>(mem_alloc(new_path_len)); + memcpy(new_path, data_dir_path, data_dir_len); + ut_snprintf(new_path + data_dir_len, + new_path_len - data_dir_len, + "%c%s.%s", + OS_FILE_PATH_SEPARATOR, + tablename, + extention); + + srv_normalize_path_for_win(new_path); + + return(new_path); +} + +/****************************************************************//** +This function reduces a null-terminated full remote path name into +the path that is sent by MySQL for DATA DIRECTORY clause. It replaces +the 'databasename/tablename.ibd' found at the end of the path with just +'tablename'. + +Since the result is always smaller than the path sent in, no new memory +is allocated. The caller should allocate memory for the path sent in. +This function manipulates that path in place. + +If the path format is not as expected, just return. The result is used +to inform a SHOW CREATE TABLE command. 
*/
UNIV_INTERN
void
os_file_make_data_dir_path(
/*========================*/
	char*	data_dir_path)	/*!< in/out: full path/data_dir_path */
{
	/* Cut the string at the last period (the start of the extension). */
	char*	dot = strrchr(data_dir_path, '.');

	if (dot == NULL) {
		return;
	}

	*dot = '\0';

	/* Cut again at the last separator; the tablename follows it. */
	char*	table_sep = strrchr(data_dir_path, OS_FILE_PATH_SEPARATOR);

	if (table_sep == NULL) {
		return;
	}

	*table_sep = '\0';

	char*	tablename = table_sep + 1;

	/* The databasename starts after what is now the last separator. */
	char*	db_sep = strrchr(data_dir_path, OS_FILE_PATH_SEPARATOR);

	if (db_sep == NULL) {
		return;
	}

	/* Slide the tablename down over the databasename. The regions
	may overlap, hence a memmove rather than a memcpy. */
	char*		dest = db_sep + 1;
	const ulint	tablename_len = ut_strlen(tablename);

	ut_memmove(dest, tablename, tablename_len);

	dest[tablename_len] = '\0';
}

/****************************************************************//**
The function os_file_dirname returns a directory component of a
null-terminated pathname string.  In the usual case, dirname returns
the string up to, but not including, the final '/', and basename
is the component following the final '/'.  Trailing '/' characters
are not counted as part of the pathname.

If path does not contain a slash, dirname returns the string ".".

Concatenating the string returned by dirname, a "/", and the basename
yields a complete pathname.

The return value is a copy of the directory component of the pathname.
The copy is allocated from heap. It is the caller responsibility
to free it after it is no longer needed.

The following list of examples (taken from SUSv2) shows the strings
returned by dirname and basename for different paths:

       path	      dirname	     basename
       "/usr/lib"     "/usr"	     "lib"
       "/usr/"	      "/"	     "usr"
       "usr"	      "."	     "usr"
       "/"	      "/"	     "/"
       "."	      "."	     "."
       ".."	      "."	     ".."
+ +@return own: directory component of the pathname */ +UNIV_INTERN +char* +os_file_dirname( +/*============*/ + const char* path) /*!< in: pathname */ +{ + /* Find the offset of the last slash */ + const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR); + if (!last_slash) { + /* No slash in the path, return "." */ + + return(mem_strdup(".")); + } + + /* Ok, there is a slash */ + + if (last_slash == path) { + /* last slash is the first char of the path */ + + return(mem_strdup("/")); + } + + /* Non-trivial directory component */ + + return(mem_strdupl(path, last_slash - path)); +} + +/****************************************************************//** +Creates all missing subdirectories along the given path. +@return TRUE if call succeeded FALSE otherwise */ +UNIV_INTERN +ibool +os_file_create_subdirs_if_needed( +/*=============================*/ + const char* path) /*!< in: path name */ +{ + if (srv_read_only_mode) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "read only mode set. Can't create subdirectories '%s'", + path); + + return(FALSE); + + } + + char* subdir = os_file_dirname(path); + + if (strlen(subdir) == 1 + && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) { + /* subdir is root or cwd, nothing to do */ + mem_free(subdir); + + return(TRUE); + } + + /* Test if subdir exists */ + os_file_type_t type; + ibool subdir_exists; + ibool success = os_file_status(subdir, &subdir_exists, &type); + + if (success && !subdir_exists) { + + /* subdir does not exist, create it */ + success = os_file_create_subdirs_if_needed(subdir); + + if (!success) { + mem_free(subdir); + + return(FALSE); + } + + success = os_file_create_directory(subdir, FALSE); + } + + mem_free(subdir); + + return(success); +} + +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Returns a pointer to the nth slot in the aio array. 
+@return pointer to slot */ +static +os_aio_slot_t* +os_aio_array_get_nth_slot( +/*======================*/ + os_aio_array_t* array, /*!< in: aio array */ + ulint index) /*!< in: index of the slot */ +{ + ut_a(index < array->n_slots); + + return(&array->slots[index]); +} + +#if defined(LINUX_NATIVE_AIO) +/******************************************************************//** +Creates an io_context for native linux AIO. +@return TRUE on success. */ +static +ibool +os_aio_linux_create_io_ctx( +/*=======================*/ + ulint max_events, /*!< in: number of events. */ + io_context_t* io_ctx) /*!< out: io_ctx to initialize. */ +{ + int ret; + ulint retries = 0; + +retry: + memset(io_ctx, 0x0, sizeof(*io_ctx)); + + /* Initialize the io_ctx. Tell it how many pending + IO requests this context will handle. */ + + ret = io_setup(max_events, io_ctx); + if (ret == 0) { +#if defined(UNIV_AIO_DEBUG) + fprintf(stderr, + "InnoDB: Linux native AIO:" + " initialized io_ctx for segment\n"); +#endif + /* Success. Return now. */ + return(TRUE); + } + + /* If we hit EAGAIN we'll make a few attempts before failing. */ + + switch (ret) { + case -EAGAIN: + if (retries == 0) { + /* First time around. */ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: io_setup() failed" + " with EAGAIN. Will make %d attempts" + " before giving up.\n", + OS_AIO_IO_SETUP_RETRY_ATTEMPTS); + } + + if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) { + ++retries; + fprintf(stderr, + "InnoDB: Warning: io_setup() attempt" + " %lu failed.\n", + retries); + os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP); + goto retry; + } + + /* Have tried enough. Better call it a day. */ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: io_setup() failed" + " with EAGAIN after %d attempts.\n", + OS_AIO_IO_SETUP_RETRY_ATTEMPTS); + break; + + case -ENOSYS: + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: Linux Native AIO interface" + " is not supported on this platform. 
Please" + " check your OS documentation and install" + " appropriate binary of InnoDB.\n"); + + break; + + default: + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: Linux Native AIO setup" + " returned following error[%d]\n", -ret); + break; + } + + fprintf(stderr, + "InnoDB: You can disable Linux Native AIO by" + " setting innodb_use_native_aio = 0 in my.cnf\n"); + return(FALSE); +} + +/******************************************************************//** +Checks if the system supports native linux aio. On some kernel +versions where native aio is supported it won't work on tmpfs. In such +cases we can't use native aio as it is not possible to mix simulated +and native aio. +@return: TRUE if supported, FALSE otherwise. */ +static +ibool +os_aio_native_aio_supported(void) +/*=============================*/ +{ + int fd; + io_context_t io_ctx; + char name[1000]; + + if (!os_aio_linux_create_io_ctx(1, &io_ctx)) { + /* The platform does not support native aio. */ + return(FALSE); + } else if (!srv_read_only_mode) { + /* Now check if tmpdir supports native aio ops. */ + fd = innobase_mysql_tmpfile(NULL); + + if (fd < 0) { + ib_logf(IB_LOG_LEVEL_WARN, + "Unable to create temp file to check " + "native AIO support."); + + return(FALSE); + } + } else { + + srv_normalize_path_for_win(srv_log_group_home_dir); + + ulint dirnamelen = strlen(srv_log_group_home_dir); + ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile"); + memcpy(name, srv_log_group_home_dir, dirnamelen); + + /* Add a path separator if needed. 
*/ + if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) { + name[dirnamelen++] = SRV_PATH_SEPARATOR; + } + + strcpy(name + dirnamelen, "ib_logfile0"); + + fd = ::open(name, O_RDONLY | O_CLOEXEC); + + if (fd == -1) { + + ib_logf(IB_LOG_LEVEL_WARN, + "Unable to open \"%s\" to check " + "native AIO read support.", name); + + return(FALSE); + } + } + + struct io_event io_event; + + memset(&io_event, 0x0, sizeof(io_event)); + + byte* buf = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2)); + byte* ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE)); + + struct iocb iocb; + + /* Suppress valgrind warning. */ + memset(buf, 0x00, UNIV_PAGE_SIZE * 2); + memset(&iocb, 0x0, sizeof(iocb)); + + struct iocb* p_iocb = &iocb; + + if (!srv_read_only_mode) { + io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0); + } else { + ut_a(UNIV_PAGE_SIZE >= 512); + io_prep_pread(p_iocb, fd, ptr, 512, 0); + } + + int err = io_submit(io_ctx, 1, &p_iocb); + + if (err >= 1) { + /* Now collect the submitted IO request. */ + err = io_getevents(io_ctx, 1, 1, &io_event, NULL); + } + + ut_free(buf); + close(fd); + + switch (err) { + case 1: + return(TRUE); + + case -EINVAL: + case -ENOSYS: + ib_logf(IB_LOG_LEVEL_ERROR, + "Linux Native AIO not supported. You can either " + "move %s to a file system that supports native " + "AIO or you can set innodb_use_native_aio to " + "FALSE to avoid this message.", + srv_read_only_mode ? name : "tmpdir"); + + /* fall through. */ + default: + ib_logf(IB_LOG_LEVEL_ERROR, + "Linux Native AIO check on %s returned error[%d]", + srv_read_only_mode ? name : "tmpdir", -err); + } + + return(FALSE); +} +#endif /* LINUX_NATIVE_AIO */ + +/******************************************************************//** +Creates an aio wait array. Note that we return NULL in case of failure. +We don't care about freeing memory here because we assume that a +failure will result in server refusing to start up. 
+@return own: aio array, NULL on failure */ +static +os_aio_array_t* +os_aio_array_create( +/*================*/ + ulint n, /*!< in: maximum number of pending aio + operations allowed; n must be + divisible by n_segments */ + ulint n_segments) /*!< in: number of segments in the aio array */ +{ + os_aio_array_t* array; +#ifdef WIN_ASYNC_IO + OVERLAPPED* over; +#elif defined(LINUX_NATIVE_AIO) + struct io_event* io_event = NULL; +#endif /* WIN_ASYNC_IO */ + ut_a(n > 0); + ut_a(n_segments > 0); + + array = static_cast<os_aio_array_t*>(ut_malloc(sizeof(*array))); + memset(array, 0x0, sizeof(*array)); + + array->mutex = os_mutex_create(); + array->not_full = os_event_create(); + array->is_empty = os_event_create(); + + os_event_set(array->is_empty); + + array->n_slots = n; + array->n_segments = n_segments; + + array->slots = static_cast<os_aio_slot_t*>( + ut_malloc(n * sizeof(*array->slots))); + + memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots))); +#ifdef __WIN__ + array->handles = static_cast<HANDLE*>(ut_malloc(n * sizeof(HANDLE))); +#endif /* __WIN__ */ + +#if defined(LINUX_NATIVE_AIO) + array->aio_ctx = NULL; + array->aio_events = NULL; + + /* If we are not using native aio interface then skip this + part of initialization. */ + if (!srv_use_native_aio) { + goto skip_native_aio; + } + + /* Initialize the io_context array. One io_context + per segment in the array. */ + + array->aio_ctx = static_cast<io_context**>( + ut_malloc(n_segments * sizeof(*array->aio_ctx))); + + for (ulint i = 0; i < n_segments; ++i) { + if (!os_aio_linux_create_io_ctx(n/n_segments, + &array->aio_ctx[i])) { + /* If something bad happened during aio setup + we disable linux native aio. + The disadvantage will be a small memory leak + at shutdown but that's ok compared to a crash + or a not working server. + This frequently happens when running the test suite + with many threads on a system with low fs.aio-max-nr! 
+ */ + + fprintf(stderr, + " InnoDB: Warning: Linux Native AIO disabled " + "because os_aio_linux_create_io_ctx() " + "failed. To get rid of this warning you can " + "try increasing system " + "fs.aio-max-nr to 1048576 or larger or " + "setting innodb_use_native_aio = 0 in my.cnf\n"); + srv_use_native_aio = FALSE; + goto skip_native_aio; + } + } + + /* Initialize the event array. One event per slot. */ + io_event = static_cast<struct io_event*>( + ut_malloc(n * sizeof(*io_event))); + + memset(io_event, 0x0, sizeof(*io_event) * n); + array->aio_events = io_event; + +skip_native_aio: +#endif /* LINUX_NATIVE_AIO */ + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, i); + + slot->pos = i; + slot->reserved = FALSE; +#ifdef WIN_ASYNC_IO + slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL); + + over = &slot->control; + + over->hEvent = slot->handle; + + array->handles[i] = over->hEvent; + +#elif defined(LINUX_NATIVE_AIO) + memset(&slot->control, 0x0, sizeof(slot->control)); + slot->n_bytes = 0; + slot->ret = 0; +#endif /* WIN_ASYNC_IO */ + } + + return(array); +} + +/************************************************************************//** +Frees an aio wait array. 
*/ +static +void +os_aio_array_free( +/*==============*/ + os_aio_array_t*& array) /*!< in, own: array to free */ +{ +#ifdef WIN_ASYNC_IO + ulint i; + + for (i = 0; i < array->n_slots; i++) { + os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); + CloseHandle(slot->handle); + } +#endif /* WIN_ASYNC_IO */ + +#ifdef __WIN__ + ut_free(array->handles); +#endif /* __WIN__ */ + os_mutex_free(array->mutex); + os_event_free(array->not_full); + os_event_free(array->is_empty); + +#if defined(LINUX_NATIVE_AIO) + if (srv_use_native_aio) { + ut_free(array->aio_events); + ut_free(array->aio_ctx); + } +#endif /* LINUX_NATIVE_AIO */ + + ut_free(array->slots); + ut_free(array); + + array = 0; +} + +/*********************************************************************** +Initializes the asynchronous io system. Creates one array each for ibuf +and log i/o. Also creates one array each for read and write where each +array is divided logically into n_read_segs and n_write_segs +respectively. The caller must create an i/o handler thread for each +segment in these arrays. This function also creates the sync array. +No i/o handler thread needs to be created for that */ +UNIV_INTERN +ibool +os_aio_init( +/*========*/ + ulint n_per_seg, /*<! in: maximum number of pending aio + operations allowed per segment */ + ulint n_read_segs, /*<! in: number of reader threads */ + ulint n_write_segs, /*<! in: number of writer threads */ + ulint n_slots_sync) /*<! 
in: number of slots in the sync aio + array */ +{ + os_io_init_simple(); + +#if defined(LINUX_NATIVE_AIO) + /* Check if native aio is supported on this system and tmpfs */ + if (srv_use_native_aio && !os_aio_native_aio_supported()) { + + ib_logf(IB_LOG_LEVEL_WARN, "Linux Native AIO disabled."); + + srv_use_native_aio = FALSE; + } +#endif /* LINUX_NATIVE_AIO */ + + srv_reset_io_thread_op_info(); + + os_aio_read_array = os_aio_array_create( + n_read_segs * n_per_seg, n_read_segs); + + if (os_aio_read_array == NULL) { + return(FALSE); + } + + ulint start = (srv_read_only_mode) ? 0 : 2; + ulint n_segs = n_read_segs + start; + + /* 0 is the ibuf segment and 1 is the insert buffer segment. */ + for (ulint i = start; i < n_segs; ++i) { + ut_a(i < SRV_MAX_N_IO_THREADS); + srv_io_thread_function[i] = "read thread"; + } + + ulint n_segments = n_read_segs; + + if (!srv_read_only_mode) { + + os_aio_log_array = os_aio_array_create(n_per_seg, 1); + + if (os_aio_log_array == NULL) { + return(FALSE); + } + + ++n_segments; + + srv_io_thread_function[1] = "log thread"; + + os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1); + + if (os_aio_ibuf_array == NULL) { + return(FALSE); + } + + ++n_segments; + + srv_io_thread_function[0] = "insert buffer thread"; + + os_aio_write_array = os_aio_array_create( + n_write_segs * n_per_seg, n_write_segs); + + if (os_aio_write_array == NULL) { + return(FALSE); + } + + n_segments += n_write_segs; + + for (ulint i = start + n_read_segs; i < n_segments; ++i) { + ut_a(i < SRV_MAX_N_IO_THREADS); + srv_io_thread_function[i] = "write thread"; + } + + ut_ad(n_segments >= 4); + } else { + ut_ad(n_segments > 0); + } + + os_aio_sync_array = os_aio_array_create(n_slots_sync, 1); + + if (os_aio_sync_array == NULL) { + return(FALSE); + } + + os_aio_n_segments = n_segments; + + os_aio_validate(); + + os_last_printout = ut_time(); + + if (srv_use_native_aio) { + return(TRUE); + } + + os_aio_segment_wait_events = static_cast<os_event_t*>( + ut_malloc(n_segments 
* sizeof *os_aio_segment_wait_events)); + + for (ulint i = 0; i < n_segments; ++i) { + os_aio_segment_wait_events[i] = os_event_create(); + } + + return(TRUE); +} + +/*********************************************************************** +Frees the asynchronous io system. */ +UNIV_INTERN +void +os_aio_free(void) +/*=============*/ +{ + if (os_aio_ibuf_array != 0) { + os_aio_array_free(os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + os_aio_array_free(os_aio_log_array); + } + + if (os_aio_write_array != 0) { + os_aio_array_free(os_aio_write_array); + } + + if (os_aio_sync_array != 0) { + os_aio_array_free(os_aio_sync_array); + } + + os_aio_array_free(os_aio_read_array); + + if (!srv_use_native_aio) { + for (ulint i = 0; i < os_aio_n_segments; i++) { + os_event_free(os_aio_segment_wait_events[i]); + } + } + + ut_free(os_aio_segment_wait_events); + os_aio_segment_wait_events = 0; + os_aio_n_segments = 0; +} + +#ifdef WIN_ASYNC_IO +/************************************************************************//** +Wakes up all async i/o threads in the array in Windows async i/o at +shutdown. */ +static +void +os_aio_array_wake_win_aio_at_shutdown( +/*==================================*/ + os_aio_array_t* array) /*!< in: aio array */ +{ + ulint i; + + for (i = 0; i < array->n_slots; i++) { + + SetEvent((array->slots + i)->handle); + } +} +#endif + +/************************************************************************//** +Wakes up all async i/o threads so that they know to exit themselves in +shutdown. 
*/ +UNIV_INTERN +void +os_aio_wake_all_threads_at_shutdown(void) +/*=====================================*/ +{ +#ifdef WIN_ASYNC_IO + /* This code wakes up all ai/o threads in Windows native aio */ + os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array); + if (os_aio_write_array != 0) { + os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array); + } + + if (os_aio_ibuf_array != 0) { + os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array); + } +#elif defined(LINUX_NATIVE_AIO) + /* When using native AIO interface the io helper threads + wait on io_getevents with a timeout value of 500ms. At + each wake up these threads check the server status. + No need to do anything to wake them up. */ +#endif /* !WIN_ASYNC_AIO */ + + if (srv_use_native_aio) { + return; + } + + /* This loop wakes up all simulated ai/o threads */ + + for (ulint i = 0; i < os_aio_n_segments; i++) { + + os_event_set(os_aio_segment_wait_events[i]); + } +} + +/************************************************************************//** +Waits until there are no pending writes in os_aio_write_array. There can +be other, synchronous, pending writes. */ +UNIV_INTERN +void +os_aio_wait_until_no_pending_writes(void) +/*=====================================*/ +{ + ut_ad(!srv_read_only_mode); + os_event_wait(os_aio_write_array->is_empty); +} + +/**********************************************************************//** +Calculates segment number for a slot. 
+@return segment number (which is the number used by, for example, +i/o-handler threads) */ +static +ulint +os_aio_get_segment_no_from_slot( +/*============================*/ + os_aio_array_t* array, /*!< in: aio wait array */ + os_aio_slot_t* slot) /*!< in: slot in this array */ +{ + ulint segment; + ulint seg_len; + + if (array == os_aio_ibuf_array) { + ut_ad(!srv_read_only_mode); + + segment = IO_IBUF_SEGMENT; + + } else if (array == os_aio_log_array) { + ut_ad(!srv_read_only_mode); + + segment = IO_LOG_SEGMENT; + + } else if (array == os_aio_read_array) { + seg_len = os_aio_read_array->n_slots + / os_aio_read_array->n_segments; + + segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len; + } else { + ut_ad(!srv_read_only_mode); + ut_a(array == os_aio_write_array); + + seg_len = os_aio_write_array->n_slots + / os_aio_write_array->n_segments; + + segment = os_aio_read_array->n_segments + 2 + + slot->pos / seg_len; + } + + return(segment); +} + +/**********************************************************************//** +Calculates local segment number and aio array from global segment number. 
+@return local segment number within the aio array */ +static +ulint +os_aio_get_array_and_local_segment( +/*===============================*/ + os_aio_array_t** array, /*!< out: aio wait array */ + ulint global_segment)/*!< in: global segment number */ +{ + ulint segment; + + ut_a(global_segment < os_aio_n_segments); + + if (srv_read_only_mode) { + *array = os_aio_read_array; + + return(global_segment); + } else if (global_segment == IO_IBUF_SEGMENT) { + *array = os_aio_ibuf_array; + segment = 0; + + } else if (global_segment == IO_LOG_SEGMENT) { + *array = os_aio_log_array; + segment = 0; + + } else if (global_segment < os_aio_read_array->n_segments + 2) { + *array = os_aio_read_array; + + segment = global_segment - 2; + } else { + *array = os_aio_write_array; + + segment = global_segment - (os_aio_read_array->n_segments + 2); + } + + return(segment); +} + +/*******************************************************************//** +Requests for a slot in the aio array. If no slot is available, waits until +not_full-event becomes signaled. 
+@return pointer to slot */ +static +os_aio_slot_t* +os_aio_array_reserve_slot( +/*======================*/ + ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */ + os_aio_array_t* array, /*!< in: aio array */ + fil_node_t* message1,/*!< in: message to be passed along with + the aio operation */ + void* message2,/*!< in: message to be passed along with + the aio operation */ + pfs_os_file_t file, /*!< in: file handle */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + void* buf, /*!< in: buffer where to read or from which + to write */ + os_offset_t offset, /*!< in: file offset */ + ulint len) /*!< in: length of the block to read or write */ +{ + os_aio_slot_t* slot = NULL; +#ifdef WIN_ASYNC_IO + OVERLAPPED* control; + +#elif defined(LINUX_NATIVE_AIO) + + struct iocb* iocb; + off_t aio_offset; + +#endif /* WIN_ASYNC_IO */ + ulint i; + ulint counter; + ulint slots_per_seg; + ulint local_seg; + +#ifdef WIN_ASYNC_IO + ut_a((len & 0xFFFFFFFFUL) == len); +#endif /* WIN_ASYNC_IO */ + + /* No need of a mutex. Only reading constant fields */ + slots_per_seg = array->n_slots / array->n_segments; + + /* We attempt to keep adjacent blocks in the same local + segment. This can help in merging IO requests when we are + doing simulated AIO */ + local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6)) + % array->n_segments; + +loop: + os_mutex_enter(array->mutex); + + if (array->n_reserved == array->n_slots) { + os_mutex_exit(array->mutex); + + if (!srv_use_native_aio) { + /* If the handler threads are suspended, wake them + so that we get more slots */ + + os_aio_simulated_wake_handler_threads(); + } + + os_event_wait(array->not_full); + + goto loop; + } + + /* We start our search for an available slot from our preferred + local segment and do a full scan of the array. We are + guaranteed to find a slot in full scan. 
*/ + for (i = local_seg * slots_per_seg, counter = 0; + counter < array->n_slots; + i++, counter++) { + + i %= array->n_slots; + + slot = os_aio_array_get_nth_slot(array, i); + + if (slot->reserved == FALSE) { + goto found; + } + } + + /* We MUST always be able to get hold of a reserved slot. */ + ut_error; + +found: + ut_a(slot->reserved == FALSE); + array->n_reserved++; + + if (array->n_reserved == 1) { + os_event_reset(array->is_empty); + } + + if (array->n_reserved == array->n_slots) { + os_event_reset(array->not_full); + } + + slot->reserved = TRUE; + slot->reservation_time = ut_time(); + slot->message1 = message1; + slot->message2 = message2; + slot->file = file; + slot->name = name; + slot->len = len; + slot->type = type; + slot->buf = static_cast<byte*>(buf); + slot->offset = offset; + slot->io_already_done = FALSE; + +#ifdef WIN_ASYNC_IO + control = &slot->control; + control->Offset = (DWORD) offset & 0xFFFFFFFF; + control->OffsetHigh = (DWORD) (offset >> 32); + ResetEvent(slot->handle); + +#elif defined(LINUX_NATIVE_AIO) + + /* If we are not using native AIO skip this part. */ + if (!srv_use_native_aio) { + goto skip_native_aio; + } + + /* Check if we are dealing with 64 bit arch. + If not then make sure that offset fits in 32 bits. */ + aio_offset = (off_t) offset; + + ut_a(sizeof(aio_offset) >= sizeof(offset) + || ((os_offset_t) aio_offset) == offset); + + iocb = &slot->control; + + if (type == OS_FILE_READ) { + io_prep_pread(iocb, file.m_file, buf, len, aio_offset); + } else { + ut_a(type == OS_FILE_WRITE); + io_prep_pwrite(iocb, file.m_file, buf, len, aio_offset); + } + + iocb->data = (void*) slot; + slot->n_bytes = 0; + slot->ret = 0; + +skip_native_aio: +#endif /* LINUX_NATIVE_AIO */ + os_mutex_exit(array->mutex); + + return(slot); +} + +/*******************************************************************//** +Frees a slot in the aio array. 
*/
static
void
os_aio_array_free_slot(
/*===================*/
	os_aio_array_t*	array,	/*!< in: aio array */
	os_aio_slot_t*	slot)	/*!< in: pointer to slot */
{
	os_mutex_enter(array->mutex);

	ut_ad(slot->reserved);

	slot->reserved = FALSE;

	const ulint	still_reserved = --array->n_reserved;

	/* The array was full a moment ago: signal not_full. */
	if (still_reserved == array->n_slots - 1) {
		os_event_set(array->not_full);
	}

	/* That was the last reserved slot: signal is_empty. */
	if (still_reserved == 0) {
		os_event_set(array->is_empty);
	}

#ifdef WIN_ASYNC_IO

	ResetEvent(slot->handle);

#elif defined(LINUX_NATIVE_AIO)

	if (!srv_use_native_aio) {
		/* These fields should not be used if we are not
		using native AIO. */
		ut_ad(slot->n_bytes == 0);
		ut_ad(slot->ret == 0);
	} else {
		/* Clear the iocb and the result fields of the slot. */
		memset(&slot->control, 0x0, sizeof(slot->control));
		slot->n_bytes = 0;
		slot->ret = 0;
		/*fprintf(stderr, "Freed up Linux native slot.\n");*/
	}

#endif
	os_mutex_exit(array->mutex);
}

/**********************************************************************//**
Wakes up a simulated aio i/o-handler thread if it has something to do.
*/ +static +void +os_aio_simulated_wake_handler_thread( +/*=================================*/ + ulint global_segment) /*!< in: the number of the segment in the aio + arrays */ +{ + os_aio_array_t* array; + ulint segment; + + ut_ad(!srv_use_native_aio); + + segment = os_aio_get_array_and_local_segment(&array, global_segment); + + ulint n = array->n_slots / array->n_segments; + + segment *= n; + + /* Look through n slots after the segment * n'th slot */ + + os_mutex_enter(array->mutex); + + for (ulint i = 0; i < n; ++i) { + const os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, segment + i); + + if (slot->reserved) { + + /* Found an i/o request */ + + os_mutex_exit(array->mutex); + + os_event_t event; + + event = os_aio_segment_wait_events[global_segment]; + + os_event_set(event); + + return; + } + } + + os_mutex_exit(array->mutex); +} + +/**********************************************************************//** +Wakes up simulated aio i/o-handler threads if they have something to do. */ +UNIV_INTERN +void +os_aio_simulated_wake_handler_threads(void) +/*=======================================*/ +{ + if (srv_use_native_aio) { + /* We do not use simulated aio: do nothing */ + + return; + } + + os_aio_recommend_sleep_for_read_threads = FALSE; + + for (ulint i = 0; i < os_aio_n_segments; i++) { + os_aio_simulated_wake_handler_thread(i); + } +} + +#ifdef _WIN32 +/**********************************************************************//** +This function can be called if one wants to post a batch of reads and +prefers an i/o-handler thread to handle them all at once later. You must +call os_aio_simulated_wake_handler_threads later to ensure the threads +are not left sleeping! */ +UNIV_INTERN +void +os_aio_simulated_put_read_threads_to_sleep() +{ + +/* The idea of putting background IO threads to sleep is only for +Windows when using simulated AIO. 
Windows XP seems to schedule +background threads too eagerly to allow for coalescing during +readahead requests. */ + + os_aio_array_t* array; + + if (srv_use_native_aio) { + /* We do not use simulated aio: do nothing */ + + return; + } + + os_aio_recommend_sleep_for_read_threads = TRUE; + + for (ulint i = 0; i < os_aio_n_segments; i++) { + os_aio_get_array_and_local_segment(&array, i); + + if (array == os_aio_read_array) { + + os_event_reset(os_aio_segment_wait_events[i]); + } + } +} +#endif /* _WIN32 */ + +#if defined(LINUX_NATIVE_AIO) +/*******************************************************************//** +Dispatch an AIO request to the kernel. +@return TRUE on success. */ +static +ibool +os_aio_linux_dispatch( +/*==================*/ + os_aio_array_t* array, /*!< in: io request array. */ + os_aio_slot_t* slot) /*!< in: an already reserved slot. */ +{ + int ret; + ulint io_ctx_index; + struct iocb* iocb; + + ut_ad(slot != NULL); + ut_ad(array); + + ut_a(slot->reserved); + + /* Find out what we are going to work with. + The iocb struct is directly in the slot. + The io_context is one per segment. */ + + iocb = &slot->control; + io_ctx_index = (slot->pos * array->n_segments) / array->n_slots; + + ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb); + +#if defined(UNIV_AIO_DEBUG) + fprintf(stderr, + "io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n", + (slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot, + array->aio_ctx[io_ctx_index], (ulong) io_ctx_index); +#endif + + /* io_submit returns number of successfully + queued requests or -errno. */ + if (UNIV_UNLIKELY(ret != 1)) { + errno = -ret; + return(FALSE); + } + + return(TRUE); +} +#endif /* LINUX_NATIVE_AIO */ + + +/*******************************************************************//** +NOTE! Use the corresponding macro os_aio(), not directly this function! +Requests an asynchronous i/o operation. 
+@return TRUE if request was queued successfully, FALSE if fail */ +UNIV_INTERN +ibool +os_aio_func( +/*========*/ + ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */ + ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed + to OS_AIO_SIMULATED_WAKE_LATER: the + last flag advises this function not to wake + i/o-handler threads, but the caller will + do the waking explicitly later, in this + way the caller can post several requests in + a batch; NOTE that the batch must not be + so big that it exhausts the slots in aio + arrays! NOTE that a simulated batch + may introduce hidden chances of deadlocks, + because i/os are not actually handled until + all have been posted: use with great + caution! */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + pfs_os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read or from which + to write */ + os_offset_t offset, /*!< in: file offset where to read or write */ + ulint n, /*!< in: number of bytes to read or write */ + fil_node_t* message1,/*!< in: message for the aio handler + (can be used to identify a completed + aio operation); ignored if mode is + OS_AIO_SYNC */ + void* message2)/*!< in: message for the aio handler + (can be used to identify a completed + aio operation); ignored if mode is + OS_AIO_SYNC */ +{ + os_aio_array_t* array; + os_aio_slot_t* slot; +#ifdef WIN_ASYNC_IO + ibool retval; + BOOL ret = TRUE; + DWORD len = (DWORD) n; + struct fil_node_t* dummy_mess1; + void* dummy_mess2; + ulint dummy_type; +#endif /* WIN_ASYNC_IO */ + ulint wake_later; + ut_ad(buf); + ut_ad(n > 0); + ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0); + ut_ad(os_aio_validate_skip()); +#ifdef WIN_ASYNC_IO + ut_ad((n & 0xFFFFFFFFUL) == n); +#endif + + wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; + mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER); + + DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", + mode = OS_AIO_SYNC; 
os_has_said_disk_full = FALSE;); + + if (mode == OS_AIO_SYNC +#ifdef WIN_ASYNC_IO + && !srv_use_native_aio +#endif /* WIN_ASYNC_IO */ + ) { + ibool ret; + + /* This is actually an ordinary synchronous read or write: + no need to use an i/o-handler thread. NOTE that if we use + Windows async i/o, Windows does not allow us to use + ordinary synchronous os_file_read etc. on the same file, + therefore we have built a special mechanism for synchronous + wait in the Windows case. + Also note that the Performance Schema instrumentation has + been performed by current os_aio_func()'s wrapper function + pfs_os_aio_func(). So we would no longer need to call + Performance Schema instrumented os_file_read() and + os_file_write(). Instead, we should use os_file_read_func() + and os_file_write_func() */ + + if (type == OS_FILE_READ) { + ret = os_file_read_func(file.m_file, buf, offset, n); + } else { + + ut_ad(!srv_read_only_mode); + ut_a(type == OS_FILE_WRITE); + + ret = os_file_write_func(name, file.m_file, buf, offset, n); + + DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", + os_has_said_disk_full = FALSE; ret = 0; errno = 28;); + + if (!ret) { + os_file_handle_error_cond_exit(name, "os_file_write_func", TRUE, FALSE); + } + } + + return ret; + } + +try_again: + switch (mode) { + case OS_AIO_NORMAL: + if (type == OS_FILE_READ) { + array = os_aio_read_array; + } else { + ut_ad(!srv_read_only_mode); + array = os_aio_write_array; + } + break; + case OS_AIO_IBUF: + ut_ad(type == OS_FILE_READ); + /* Reduce probability of deadlock bugs in connection with ibuf: + do not let the ibuf i/o handler sleep */ + + wake_later = FALSE; + + if (srv_read_only_mode) { + array = os_aio_read_array; + } else { + array = os_aio_ibuf_array; + } + break; + case OS_AIO_LOG: + if (srv_read_only_mode) { + array = os_aio_read_array; + } else { + array = os_aio_log_array; + } + break; + case OS_AIO_SYNC: + array = os_aio_sync_array; +#if defined(LINUX_NATIVE_AIO) + /* In Linux native AIO we don't use sync 
IO array. */ + ut_a(!srv_use_native_aio); +#endif /* LINUX_NATIVE_AIO */ + break; + default: + ut_error; + array = NULL; /* Eliminate compiler warning */ + } + + slot = os_aio_array_reserve_slot(type, array, message1, message2, file, + name, buf, offset, n); + if (type == OS_FILE_READ) { + if (srv_use_native_aio) { + os_n_file_reads++; + os_bytes_read_since_printout += n; +#ifdef WIN_ASYNC_IO + ret = ReadFile(file.m_file, buf, (DWORD) n, &len, + &(slot->control)); +#elif defined(LINUX_NATIVE_AIO) + if (!os_aio_linux_dispatch(array, slot)) { + goto err_exit; + } +#endif /* WIN_ASYNC_IO */ + } else { + if (!wake_later) { + os_aio_simulated_wake_handler_thread( + os_aio_get_segment_no_from_slot( + array, slot)); + } + } + } else if (type == OS_FILE_WRITE) { + ut_ad(!srv_read_only_mode); + if (srv_use_native_aio) { + os_n_file_writes++; +#ifdef WIN_ASYNC_IO + ret = WriteFile(file.m_file, buf, (DWORD) n, &len, + &(slot->control)); +#elif defined(LINUX_NATIVE_AIO) + if (!os_aio_linux_dispatch(array, slot)) { + goto err_exit; + } +#endif /* WIN_ASYNC_IO */ + } else { + if (!wake_later) { + os_aio_simulated_wake_handler_thread( + os_aio_get_segment_no_from_slot( + array, slot)); + } + } + } else { + ut_error; + } + +#ifdef WIN_ASYNC_IO + if (srv_use_native_aio) { + if ((ret && len == n) + || (!ret && GetLastError() == ERROR_IO_PENDING)) { + /* aio was queued successfully! */ + + if (mode == OS_AIO_SYNC) { + /* We want a synchronous i/o operation on a + file where we also use async i/o: in Windows + we must use the same wait mechanism as for + async i/o */ + + retval = os_aio_windows_handle( + ULINT_UNDEFINED, slot->pos, + &dummy_mess1, &dummy_mess2, + &dummy_type); + + return(retval); + } + + return(TRUE); + } + + goto err_exit; + } +#endif /* WIN_ASYNC_IO */ + /* aio was queued successfully! 
*/ + return(TRUE); + +#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO +err_exit: +#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */ + os_aio_array_free_slot(array, slot); + + if (os_file_handle_error( + name,type == OS_FILE_READ ? "aio read" : "aio write")) { + + goto try_again; + } + + return(FALSE); +} + +#ifdef WIN_ASYNC_IO +/**********************************************************************//** +This function is only used in Windows asynchronous i/o. +Waits for an aio operation to complete. This function is used to wait for +the completed requests. The aio array of pending requests is divided +into segments. The thread specifies which segment or slot it wants to wait +for. NOTE: this function will also take care of freeing the aio slot, +therefore no other thread is allowed to do the freeing! +@return TRUE if the aio operation succeeded */ +UNIV_INTERN +ibool +os_aio_windows_handle( +/*==================*/ + ulint segment, /*!< in: the number of the segment in the aio + arrays to wait for; segment 0 is the ibuf + i/o thread, segment 1 the log i/o thread, + then follow the non-ibuf read threads, and as + the last are the non-ibuf write threads; if + this is ULINT_UNDEFINED, then it means that + sync aio is used, and this parameter is + ignored */ + ulint pos, /*!< this parameter is used only in sync aio: + wait for the aio slot at this position */ + fil_node_t**message1, /*!< out: the messages passed with the aio + request; note that also in the case where + the aio operation failed, these output + parameters are valid and can be used to + restart the operation, for example */ + void** message2, + ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */ +{ + ulint orig_seg = segment; + os_aio_array_t* array; + os_aio_slot_t* slot; + ulint n; + ulint i; + ibool ret_val; + BOOL ret; + DWORD len; + BOOL retry = FALSE; + + if (segment == ULINT_UNDEFINED) { + segment = 0; + array = os_aio_sync_array; + } else { + segment = os_aio_get_array_and_local_segment(&array,
segment); + } + + /* NOTE! We only access constant fields in os_aio_array. Therefore + we do not have to acquire the protecting mutex yet */ + + ut_ad(os_aio_validate_skip()); + ut_ad(segment < array->n_segments); + + n = array->n_slots / array->n_segments; + + if (array == os_aio_sync_array) { + + WaitForSingleObject( + os_aio_array_get_nth_slot(array, pos)->handle, + INFINITE); + + i = pos; + + } else { + if (orig_seg != ULINT_UNDEFINED) { + srv_set_io_thread_op_info(orig_seg, "wait Windows aio"); + } + + i = WaitForMultipleObjects( + (DWORD) n, array->handles + segment * n, + FALSE, INFINITE); + } + + os_mutex_enter(array->mutex); + + /* Shutting down with nothing reserved in the array: report success + with NULL messages; *type is left untouched on this path */ + if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS + && array->n_reserved == 0) { + *message1 = NULL; + *message2 = NULL; + os_mutex_exit(array->mutex); + return(TRUE); + } + + /* i is used below as an index into the n slots of this segment, + so it must be strictly less than WAIT_OBJECT_0 + n: the value + WAIT_OBJECT_0 + n would address a slot outside the segment */ + ut_a(i >= WAIT_OBJECT_0 && i < WAIT_OBJECT_0 + n); + + slot = os_aio_array_get_nth_slot(array, i + segment * n); + + ut_a(slot->reserved); + + if (orig_seg != ULINT_UNDEFINED) { + srv_set_io_thread_op_info( + orig_seg, "get windows aio return value"); + } + ret = GetOverlappedResult(slot->file.m_file, &(slot->control), &len, TRUE); + + *message1 = slot->message1; + *message2 = slot->message2; + + *type = slot->type; + + if (ret && len == slot->len) { + + ret_val = TRUE; + } else if (os_file_handle_error(slot->name, "Windows aio")) { + + retry = TRUE; + } else { + + ret_val = FALSE; + } + + os_mutex_exit(array->mutex); + + if (retry) { + /* retry failed read/write operation synchronously. + No need to hold array->mutex. */ + +#ifdef UNIV_PFS_IO + /* This read/write does not go through os_file_read + and os_file_write APIs, need to register with + performance schema explicitly here. */ + struct PSI_file_locker* locker = NULL; + PSI_file_locker_state state; + register_pfs_file_io_begin(&state, locker, slot->file, slot->len, + (slot->type == OS_FILE_WRITE) + ?
PSI_FILE_WRITE + : PSI_FILE_READ, + __FILE__, __LINE__); +#endif + + ut_a((slot->len & 0xFFFFFFFFUL) == slot->len); + + switch (slot->type) { + case OS_FILE_WRITE: + ret = WriteFile(slot->file.m_file, slot->buf, + (DWORD) slot->len, &len, + &(slot->control)); + break; + case OS_FILE_READ: + ret = ReadFile(slot->file.m_file, slot->buf, + (DWORD) slot->len, &len, + &(slot->control)); + break; + default: + ut_error; + } + +#ifdef UNIV_PFS_IO + register_pfs_file_io_end(locker, len); +#endif + + if (!ret && GetLastError() == ERROR_IO_PENDING) { + /* aio was queued successfully! + We want a synchronous i/o operation on a + file where we also use async i/o: in Windows + we must use the same wait mechanism as for + async i/o */ + ret = GetOverlappedResult(slot->file.m_file, + &(slot->control), + &len, TRUE); + } + + ret_val = ret && len == slot->len; + } + + os_aio_array_free_slot(array, slot); + + return(ret_val); +} +#endif + +#if defined(LINUX_NATIVE_AIO) +/******************************************************************//** +This function is only used in Linux native asynchronous i/o. This is +called from within the io-thread. If there are no completed IO requests +in the slot array, the thread calls this function to collect more +requests from the kernel. +The io-thread waits on io_getevents(), which is a blocking call, with +a timeout value. Unless the system is very heavy loaded, keeping the +io-thread very busy, the io-thread will spend most of its time waiting +in this function. +The io-thread also exits in this function. It checks server status at +each wakeup and that is why we use timed wait in io_getevents(). */ +static +void +os_aio_linux_collect( +/*=================*/ + os_aio_array_t* array, /*!< in/out: slot array. */ + ulint segment, /*!< in: local segment no. */ + ulint seg_size) /*!< in: segment size.
*/ +{ + int i; + int ret; + ulint start_pos; + ulint end_pos; + struct timespec timeout; + struct io_event* events; + struct io_context* io_ctx; + + /* sanity checks. */ + ut_ad(array != NULL); + ut_ad(seg_size > 0); + ut_ad(segment < array->n_segments); + + /* Which part of event array we are going to work on. */ + events = &array->aio_events[segment * seg_size]; + + /* Which io_context we are going to use. */ + io_ctx = array->aio_ctx[segment]; + + /* Starting point of the segment we will be working on. */ + start_pos = segment * seg_size; + + /* End point. */ + end_pos = start_pos + seg_size; + +retry: + + /* Initialize the events. The timeout value is arbitrary. + We probably need to experiment with it a little. */ + memset(events, 0, sizeof(*events) * seg_size); + timeout.tv_sec = 0; + timeout.tv_nsec = OS_AIO_REAP_TIMEOUT; + + ret = io_getevents(io_ctx, 1, seg_size, events, &timeout); + + if (ret > 0) { + for (i = 0; i < ret; i++) { + os_aio_slot_t* slot; + struct iocb* control; + + control = (struct iocb*) events[i].obj; + ut_a(control != NULL); + + slot = (os_aio_slot_t*) control->data; + + /* Some sanity checks. */ + ut_a(slot != NULL); + ut_a(slot->reserved); + +#if defined(UNIV_AIO_DEBUG) + fprintf(stderr, + "io_getevents[%c]: slot[%p] ctx[%p]" + " seg[%lu]\n", + (slot->type == OS_FILE_WRITE) ? 'w' : 'r', + slot, io_ctx, segment); +#endif + + /* We are not scribbling previous segment. */ + ut_a(slot->pos >= start_pos); + + /* We have not overstepped to next segment. */ + ut_a(slot->pos < end_pos); + + /* Mark this request as completed. The error handling + will be done in the calling function. */ + os_mutex_enter(array->mutex); + slot->n_bytes = events[i].res; + slot->ret = events[i].res2; + slot->io_already_done = TRUE; + os_mutex_exit(array->mutex); + } + return; + } + + if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) { + return; + } + + /* This error handling is for any error in collecting the + IO requests. 
The errors, if any, for any particular IO + request are simply passed on to the calling routine. */ + + switch (ret) { + case -EAGAIN: + /* Not enough resources! Try again. */ + case -EINTR: + /* Interrupted! I have tested the behaviour in case of an + interrupt. If we have some completed IOs available then + the return code will be the number of IOs. We get EINTR only + if there are no completed IOs and we have been interrupted. */ + case 0: + /* No pending request! Go back and check again. */ + goto retry; + } + + /* All other errors should cause a trap for now. */ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: unexpected ret_code[%d] from io_getevents()!\n", + ret); + ut_error; +} + +/**********************************************************************//** +This function is only used in Linux native asynchronous i/o. +Waits for an aio operation to complete. This function is used to wait for +the completed requests. The aio array of pending requests is divided +into segments. The thread specifies which segment or slot it wants to wait +for. NOTE: this function will also take care of freeing the aio slot, +therefore no other thread is allowed to do the freeing! +@return TRUE if the IO was successful */ +UNIV_INTERN +ibool +os_aio_linux_handle( +/*================*/ + ulint global_seg, /*!< in: segment number in the aio array + to wait for; segment 0 is the ibuf + i/o thread, segment 1 is log i/o thread, + then follow the non-ibuf read threads, + and the last are the non-ibuf write + threads. */ + fil_node_t**message1, /*!< out: the messages passed with the */ + void** message2, /*!< aio request; note that in case the + aio operation failed, these output + parameters are valid and can be used to + restart the operation. */ + ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */ +{ + ulint segment; + os_aio_array_t* array; + os_aio_slot_t* slot; + ulint n; + ulint i; + ibool ret = FALSE; + + /* Should never be doing Sync IO here. 
*/ + ut_a(global_seg != ULINT_UNDEFINED); + + /* Find the array and the local segment. */ + segment = os_aio_get_array_and_local_segment(&array, global_seg); + n = array->n_slots / array->n_segments; + + /* Loop until we have found a completed request. */ + for (;;) { + ibool any_reserved = FALSE; + os_mutex_enter(array->mutex); + for (i = 0; i < n; ++i) { + slot = os_aio_array_get_nth_slot( + array, i + segment * n); + if (!slot->reserved) { + continue; + } else if (slot->io_already_done) { + /* Something for us to work on. */ + goto found; + } else { + any_reserved = TRUE; + } + } + + os_mutex_exit(array->mutex); + + /* There is no completed request. + If there is no pending request at all, + and the system is being shut down, exit. */ + if (UNIV_UNLIKELY + (!any_reserved + && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) { + *message1 = NULL; + *message2 = NULL; + return(TRUE); + } + + /* Wait for some request. Note that we return + from wait iff we have found a request. */ + + srv_set_io_thread_op_info(global_seg, + "waiting for completed aio requests"); + os_aio_linux_collect(array, segment, n); + } + +found: + /* Note that it may be that there are more then one completed + IO requests. We process them one at a time. We may have a case + here to improve the performance slightly by dealing with all + requests in one sweep. */ + srv_set_io_thread_op_info(global_seg, + "processing completed aio requests"); + + /* Ensure that we are scribbling only our segment. */ + ut_a(i < n); + + ut_ad(slot != NULL); + ut_ad(slot->reserved); + ut_ad(slot->io_already_done); + + *message1 = slot->message1; + *message2 = slot->message2; + + *type = slot->type; + + if (slot->ret == 0 && slot->n_bytes == (long) slot->len) { + + ret = TRUE; + } else { + errno = -slot->ret; + + /* os_file_handle_error does tell us if we should retry + this IO. As it stands now, we don't do this retry when + reaping requests from a different context than + the dispatcher. 
This non-retry logic is the same for + windows and linux native AIO. + We should probably look into this to transparently + re-submit the IO. */ + os_file_handle_error(slot->name, "Linux aio"); + + ret = FALSE; + } + + os_mutex_exit(array->mutex); + + os_aio_array_free_slot(array, slot); + + return(ret); +} +#endif /* LINUX_NATIVE_AIO */ + +/**********************************************************************//** +Does simulated aio. This function should be called by an i/o-handler +thread. +@return TRUE if the aio operation succeeded */ +UNIV_INTERN +ibool +os_aio_simulated_handle( +/*====================*/ + ulint global_segment, /*!< in: the number of the segment in the aio + arrays to wait for; segment 0 is the ibuf + i/o thread, segment 1 the log i/o thread, + then follow the non-ibuf read threads, and as + the last are the non-ibuf write threads */ + fil_node_t**message1, /*!< out: the messages passed with the aio + request; note that also in the case where + the aio operation failed, these output + parameters are valid and can be used to + restart the operation, for example */ + void** message2, + ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */ +{ + os_aio_array_t* array; + ulint segment; + os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE]; + ulint n_consecutive; + ulint total_len; + ulint offs; + os_offset_t lowest_offset; + ulint biggest_age; + ulint age; + byte* combined_buf; + byte* combined_buf2; + ibool ret; + ibool any_reserved; + ulint n; + os_aio_slot_t* aio_slot; + + /* Fix compiler warning */ + *consecutive_ios = NULL; + + segment = os_aio_get_array_and_local_segment(&array, global_segment); + +restart: + /* NOTE! We only access constant fields in os_aio_array. 
Therefore + we do not have to acquire the protecting mutex yet */ + + srv_set_io_thread_op_info(global_segment, + "looking for i/o requests (a)"); + ut_ad(os_aio_validate_skip()); + ut_ad(segment < array->n_segments); + + n = array->n_slots / array->n_segments; + + /* Look through n slots after the segment * n'th slot */ + + if (array == os_aio_read_array + && os_aio_recommend_sleep_for_read_threads) { + + /* Give other threads chance to add several i/os to the array + at once. */ + + goto recommended_sleep; + } + + srv_set_io_thread_op_info(global_segment, + "looking for i/o requests (b)"); + + /* Check if there is a slot for which the i/o has already been + done */ + any_reserved = FALSE; + + os_mutex_enter(array->mutex); + + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, i + segment * n); + + if (!slot->reserved) { + continue; + } else if (slot->io_already_done) { + + if (os_aio_print_debug) { + fprintf(stderr, + "InnoDB: i/o for slot %lu" + " already done, returning\n", + (ulong) i); + } + + aio_slot = slot; + ret = TRUE; + goto slot_io_done; + } else { + any_reserved = TRUE; + } + } + + /* There is no completed request. + If there is no pending request at all, + and the system is being shut down, exit. */ + if (!any_reserved && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { + os_mutex_exit(array->mutex); + *message1 = NULL; + *message2 = NULL; + return(TRUE); + } + + n_consecutive = 0; + + /* If there are at least 2 seconds old requests, then pick the oldest + one to prevent starvation. If several requests have the same age, + then pick the one at the lowest offset. 
*/ + + biggest_age = 0; + lowest_offset = IB_UINT64_MAX; + + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, i + segment * n); + + if (slot->reserved) { + + age = (ulint) difftime( + ut_time(), slot->reservation_time); + + if ((age >= 2 && age > biggest_age) + || (age >= 2 && age == biggest_age + && slot->offset < lowest_offset)) { + + /* Found an i/o request */ + consecutive_ios[0] = slot; + + n_consecutive = 1; + + biggest_age = age; + lowest_offset = slot->offset; + } + } + } + + if (n_consecutive == 0) { + /* There were no old requests. Look for an i/o request at the + lowest offset in the array (we ignore the high 32 bits of the + offset in these heuristics) */ + + lowest_offset = IB_UINT64_MAX; + + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot( + array, i + segment * n); + + if (slot->reserved && slot->offset < lowest_offset) { + + /* Found an i/o request */ + consecutive_ios[0] = slot; + + n_consecutive = 1; + + lowest_offset = slot->offset; + } + } + } + + if (n_consecutive == 0) { + + /* No i/o requested at the moment */ + + goto wait_for_io; + } + + /* if n_consecutive != 0, then we have assigned + something valid to consecutive_ios[0] */ + ut_ad(n_consecutive != 0); + ut_ad(consecutive_ios[0] != NULL); + + aio_slot = consecutive_ios[0]; + + /* Check if there are several consecutive blocks to read or write */ + +consecutive_loop: + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, i + segment * n); + if (slot->reserved + && slot != aio_slot + && slot->offset == aio_slot->offset + aio_slot->len + && slot->type == aio_slot->type + && slot->file.m_file == aio_slot->file.m_file) { + + /* Found a consecutive i/o request */ + + consecutive_ios[n_consecutive] = slot; + n_consecutive++; + + aio_slot = slot; + + if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) { + + goto consecutive_loop; + } else { + break; + } + } + } + 
+ srv_set_io_thread_op_info(global_segment, "consecutive i/o requests"); + + /* We have now collected n_consecutive i/o requests in the array; + allocate a single buffer which can hold all data, and perform the + i/o */ + + total_len = 0; + aio_slot = consecutive_ios[0]; + + for (ulint i = 0; i < n_consecutive; i++) { + total_len += consecutive_ios[i]->len; + } + + if (n_consecutive == 1) { + /* We can use the buffer of the i/o request */ + combined_buf = aio_slot->buf; + combined_buf2 = NULL; + } else { + combined_buf2 = static_cast<byte*>( + ut_malloc(total_len + UNIV_PAGE_SIZE)); + + ut_a(combined_buf2); + + combined_buf = static_cast<byte*>( + ut_align(combined_buf2, UNIV_PAGE_SIZE)); + } + + /* We release the array mutex for the time of the i/o: NOTE that + this assumes that there is just one i/o-handler thread serving + a single segment of slots! */ + + os_mutex_exit(array->mutex); + + if (aio_slot->type == OS_FILE_WRITE && n_consecutive > 1) { + /* Copy the buffers to the combined buffer */ + offs = 0; + + for (ulint i = 0; i < n_consecutive; i++) { + + ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf, + consecutive_ios[i]->len); + + offs += consecutive_ios[i]->len; + } + } + + srv_set_io_thread_op_info(global_segment, "doing file i/o"); + + /* Do the i/o with ordinary, synchronous i/o functions: */ + if (aio_slot->type == OS_FILE_WRITE) { + ut_ad(!srv_read_only_mode); + ret = os_file_write( + aio_slot->name, aio_slot->file, combined_buf, + aio_slot->offset, total_len); + + DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", + os_has_said_disk_full = FALSE; + ret = 0; + errno = 28;); + + if (!ret) { + os_file_handle_error_cond_exit(aio_slot->name, "os_file_write_func", TRUE, FALSE); + } + + } else { + ret = os_file_read( + aio_slot->file, combined_buf, + aio_slot->offset, total_len); + } + + srv_set_io_thread_op_info(global_segment, "file i/o done"); + + if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) { + /* Copy the combined buffer to individual 
buffers */ + offs = 0; + + for (ulint i = 0; i < n_consecutive; i++) { + + ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs, + consecutive_ios[i]->len); + offs += consecutive_ios[i]->len; + } + } + + if (combined_buf2) { + ut_free(combined_buf2); + } + + os_mutex_enter(array->mutex); + + /* Mark the i/os done in slots */ + + for (ulint i = 0; i < n_consecutive; i++) { + consecutive_ios[i]->io_already_done = TRUE; + } + + /* We return the messages for the first slot now, and if there were + several slots, the messages will be returned with subsequent calls + of this function */ + +slot_io_done: + + ut_a(aio_slot->reserved); + + *message1 = aio_slot->message1; + *message2 = aio_slot->message2; + + *type = aio_slot->type; + + os_mutex_exit(array->mutex); + + os_aio_array_free_slot(array, aio_slot); + + return(ret); + +wait_for_io: + srv_set_io_thread_op_info(global_segment, "resetting wait event"); + + /* We wait here until there again can be i/os in the segment + of this thread */ + + os_event_reset(os_aio_segment_wait_events[global_segment]); + + os_mutex_exit(array->mutex); + +recommended_sleep: + srv_set_io_thread_op_info(global_segment, "waiting for i/o request"); + + os_event_wait(os_aio_segment_wait_events[global_segment]); + + goto restart; +} + +/**********************************************************************//** +Validates the consistency of an aio array. 
+@return true if ok */ +static +bool +os_aio_array_validate( +/*==================*/ + os_aio_array_t* array) /*!< in: aio wait array */ +{ + ulint i; + ulint n_reserved = 0; + + os_mutex_enter(array->mutex); + + ut_a(array->n_slots > 0); + ut_a(array->n_segments > 0); + + for (i = 0; i < array->n_slots; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, i); + + if (slot->reserved) { + n_reserved++; + ut_a(slot->len > 0); + } + } + + ut_a(array->n_reserved == n_reserved); + + os_mutex_exit(array->mutex); + + return(true); +} + +/**********************************************************************//** +Validates the consistency the aio system. +@return TRUE if ok */ +UNIV_INTERN +ibool +os_aio_validate(void) +/*=================*/ +{ + os_aio_array_validate(os_aio_read_array); + + if (os_aio_write_array != 0) { + os_aio_array_validate(os_aio_write_array); + } + + if (os_aio_ibuf_array != 0) { + os_aio_array_validate(os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + os_aio_array_validate(os_aio_log_array); + } + + if (os_aio_sync_array != 0) { + os_aio_array_validate(os_aio_sync_array); + } + + return(TRUE); +} + +/**********************************************************************//** +Prints pending IO requests per segment of an aio array. +We probably don't need per segment statistics but they can help us +during development phase to see if the IO requests are being +distributed as expected. 
*/ +static +void +os_aio_print_segment_info( +/*======================*/ + FILE* file, /*!< in: file where to print */ + ulint* n_seg, /*!< in: number of pending i/os per segment; + entries 0 .. array->n_segments - 1 are read */ + os_aio_array_t* array) /*!< in: array to process */ +{ + ulint i; + + ut_ad(array); + ut_ad(n_seg); + ut_ad(array->n_segments > 0); + + /* With a single segment there is no per-segment breakdown to print */ + if (array->n_segments == 1) { + return; + } + + fprintf(file, " ["); + for (i = 0; i < array->n_segments; i++) { + if (i != 0) { + fprintf(file, ", "); + } + + /* Cast to match the %lu conversion, as os_aio_print_array does */ + fprintf(file, "%lu", (ulong) n_seg[i]); + } + fprintf(file, "] "); +} + +/**********************************************************************//** +Prints info about the aio array. */ +UNIV_INTERN +void +os_aio_print_array( +/*==============*/ + FILE* file, /*!< in: file where to print */ + os_aio_array_t* array) /*!< in: aio array to print */ +{ + ulint n_reserved = 0; + ulint n_res_seg[SRV_MAX_N_IO_THREADS]; + + os_mutex_enter(array->mutex); + + ut_a(array->n_slots > 0); + ut_a(array->n_segments > 0); + + memset(n_res_seg, 0x0, sizeof(n_res_seg)); + + /* Count the reserved slots in total and per segment */ + for (ulint i = 0; i < array->n_slots; ++i) { + os_aio_slot_t* slot; + ulint seg_no; + + slot = os_aio_array_get_nth_slot(array, i); + + seg_no = (i * array->n_segments) / array->n_slots; + + if (slot->reserved) { + ++n_reserved; + ++n_res_seg[seg_no]; + + ut_a(slot->len > 0); + } + } + + ut_a(array->n_reserved == n_reserved); + + fprintf(file, " %lu", (ulong) n_reserved); + + os_aio_print_segment_info(file, n_res_seg, array); + + os_mutex_exit(array->mutex); +} + +/**********************************************************************//** +Prints info of the aio arrays.
*/ +UNIV_INTERN +void +os_aio_print( +/*=========*/ + FILE* file) /*!< in: file where to print */ +{ + time_t current_time; + double time_elapsed; + double avg_bytes_read; + + for (ulint i = 0; i < srv_n_file_io_threads; ++i) { + fprintf(file, "I/O thread %lu state: %s (%s)", + (ulong) i, + srv_io_thread_op_info[i], + srv_io_thread_function[i]); + +#ifndef _WIN32 + if (!srv_use_native_aio + && os_aio_segment_wait_events[i]->is_set) { + fprintf(file, " ev set"); + } +#endif /* _WIN32 */ + + fprintf(file, "\n"); + } + + fputs("Pending normal aio reads:", file); + + os_aio_print_array(file, os_aio_read_array); + + if (os_aio_write_array != 0) { + fputs(", aio writes:", file); + os_aio_print_array(file, os_aio_write_array); + } + + if (os_aio_ibuf_array != 0) { + fputs(",\n ibuf aio reads:", file); + os_aio_print_array(file, os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + fputs(", log i/o's:", file); + os_aio_print_array(file, os_aio_log_array); + } + + if (os_aio_sync_array != 0) { + fputs(", sync i/o's:", file); + os_aio_print_array(file, os_aio_sync_array); + } + + putc('\n', file); + current_time = ut_time(); + time_elapsed = 0.001 + difftime(current_time, os_last_printout); + + fprintf(file, + "Pending flushes (fsync) log: " ULINTPF + "; buffer pool: " ULINTPF "\n" + ULINTPF " OS file reads, " + ULINTPF " OS file writes, " + ULINTPF " OS fsyncs\n", + fil_n_pending_log_flushes, + fil_n_pending_tablespace_flushes, + os_n_file_reads, + os_n_file_writes, + os_n_fsyncs); + + const ulint n_reads = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS)); + const ulint n_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES)); + + if (n_reads != 0 || n_writes != 0) { + fprintf(file, + ULINTPF " pending reads, " ULINTPF " pending writes\n", + n_reads, n_writes); + } + + if (os_n_file_reads == os_n_file_reads_old) { + avg_bytes_read = 0.0; + } else { + avg_bytes_read = (double) os_bytes_read_since_printout + / (os_n_file_reads - os_n_file_reads_old); + } + + 
fprintf(file, + "%.2f reads/s, %lu avg bytes/read," + " %.2f writes/s, %.2f fsyncs/s\n", + (os_n_file_reads - os_n_file_reads_old) + / time_elapsed, + (ulong) avg_bytes_read, + (os_n_file_writes - os_n_file_writes_old) + / time_elapsed, + (os_n_fsyncs - os_n_fsyncs_old) + / time_elapsed); + + os_n_file_reads_old = os_n_file_reads; + os_n_file_writes_old = os_n_file_writes; + os_n_fsyncs_old = os_n_fsyncs; + os_bytes_read_since_printout = 0; + + os_last_printout = current_time; +} + +/**********************************************************************//** +Refreshes the statistics used to print per-second averages. */ +UNIV_INTERN +void +os_aio_refresh_stats(void) +/*======================*/ +{ + os_n_file_reads_old = os_n_file_reads; + os_n_file_writes_old = os_n_file_writes; + os_n_fsyncs_old = os_n_fsyncs; + os_bytes_read_since_printout = 0; + + os_last_printout = time(NULL); +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Checks that all slots in the system have been freed, that is, there are +no pending io operations. 
+@return TRUE if all free */ +UNIV_INTERN +ibool +os_aio_all_slots_free(void) +/*=======================*/ +{ + os_aio_array_t* array; + ulint n_res = 0; + + array = os_aio_read_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + if (!srv_read_only_mode) { + ut_a(os_aio_write_array == 0); + + array = os_aio_write_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + ut_a(os_aio_ibuf_array == 0); + + array = os_aio_ibuf_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + } + + ut_a(os_aio_log_array == 0); + + array = os_aio_log_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + array = os_aio_sync_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + if (n_res == 0) { + + return(TRUE); + } + + return(FALSE); +} +#endif /* UNIV_DEBUG */ + +#endif /* !UNIV_HOTBACKUP */ diff --cc storage/xtradb/handler/ha_innodb.cc index d3e3109951b,65c5ce69713..440e14e1989 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@@ -1,10 -1,8 +1,10 @@@ /***************************************************************************** -Copyright (c) 2000, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2012, Facebook Inc. - Copyright (c) 2013, 2017, MariaDB Corporation. ++Copyright (c) 2013, 2018, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@@ -1131,3238 -972,216 +1131,3242 @@@ static SHOW_VAR innodb_status_variables {NullS, NullS, SHOW_LONG} }; -/* General functions */ - -/******************************************************************//** -Returns true if the thread is the replication thread on the slave -server. Used in srv_conc_enter_innodb() to determine if the thread -should be allowed to enter InnoDB - the replication thread is treated -differently than other threads. Also used in -srv_conc_force_exit_innodb(). -@return true if thd is the replication thread */ -extern "C" UNIV_INTERN -ibool -thd_is_replication_slave_thread( -/*============================*/ - const void* thd) /*!< in: thread handle (THD*) */ -{ - return((ibool) thd_slave_thread((THD*) thd)); -} +/************************************************************************//** +Handling the shared INNOBASE_SHARE structure that is needed to provide table +locking. Register the table name if it doesn't exist in the hash table. */ +static +INNOBASE_SHARE* +get_share( +/*======*/ + const char* table_name); /*!< in: table to lookup */ -/******************************************************************//** -Save some CPU by testing the value of srv_thread_concurrency in inline -functions. */ -static inline +/************************************************************************//** +Free the shared object that was registered with get_share(). */ +static void -innodb_srv_conc_enter_innodb( -/*=========================*/ - trx_t* trx) /*!< in: transaction handle */ -{ - if (UNIV_LIKELY(!srv_thread_concurrency)) { +free_share( +/*=======*/ + INNOBASE_SHARE* share); /*!< in/own: share to free */ - return; - } +/*****************************************************************//** +Frees a possible InnoDB trx object associated with the current THD. 
+@return 0 or error number */ +static +int +innobase_close_connection( +/*======================*/ + handlerton* hton, /*!< in/out: Innodb handlerton */ + THD* thd); /*!< in: MySQL thread handle for + which to close the connection */ - srv_conc_enter_innodb(trx); -} +static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all); +static void innobase_checkpoint_request(handlerton *hton, void *cookie); -/******************************************************************//** -Save some CPU by testing the value of srv_thread_concurrency in inline -functions. */ -static inline +/*****************************************************************//** +Cancel any pending lock request associated with the current THD. */ +static void -innodb_srv_conc_exit_innodb( -/*========================*/ - trx_t* trx) /*!< in: transaction handle */ -{ - if (UNIV_LIKELY(!trx->declared_to_be_inside_innodb)) { - - return; - } +innobase_kill_connection( +/*======================*/ + handlerton* hton, /*!< in: innobase handlerton */ + THD* thd, /*!< in: handle to the MySQL thread being killed */ + thd_kill_levels); - srv_conc_exit_innodb(trx); -} +/*****************************************************************//** +Commits a transaction in an InnoDB database or marks an SQL statement +ended. +@return 0 */ +static +int +innobase_commit( +/*============*/ + handlerton* hton, /*!< in/out: Innodb handlerton */ + THD* thd, /*!< in: MySQL thread handle of the + user for whom the transaction should + be committed */ + bool commit_trx); /*!< in: true - commit transaction + false - the current SQL statement + ended */ -/******************************************************************//** -Force a thread to leave InnoDB even if it has spare tickets. 
*/ -static inline -void -innodb_srv_conc_force_exit_innodb( -/*==============================*/ - trx_t* trx) /*!< in: transaction handle */ -{ -#ifdef UNIV_SYNC_DEBUG - ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); -#endif /* UNIV_SYNC_DEBUG */ +/*****************************************************************//** +Rolls back a transaction to a savepoint. +@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the +given name */ +static +int +innobase_rollback( +/*==============*/ + handlerton* hton, /*!< in/out: Innodb handlerton */ + THD* thd, /*!< in: handle to the MySQL thread + of the user whose transaction should + be rolled back */ + bool rollback_trx); /*!< in: TRUE - rollback entire + transaction FALSE - rollback the current + statement only */ - if (trx->declared_to_be_inside_innodb) { +/*****************************************************************//** +Rolls back a transaction to a savepoint. +@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the +given name */ +static +int +innobase_rollback_to_savepoint( +/*===========================*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd, /*!< in: handle to the MySQL thread of + the user whose XA transaction should + be rolled back to savepoint */ + void* savepoint); /*!< in: savepoint data */ - srv_conc_force_exit_innodb(trx); - } -} +/*****************************************************************//** +Check whether innodb state allows to safely release MDL locks after +rollback to savepoint. +@return true if it is safe, false if its not safe. 
*/ +static +bool +innobase_rollback_to_savepoint_can_release_mdl( +/*===========================================*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd); /*!< in: handle to the MySQL thread of + the user whose XA transaction should + be rolled back to savepoint */ -/******************************************************************//** -Returns true if the transaction this thread is processing has edited -non-transactional tables. Used by the deadlock detector when deciding -which transaction to rollback in case of a deadlock - we try to avoid -rolling back transactions that have edited non-transactional tables. -@return true if non-transactional tables have been edited */ -extern "C" UNIV_INTERN -ibool -thd_has_edited_nontrans_tables( -/*===========================*/ - void* thd) /*!< in: thread handle (THD*) */ -{ - return((ibool) thd_non_transactional_update((THD*) thd)); -} +/*****************************************************************//** +Sets a transaction savepoint. +@return always 0, that is, always succeeds */ +static +int +innobase_savepoint( +/*===============*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd, /*!< in: handle to the MySQL thread of + the user's XA transaction for which + we need to take a savepoint */ + void* savepoint); /*!< in: savepoint data */ -/******************************************************************//** -Returns true if the thread is executing a SELECT statement. -@return true if thd is executing SELECT */ -extern "C" UNIV_INTERN -ibool -thd_is_select( -/*==========*/ - const void* thd) /*!< in: thread handle (THD*) */ -{ - return(thd_sql_command((const THD*) thd) == SQLCOM_SELECT); -} +/*****************************************************************//** +Release transaction savepoint name. 
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the +given name */ +static +int +innobase_release_savepoint( +/*=======================*/ + handlerton* hton, /*!< in/out: handlerton for Innodb */ + THD* thd, /*!< in: handle to the MySQL thread + of the user whose transaction's + savepoint should be released */ + void* savepoint); /*!< in: savepoint data */ -/******************************************************************//** -Returns true if the thread supports XA, -global value of innodb_supports_xa if thd is NULL. -@return true if thd has XA support */ -extern "C" UNIV_INTERN -ibool -thd_supports_xa( -/*============*/ - void* thd) /*!< in: thread handle (THD*), or NULL to query - the global innodb_supports_xa */ -{ - return(THDVAR((THD*) thd, support_xa)); -} +/************************************************************************//** +Function for constructing an InnoDB table handler instance. */ +static +handler* +innobase_create_handler( +/*====================*/ + handlerton* hton, /*!< in/out: handlerton for Innodb */ + TABLE_SHARE* table, + MEM_ROOT* mem_root); -/******************************************************************//** -Check the status of fake changes mode (innodb_fake_changes) -@return true if fake change mode is enabled. */ -extern "C" UNIV_INTERN -ibool -thd_fake_changes( -/*=============*/ - void* thd) /*!< in: thread handle, or NULL to query - the global innodb_supports_xa */ -{ - return(THDVAR((THD*) thd, fake_changes)); -} +/** @brief Initialize the default value of innodb_commit_concurrency. -/******************************************************************//** -Returns the lock wait timeout for the current connection. 
-@return the lock wait timeout, in seconds */ -extern "C" UNIV_INTERN -ulong -thd_lock_wait_timeout( -/*==================*/ - void* thd) /*!< in: thread handle (THD*), or NULL to query - the global innodb_lock_wait_timeout */ -{ - /* According to <mysql/plugin.h>, passing thd == NULL - returns the global value of the session variable. */ - return(THDVAR((THD*) thd, lock_wait_timeout)); -} +Once InnoDB is running, the innodb_commit_concurrency must not change +from zero to nonzero. (Bug #42101) -/******************************************************************//** -Set the time waited for the lock for the current query. */ -extern "C" UNIV_INTERN -void -thd_set_lock_wait_time( +The initial default value is 0, and without this extra initialization, +SET GLOBAL innodb_commit_concurrency=DEFAULT would set the parameter +to 0, even if it was initially set to nonzero at the command line +or configuration file. */ +static +void +innobase_commit_concurrency_init_default(); +/*=======================================*/ + +/** @brief Initialize the default and max value of innodb_undo_logs. + +Once InnoDB is running, the default value and the max value of +innodb_undo_logs must be equal to the available undo logs, +given by srv_available_undo_logs. */ +static +void +innobase_undo_logs_init_default_max(); +/*==================================*/ + +/************************************************************//** +Validate the file format name and return its corresponding id. +@return valid file format id */ +static +uint +innobase_file_format_name_lookup( +/*=============================*/ + const char* format_name); /*!< in: pointer to file format + name */ +/************************************************************//** +Validate the file format check config parameters, as a side effect it +sets the srv_max_file_format_at_startup variable. 
+@return the format_id if valid config value, otherwise, return -1 */ +static +int +innobase_file_format_validate_and_set( +/*==================================*/ + const char* format_max); /*!< in: parameter value */ + +/*******************************************************************//** +This function is used to prepare an X/Open XA distributed transaction. +@return 0 or error number */ +static +int +innobase_xa_prepare( +/*================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + THD* thd, /*!< in: handle to the MySQL thread of + the user whose XA transaction should + be prepared */ + bool all); /*!< in: true - prepare transaction + false - the current SQL statement + ended */ +/*******************************************************************//** +This function is used to recover X/Open XA distributed transactions. +@return number of prepared transactions stored in xid_list */ +static +int +innobase_xa_recover( +/*================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + XID* xid_list, /*!< in/out: prepared transactions */ + uint len); /*!< in: number of slots in xid_list */ +/*******************************************************************//** +This function is used to commit one X/Open XA distributed transaction +which is in the prepared state +@return 0 or error number */ +static +int +innobase_commit_by_xid( /*===================*/ - void* thd, /*!< in: thread handle (THD*) */ - ulint value) /*!< in: time waited for the lock */ + handlerton* hton, /*!< in: InnoDB handlerton */ + XID* xid); /*!< in: X/Open XA transaction + identification */ +/*******************************************************************//** +This function is used to rollback one X/Open XA distributed transaction +which is in the prepared state +@return 0 or error number */ +static +int +innobase_rollback_by_xid( +/*=====================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + XID* xid); /*!< in: X/Open XA transaction + identification */ 
+/*******************************************************************//** +Create a consistent view for a cursor based on current transaction +which is created if the corresponding MySQL thread still lacks one. +This consistent view is then used inside of MySQL when accessing records +using a cursor. +@return pointer to cursor view or NULL */ +static +void* +innobase_create_cursor_view( +/*========================*/ + handlerton* hton, /*!< in: innobase hton */ + THD* thd); /*!< in: user thread handle */ +/*******************************************************************//** +Set the given consistent cursor view to a transaction which is created +if the corresponding MySQL thread still lacks one. If the given +consistent cursor view is NULL global read view of a transaction is +restored to a transaction read view. */ +static +void +innobase_set_cursor_view( +/*=====================*/ + handlerton* hton, /*!< in: handlerton of Innodb */ + THD* thd, /*!< in: user thread handle */ + void* curview); /*!< in: Consistent cursor view to + be set */ +/*******************************************************************//** +Close the given consistent cursor view of a transaction and restore +global read view to a transaction read view. Transaction is created if the +corresponding MySQL thread still lacks one. */ +static +void +innobase_close_cursor_view( +/*=======================*/ + handlerton* hton, /*!< in: handlerton of Innodb */ + THD* thd, /*!< in: user thread handle */ + void* curview); /*!< in: Consistent read view to be + closed */ +/*****************************************************************//** +Removes all tables in the named database inside InnoDB. 
*/ +static +void +innobase_drop_database( +/*===================*/ + handlerton* hton, /*!< in: handlerton of Innodb */ + char* path); /*!< in: database path; inside InnoDB + the name of the last directory in + the path is used as the database name: + for example, in 'mysql/data/test' the + database name is 'test' */ +/** Shut down the InnoDB storage engine. +@return 0 */ +static +int +innobase_end(handlerton*, ha_panic_function); + +#if NOT_USED +/*****************************************************************//** +Stores the current binlog coordinates in the trx system header. */ +static +int +innobase_store_binlog_info( +/*=======================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + THD* thd); /*!< in: MySQL thread handle */ +#endif + +/*****************************************************************//** +Creates an InnoDB transaction struct for the thd if it does not yet have one. +Starts a new InnoDB transaction if a transaction is not yet started. And +assigns a new snapshot for a consistent read if the transaction does not yet +have one. +@return 0 */ +static +int +innobase_start_trx_and_assign_read_view( +/*====================================*/ + handlerton* hton, /* in: Innodb handlerton */ + THD* thd); /* in: MySQL thread handle of the + user for whom the transaction should + be committed */ +#ifdef NOT_USED +/*****************************************************************//** +Creates an InnoDB transaction struct for the thd if it does not yet have one. +Starts a new InnoDB transaction if a transaction is not yet started. And +clones snapshot for a consistent read from another session, if it has one. 
+@return 0 */ +static +int +innobase_start_trx_and_clone_read_view( +/*====================================*/ + handlerton* hton, /* in: Innodb handlerton */ + THD* thd, /* in: MySQL thread handle of the + user for whom the transaction should + be committed */ + THD* from_thd); /* in: MySQL thread handle of the + user session from which the consistent + read should be cloned */ +#endif +/****************************************************************//** +Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes +the logs, and the name of this function should be innobase_checkpoint. +@return TRUE if error */ +static +bool +innobase_flush_logs( +/*================*/ + handlerton* hton); /*!< in: InnoDB handlerton */ + +/************************************************************************//** +Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the +InnoDB Monitor to the client. +@return 0 on success */ +static +int +innodb_show_status( +/*===============*/ + handlerton* hton, /*!< in: the innodb handlerton */ + THD* thd, /*!< in: the MySQL query thread of + the caller */ + stat_print_fn* stat_print); +/************************************************************************//** +Return 0 on success and non-zero on failure. Note: the bool return type +seems to be abused here, should be an int. */ +static +bool +innobase_show_status( +/*=================*/ + handlerton* hton, /*!< in: the innodb handlerton */ + THD* thd, /*!< in: the MySQL query thread of + the caller */ + stat_print_fn* stat_print, + enum ha_stat_type stat_type); + +/*****************************************************************//** +Commits a transaction in an InnoDB database. */ +static +void +innobase_commit_low( +/*================*/ + trx_t* trx); /*!< in: transaction handle */ + +/****************************************************************//** +Parse and enable InnoDB monitor counters during server startup. 
+User can enable monitor counters/groups by specifying +"loose-innodb_monitor_enable = monitor_name1;monitor_name2..." +in server configuration file or at the command line. */ +static +void +innodb_enable_monitor_at_startup( +/*=============================*/ + char* str); /*!< in: monitor counter enable list */ + +/********************************************************************* +Normalizes a table name string. A normalized name consists of the +database name catenated to '/' and table name. An example: +test/mytable. On Windows normalization puts both the database name and the +table name always to lower case if "set_lower_case" is set to TRUE. */ +void +normalize_table_name_low( +/*=====================*/ + char* norm_name, /* out: normalized name as a + null-terminated string */ + const char* name, /* in: table name string */ + ibool set_lower_case); /* in: TRUE if we want to set + name to lower case */ + +#ifdef NOT_USED +/*************************************************************//** +Removes old archived transaction log files. +@return true on error */ +static bool innobase_purge_archive_logs( + handlerton *hton, /*!< in: InnoDB handlerton */ + time_t before_date, /*!< in: all files modified + before timestamp should be removed */ + const char* to_filename) /*!< in: this and earler files + should be removed */ +{ + ulint err= DB_ERROR; + if (before_date > 0) { + err= purge_archived_logs(before_date, 0); + } else if (to_filename) { + if (is_prefix(to_filename, IB_ARCHIVED_LOGS_PREFIX)) { + unsigned long long log_file_lsn = strtoll(to_filename + + IB_ARCHIVED_LOGS_PREFIX_LEN, + NULL, 10); + if (log_file_lsn > 0 && log_file_lsn < ULLONG_MAX) { + err= purge_archived_logs(0, log_file_lsn); + } + } + } + return (err != DB_SUCCESS); +} +#endif + + +/*************************************************************//** +Check for a valid value of innobase_commit_concurrency. 
+@return 0 for valid innodb_commit_concurrency */
+static
+int
+innobase_commit_concurrency_validate(
+/*=================================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+						variable */
+	void*				save,	/*!< out: immediate result
+						for update function */
+	struct st_mysql_value*		value)	/*!< in: incoming string */
+{
+	long long	intbuf;
+
+	DBUG_ENTER("innobase_commit_concurrency_validate");
+
+	if (value->val_int(value, &intbuf)) {
+		/* The value is NULL. That is invalid. */
+		DBUG_RETURN(1);
+	}
+
+	/* The converted value is stored for the update function whether
+	or not it is accepted below. */
+	const ulong	requested = static_cast<ulong>(intbuf);
+
+	*reinterpret_cast<ulong*>(save) = requested;
+
+	/* The new value is rejected (nonzero return) exactly when it would
+	switch the setting between zero and nonzero. */
+	DBUG_RETURN((requested == 0) != (innobase_commit_concurrency == 0));
+}
+
+/*******************************************************************//**
+Function for constructing an InnoDB table handler instance.
+@return a new ha_innobase object allocated from mem_root */
+static
+handler*
+innobase_create_handler(
+/*====================*/
+	handlerton*	hton,	/*!< in: InnoDB handlerton */
+	TABLE_SHARE*	table,
+	MEM_ROOT*	mem_root)
+{
+	handler*	new_handler = new (mem_root) ha_innobase(hton, table);
+
+	return(new_handler);
+}
+
+/* General functions */
+
+/*************************************************************//**
+Check that a page_size is correct for InnoDB. If correct, set the
+associated page_size_shift which is the power of 2 for this page size.
+@return an associated page_size_shift if valid, 0 if invalid.
*/
+inline
+int
+innodb_page_size_validate(
+/*======================*/
+	ulong	page_size)		/*!< in: Page Size to evaluate */
+{
+	ulong	shift = UNIV_PAGE_SIZE_SHIFT_MIN;
+
+	DBUG_ENTER("innodb_page_size_validate");
+
+	/* Try every supported shift; page_size is valid only if it is
+	exactly 2^shift for one of them. */
+	while (shift <= UNIV_PAGE_SIZE_SHIFT_MAX) {
+		if ((ulong) (1 << shift) == page_size) {
+			DBUG_RETURN(shift);
+		}
+
+		shift++;
+	}
+
+	DBUG_RETURN(0);
+}
+
+/******************************************************************//**
+Returns true if the thread is the replication thread on the slave
+server. Used in srv_conc_enter_innodb() to determine if the thread
+should be allowed to enter InnoDB - the replication thread is treated
+differently than other threads. Also used in
+srv_conc_force_exit_innodb().
+@return true if thd is the replication thread */
+UNIV_INTERN
+ibool
+thd_is_replication_slave_thread(
+/*============================*/
+	THD*	thd)	/*!< in: thread handle */
+{
+	const ibool	is_slave = (ibool) thd_slave_thread(thd);
+
+	return(is_slave);
+}
+
+/******************************************************************//**
+Gets information on the durability property requested by thread.
+Used when writing either a prepare or commit record to the log
+buffer.
+@return the durability property. */
+UNIV_INTERN
+enum durability_properties
+thd_requested_durability(
+/*=====================*/
+	const THD*	thd)	/*!< in: thread handle */
+{
+	return(thd_get_durability_property(thd));
+}
+
+/******************************************************************//**
+Returns true if transaction should be flagged as read-only.
+@return true if the thd is marked as read-only */
+UNIV_INTERN
+ibool
+thd_trx_is_read_only(
+/*=================*/
+	THD*	thd)	/*!< in: thread handle */
+{
+	/* Without a thread handle there is nothing to query. */
+	if (thd == 0) {
+		return(0);
+	}
+
+	return(thd_tx_is_read_only(thd) != 0);
+}
+
+/******************************************************************//**
+Check if the transaction is an auto-commit transaction. TRUE also
+implies that it is a SELECT (read-only) transaction.
+@return true if the transaction is an auto commit read-only transaction.
*/ +UNIV_INTERN +ibool +thd_trx_is_auto_commit( +/*===================*/ + THD* thd) /*!< in: thread handle, can be NULL */ +{ + return(thd != NULL + && !thd_test_options( + thd, + OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) + && thd_is_select(thd)); +} + +/******************************************************************//** +Save some CPU by testing the value of srv_thread_concurrency in inline +functions. */ +static inline +void +innobase_srv_conc_enter_innodb( +/*===========================*/ + trx_t* trx) /*!< in: transaction handle */ +{ +#ifdef WITH_WSREP + if (wsrep_on(trx->mysql_thd) && + wsrep_thd_is_BF(trx->mysql_thd, FALSE)) return; +#endif /* WITH_WSREP */ + if (srv_thread_concurrency) { + if (trx->n_tickets_to_enter_innodb > 0) { + + /* If trx has 'free tickets' to enter the engine left, + then use one such ticket */ + + --trx->n_tickets_to_enter_innodb; + + } else if (trx->mysql_thd != NULL + && thd_is_replication_slave_thread(trx->mysql_thd)) { + + UT_WAIT_FOR( + srv_conc_get_active_threads() + < srv_thread_concurrency, + srv_replication_delay * 1000); + + } else { + srv_conc_enter_innodb(trx); + } + } +} + +/******************************************************************//** +Note that the thread wants to leave InnoDB only if it doesn't have +any spare tickets. */ +static inline +void +innobase_srv_conc_exit_innodb( +/*==========================*/ + trx_t* trx) /*!< in: transaction handle */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); +#endif /* UNIV_SYNC_DEBUG */ +#ifdef WITH_WSREP + if (wsrep_on(trx->mysql_thd) && + wsrep_thd_is_BF(trx->mysql_thd, FALSE)) return; +#endif /* WITH_WSREP */ + + /* This is to avoid making an unnecessary function call. 
*/ + if (trx->declared_to_be_inside_innodb + && trx->n_tickets_to_enter_innodb == 0) { + + srv_conc_force_exit_innodb(trx); + } +} + +/******************************************************************//** +Force a thread to leave InnoDB even if it has spare tickets. */ +static inline +void +innobase_srv_conc_force_exit_innodb( +/*================================*/ + trx_t* trx) /*!< in: transaction handle */ +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); +#endif /* UNIV_SYNC_DEBUG */ + + /* This is to avoid making an unnecessary function call. */ + if (trx->declared_to_be_inside_innodb) { + srv_conc_force_exit_innodb(trx); + } +} + +/******************************************************************//** +Returns the NUL terminated value of glob_hostname. +@return pointer to glob_hostname. */ +UNIV_INTERN +const char* +server_get_hostname() +/*=================*/ +{ + return(glob_hostname); +} + +/******************************************************************//** +Returns true if the transaction this thread is processing has edited +non-transactional tables. Used by the deadlock detector when deciding +which transaction to rollback in case of a deadlock - we try to avoid +rolling back transactions that have edited non-transactional tables. +@return true if non-transactional tables have been edited */ +UNIV_INTERN +ibool +thd_has_edited_nontrans_tables( +/*===========================*/ + THD* thd) /*!< in: thread handle */ +{ + return((ibool) thd_non_transactional_update(thd)); +} + +/******************************************************************//** +Returns true if the thread is executing a SELECT statement. 
+@return true if thd is executing SELECT */ +UNIV_INTERN +ibool +thd_is_select( +/*==========*/ + const THD* thd) /*!< in: thread handle */ +{ + return(thd_sql_command(thd) == SQLCOM_SELECT); +} + +/******************************************************************//** +Returns true if the thread supports XA, +global value of innodb_supports_xa if thd is NULL. +@return true if thd has XA support */ +UNIV_INTERN +ibool +thd_supports_xa( +/*============*/ + THD* thd) /*!< in: thread handle, or NULL to query + the global innodb_supports_xa */ +{ + return(THDVAR(thd, support_xa)); +} + +/** Get the value of innodb_tmpdir. +@param[in] thd thread handle, or NULL to query + the global innodb_tmpdir. +@retval NULL if innodb_tmpdir="" */ +UNIV_INTERN +const char* +thd_innodb_tmpdir( + THD* thd) +{ +#ifdef UNIV_SYNC_DEBUG + ut_ad(!sync_thread_levels_nonempty_trx(false)); +#endif /* UNIV_SYNC_DEBUG */ + + const char* tmp_dir = THDVAR(thd, tmpdir); + if (tmp_dir != NULL && *tmp_dir == '\0') { + tmp_dir = NULL; + } + + return(tmp_dir); +} +/******************************************************************//** +Check the status of fake changes mode (innodb_fake_changes) +@return true if fake change mode is enabled. */ +UNIV_INTERN +ibool +thd_fake_changes( +/*=============*/ + THD* thd) /*!< in: thread handle, or NULL to query + the global innodb_supports_xa */ +{ + return(THDVAR((THD*) thd, fake_changes)); +} + +/******************************************************************//** +Returns the lock wait timeout for the current connection. +@return the lock wait timeout, in seconds */ +UNIV_INTERN +ulong +thd_lock_wait_timeout( +/*==================*/ + THD* thd) /*!< in: thread handle, or NULL to query + the global innodb_lock_wait_timeout */ +{ + /* According to <mysql/plugin.h>, passing thd == NULL + returns the global value of the session variable. 
*/ + return(THDVAR(thd, lock_wait_timeout)); +} + +/******************************************************************//** +Set the time waited for the lock for the current query. */ +UNIV_INTERN +void +thd_set_lock_wait_time( +/*===================*/ + THD* thd, /*!< in/out: thread handle */ + ulint value) /*!< in: time waited for the lock */ +{ + if (thd) { + thd_storage_lock_wait(thd, value); + } +} + +/******************************************************************//** +*/ +UNIV_INTERN +ulong +thd_flush_log_at_trx_commit( +/*================================*/ + void* thd) +{ + return(THDVAR((THD*) thd, flush_log_at_trx_commit)); +} + +/********************************************************************//** +Obtain the InnoDB transaction of a MySQL thread. +@return reference to transaction pointer */ +MY_ATTRIBUTE((warn_unused_result, nonnull)) +static inline +trx_t*& +thd_to_trx( +/*=======*/ + THD* thd) /*!< in: MySQL thread */ +{ + return(*(trx_t**) thd_ha_data(thd, innodb_hton_ptr)); +} + +#ifdef WITH_WSREP +ulonglong +thd_to_trx_id( +/*=======*/ + THD* thd) /*!< in: MySQL thread */ +{ + return(thd_to_trx(thd)->id); +} +#endif /* WITH_WSREP */ + +my_bool +ha_innobase::is_fake_change_enabled(THD* thd) +{ + trx_t* trx = thd_to_trx(thd); + return(trx && UNIV_UNLIKELY(trx->fake_changes)); +} + +/********************************************************************//** +In XtraDB it is impossible for a transaction to own a search latch outside of +InnoDB code, so there is nothing to release on demand. We keep this function to +simplify maintenance. 
+@return 0 */ +static +int +innobase_release_temporary_latches( +/*===============================*/ + handlerton* hton MY_ATTRIBUTE((unused)), /*!< in: handlerton */ + THD* thd MY_ATTRIBUTE((unused))) /*!< in: MySQL thread */ +{ +#ifdef UNIV_DEBUG + DBUG_ASSERT(hton == innodb_hton_ptr); + + if (!innodb_inited || thd == NULL) { + + return(0); + } + + trx_t* trx = thd_to_trx(thd); + + if (trx != NULL) { +#ifdef UNIV_SYNC_DEBUG + ut_ad(!btr_search_own_any()); +#endif + trx_search_latch_release_if_reserved(trx); + } +#endif + + return(0); +} + +/********************************************************************//** +Increments innobase_active_counter and every INNOBASE_WAKE_INTERVALth +time calls srv_active_wake_master_thread. This function should be used +when a single database operation may introduce a small need for +server utility activity, like checkpointing. */ +static inline +void +innobase_active_small(void) +/*=======================*/ +{ + innobase_active_counter++; + + if ((innobase_active_counter % INNOBASE_WAKE_INTERVAL) == 0) { + srv_active_wake_master_thread(); + } +} + +/********************************************************************//** +Converts an InnoDB error code to a MySQL error code and also tells to MySQL +about a possible transaction rollback inside InnoDB caused by a lock wait +timeout or a deadlock. +@return MySQL error code */ +static +int +convert_error_code_to_mysql( +/*========================*/ + dberr_t error, /*!< in: InnoDB error code */ + ulint flags, /*!< in: InnoDB table flags, or 0 */ + THD* thd) /*!< in: user thread handle or NULL */ +{ + switch (error) { + case DB_SUCCESS: + return(0); + + case DB_INTERRUPTED: + return(HA_ERR_ABORTED_BY_USER); + + case DB_FOREIGN_EXCEED_MAX_CASCADE: + ut_ad(thd); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_ROW_IS_REFERENCED, + "InnoDB: Cannot delete/update " + "rows with cascading foreign key " + "constraints that exceed max " + "depth of %d. 
Please " + "drop extra constraints and try " + "again", DICT_FK_MAX_RECURSIVE_LOAD); + + /* fall through */ + + case DB_ERROR: + default: + return(-1); /* unspecified error */ + + case DB_DUPLICATE_KEY: + /* Be cautious with returning this error, since + mysql could re-enter the storage layer to get + duplicated key info, the operation requires a + valid table handle and/or transaction information, + which might not always be available in the error + handling stage. */ + return(HA_ERR_FOUND_DUPP_KEY); + + case DB_READ_ONLY: + return(HA_ERR_TABLE_READONLY); + + case DB_FOREIGN_DUPLICATE_KEY: + return(HA_ERR_FOREIGN_DUPLICATE_KEY); + + case DB_MISSING_HISTORY: + return(HA_ERR_TABLE_DEF_CHANGED); + + case DB_RECORD_NOT_FOUND: + return(HA_ERR_NO_ACTIVE_RECORD); + + case DB_SEARCH_ABORTED_BY_USER: + return(HA_ERR_ABORTED_BY_USER); + + case DB_DEADLOCK: + /* Since we rolled back the whole transaction, we must + tell it also to MySQL so that MySQL knows to empty the + cached binlog for this transaction */ + + if (thd) { + thd_mark_transaction_to_rollback(thd, TRUE); + } + + return(HA_ERR_LOCK_DEADLOCK); + + case DB_LOCK_WAIT_TIMEOUT: + /* Starting from 5.0.13, we let MySQL just roll back the + latest SQL statement in a lock wait timeout. Previously, we + rolled back the whole transaction. 
*/ + + if (thd) { + thd_mark_transaction_to_rollback( + thd, (bool) row_rollback_on_timeout); + } + + return(HA_ERR_LOCK_WAIT_TIMEOUT); + + case DB_NO_REFERENCED_ROW: + return(HA_ERR_NO_REFERENCED_ROW); + + case DB_ROW_IS_REFERENCED: + return(HA_ERR_ROW_IS_REFERENCED); + + case DB_CANNOT_ADD_CONSTRAINT: + case DB_CHILD_NO_INDEX: + case DB_PARENT_NO_INDEX: + return(HA_ERR_CANNOT_ADD_FOREIGN); + + case DB_CANNOT_DROP_CONSTRAINT: + + return(HA_ERR_ROW_IS_REFERENCED); /* TODO: This is a bit + misleading, a new MySQL error + code should be introduced */ + + case DB_CORRUPTION: + return(HA_ERR_CRASHED); + + case DB_OUT_OF_FILE_SPACE: + return(HA_ERR_RECORD_FILE_FULL); + + case DB_TEMP_FILE_WRITE_FAILURE: + my_error(ER_GET_ERRMSG, MYF(0), + DB_TEMP_FILE_WRITE_FAILURE, + ut_strerr(DB_TEMP_FILE_WRITE_FAILURE), + "InnoDB"); + return(HA_ERR_INTERNAL_ERROR); + + case DB_TABLE_IN_FK_CHECK: + return(HA_ERR_TABLE_IN_FK_CHECK); + + case DB_TABLE_IS_BEING_USED: + return(HA_ERR_WRONG_COMMAND); + + case DB_TABLESPACE_DELETED: + case DB_TABLE_NOT_FOUND: + return(HA_ERR_NO_SUCH_TABLE); + + case DB_TABLESPACE_NOT_FOUND: + return(HA_ERR_NO_SUCH_TABLE); + + case DB_TOO_BIG_RECORD: { + /* If prefix is true then a 768-byte prefix is stored + locally for BLOB fields. Refer to dict_table_get_format() */ + bool prefix = (dict_tf_get_format(flags) == UNIV_FORMAT_A); + my_printf_error(ER_TOO_BIG_ROWSIZE, + "Row size too large (> %lu). Changing some columns " + "to TEXT or BLOB %smay help. In current row " + "format, BLOB prefix of %d bytes is stored inline.", + MYF(0), + page_get_free_space_of_empty(flags & + DICT_TF_COMPACT) / 2, + prefix ? "or using ROW_FORMAT=DYNAMIC " + "or ROW_FORMAT=COMPRESSED ": "", + prefix ? DICT_MAX_FIXED_COL_LEN : 0); + return(HA_ERR_TO_BIG_ROW); + } + + + case DB_TOO_BIG_FOR_REDO: + my_printf_error(ER_TOO_BIG_ROWSIZE, "%s" , MYF(0), + "The size of BLOB/TEXT data inserted" + " in one transaction is greater than" + " 10% of redo log size. 
Increase the" + " redo log size using innodb_log_file_size."); + return(HA_ERR_TO_BIG_ROW); + + case DB_TOO_BIG_INDEX_COL: + my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), + DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags)); + return(HA_ERR_INDEX_COL_TOO_LONG); + + case DB_NO_SAVEPOINT: + return(HA_ERR_NO_SAVEPOINT); + + case DB_LOCK_TABLE_FULL: + /* Since we rolled back the whole transaction, we must + tell it also to MySQL so that MySQL knows to empty the + cached binlog for this transaction */ + + if (thd) { + thd_mark_transaction_to_rollback(thd, TRUE); + } + + return(HA_ERR_LOCK_TABLE_FULL); + + case DB_FTS_INVALID_DOCID: + return(HA_FTS_INVALID_DOCID); + case DB_FTS_EXCEED_RESULT_CACHE_LIMIT: + return(HA_ERR_OUT_OF_MEM); + case DB_TOO_MANY_CONCURRENT_TRXS: + return(HA_ERR_TOO_MANY_CONCURRENT_TRXS); + case DB_UNSUPPORTED: + return(HA_ERR_UNSUPPORTED); + case DB_INDEX_CORRUPT: + return(HA_ERR_INDEX_CORRUPT); + case DB_UNDO_RECORD_TOO_BIG: + return(HA_ERR_UNDO_REC_TOO_BIG); + case DB_OUT_OF_MEMORY: + return(HA_ERR_OUT_OF_MEM); + case DB_TABLESPACE_EXISTS: + return(HA_ERR_TABLESPACE_EXISTS); + case DB_IDENTIFIER_TOO_LONG: + return(HA_ERR_INTERNAL_ERROR); + case DB_FTS_TOO_MANY_WORDS_IN_PHRASE: + return(HA_ERR_FTS_TOO_MANY_WORDS_IN_PHRASE); + } +} + +/*************************************************************//** +Prints info of a THD object (== user session thread) to the given file. */ +UNIV_INTERN +void +innobase_mysql_print_thd( +/*=====================*/ + FILE* f, /*!< in: output stream */ + THD* thd, /*!< in: MySQL THD object */ + uint max_query_len) /*!< in: max query length to print, or 0 to + use the default max length */ +{ + char buffer[1024]; + + fputs(thd_get_error_context_description((THD*) thd, + buffer, sizeof buffer, + max_query_len), f); + putc('\n', f); +} + +/******************************************************************//** +Get the error message format string. +@return the format string or 0 if not found. 
*/ +UNIV_INTERN +const char* +innobase_get_err_msg( +/*=================*/ + int error_code) /*!< in: MySQL error code */ +{ + return(my_get_err_msg(error_code)); +} + +/******************************************************************//** +Get the variable length bounds of the given character set. */ +UNIV_INTERN +void +innobase_get_cset_width( +/*====================*/ + ulint cset, /*!< in: MySQL charset-collation code */ + ulint* mbminlen, /*!< out: minimum length of a char (in bytes) */ + ulint* mbmaxlen) /*!< out: maximum length of a char (in bytes) */ +{ + CHARSET_INFO* cs; + ut_ad(cset <= MAX_CHAR_COLL_NUM); + ut_ad(mbminlen); + ut_ad(mbmaxlen); + + cs = all_charsets[cset]; + if (cs) { + *mbminlen = cs->mbminlen; + *mbmaxlen = cs->mbmaxlen; + ut_ad(*mbminlen < DATA_MBMAX); + ut_ad(*mbmaxlen < DATA_MBMAX); + } else { + THD* thd = current_thd; + + if (thd && thd_sql_command(thd) == SQLCOM_DROP_TABLE) { + + /* Fix bug#46256: allow tables to be dropped if the + collation is not found, but issue a warning. */ + if ((global_system_variables.log_warnings) + && (cset != 0)){ + + sql_print_warning( + "Unknown collation #%lu.", cset); + } + } else { + + ut_a(cset == 0); + } + + *mbminlen = *mbmaxlen = 0; + } +} + +/******************************************************************//** +Converts an identifier to a table name. */ +UNIV_INTERN +void +innobase_convert_from_table_id( +/*===========================*/ + struct charset_info_st* cs, /*!< in: the 'from' character set */ + char* to, /*!< out: converted identifier */ + const char* from, /*!< in: identifier to convert */ + ulint len) /*!< in: length of 'to', in bytes */ +{ + uint errors; + + strconvert(cs, from, strlen(from), &my_charset_filename, to, (uint) len, &errors); +} + +/********************************************************************** +Check if the length of the identifier exceeds the maximum allowed. +return true when length of identifier is too long. 
*/ +UNIV_INTERN +my_bool +innobase_check_identifier_length( +/*=============================*/ + const char* id) /* in: FK identifier to check excluding the + database portion. */ +{ + int well_formed_error = 0; + CHARSET_INFO *cs = system_charset_info; + DBUG_ENTER("innobase_check_identifier_length"); + + size_t len = cs->cset->well_formed_len( + cs, id, id + strlen(id), + NAME_CHAR_LEN, &well_formed_error); + + if (well_formed_error || len == NAME_CHAR_LEN) { + my_error(ER_TOO_LONG_IDENT, MYF(0), id); + DBUG_RETURN(true); + } + DBUG_RETURN(false); +} + +/******************************************************************//** +Converts an identifier to UTF-8. */ +UNIV_INTERN +void +innobase_convert_from_id( +/*=====================*/ + struct charset_info_st* cs, /*!< in: the 'from' character set */ + char* to, /*!< out: converted identifier */ + const char* from, /*!< in: identifier to convert */ + ulint len) /*!< in: length of 'to', in bytes */ +{ + uint errors; + + strconvert(cs, from, strlen(from), system_charset_info, to, (uint) len, &errors); +} + +/******************************************************************//** +Compares NUL-terminated UTF-8 strings case insensitively. +A NULL pointer compares equal to another NULL pointer and less than +any non-NULL string. +@return 0 if a=b, <0 if a<b, >0 if a>b */ +UNIV_INTERN +int +innobase_strcasecmp( +/*================*/ + const char* a, /*!< in: first string to compare */ + const char* b) /*!< in: second string to compare */ +{ + if (!a) { + if (!b) { + return(0); + } else { + return(-1); + } + } else if (!b) { + return(1); + } + + return(my_strcasecmp(system_charset_info, a, b)); +} + +/******************************************************************//** +Compares NUL-terminated UTF-8 strings case insensitively. The +second string contains wildcards.
+@return 0 if a match is found, 1 if not */ +UNIV_INTERN +int +innobase_wildcasecmp( +/*=================*/ + const char* a, /*!< in: string to compare */ + const char* b) /*!< in: wildcard string to compare */ +{ + return(wild_case_compare(system_charset_info, a, b)); +} + +/******************************************************************//** +Strip dir name from a full path name and return only the file name +@return file name or "null" if no file name */ +UNIV_INTERN +const char* +innobase_basename( +/*==============*/ + const char* path_name) /*!< in: full path name */ +{ + const char* name = base_name(path_name); + + return((name) ? name : "null"); +} + +/******************************************************************//** +Makes all characters in a NUL-terminated UTF-8 string lower case. */ +UNIV_INTERN +void +innobase_casedn_str( +/*================*/ + char* a) /*!< in/out: string to put in lower case */ +{ + my_casedn_str(system_charset_info, a); +} + +/**********************************************************************//** +Determines the connection character set. +@return connection character set */ +UNIV_INTERN +struct charset_info_st* +innobase_get_charset( +/*=================*/ + THD* mysql_thd) /*!< in: MySQL thread handle */ +{ + return(thd_charset(mysql_thd)); +} + +/**********************************************************************//** +Determines the current SQL statement. +@return SQL statement string */ +UNIV_INTERN +const char* +innobase_get_stmt( +/*==============*/ + THD* thd, /*!< in: MySQL thread handle */ + size_t* length) /*!< out: length of the SQL statement */ +{ + if (const LEX_STRING *stmt = thd_query_string(thd)) { + *length = stmt->length; + return stmt->str; + } + return NULL; +} + +/**********************************************************************//** +Get the current setting of the table_def_size global parameter. 
We do +a dirty read because for one there is no synchronization object and +secondly there is little harm in doing so even if we get a torn read. +@return value of table_def_size */ +UNIV_INTERN +ulint +innobase_get_table_cache_size(void) +/*===============================*/ +{ + return(tdc_size); +} + +/**********************************************************************//** +Get the current setting of the lower_case_table_names global parameter from +mysqld.cc. We do a dirty read because for one there is no synchronization +object and secondly there is little harm in doing so even if we get a torn +read. +@return value of lower_case_table_names */ +UNIV_INTERN +ulint +innobase_get_lower_case_table_names(void) +/*=====================================*/ +{ + return(lower_case_table_names); +} + +/** Create a temporary file in the location specified by the parameter +path. If the path is null, then it will be created in tmpdir. +@param[in] path location for creating temporary file +@return temporary file descriptor, or < 0 on error */ +UNIV_INTERN +int +innobase_mysql_tmpfile( + const char* path) +{ +#ifdef WITH_INNODB_DISALLOW_WRITES + os_event_wait(srv_allow_writes_event); +#endif /* WITH_INNODB_DISALLOW_WRITES */ + int fd2 = -1; + File fd; + + DBUG_EXECUTE_IF( + "innobase_tmpfile_creation_failure", + return(-1); + ); + + if (path == NULL) { + fd = mysql_tmpfile("ib"); + } else { + fd = mysql_tmpfile_path(path, "ib"); + } + + if (fd >= 0) { + /* Copy the file descriptor, so that the additional resources + allocated by create_temp_file() can be freed by invoking + my_close(). + + Because the file descriptor returned by this function + will be passed to fdopen(), it will be closed by invoking + fclose(), which in turn will invoke close() instead of + my_close(). */ + +#ifdef _WIN32 + /* Note that on Windows, the integer returned by mysql_tmpfile + has no relation to C runtime file descriptor. 
Here, we need + to call my_get_osfhandle to get the HANDLE and then convert it + to C runtime filedescriptor. */ + { + HANDLE hFile = my_get_osfhandle(fd); + HANDLE hDup; + BOOL bOK = DuplicateHandle( + GetCurrentProcess(), + hFile, GetCurrentProcess(), + &hDup, 0, FALSE, DUPLICATE_SAME_ACCESS); + if (bOK) { + fd2 = _open_osfhandle((intptr_t) hDup, 0); + } else { + my_osmaperr(GetLastError()); + fd2 = -1; + } + } ++#else ++#ifdef F_DUPFD_CLOEXEC ++ fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 0); +#else + fd2 = dup(fd); ++#endif +#endif + if (fd2 < 0) { + DBUG_PRINT("error",("Got error %d on dup",fd2)); + my_errno=errno; + my_error(EE_OUT_OF_FILERESOURCES, + MYF(ME_BELL+ME_WAITTANG), + "ib*", my_errno); + } + my_close(fd, MYF(MY_WME)); + } + return(fd2); +} + +/*********************************************************************//** +Wrapper around MySQL's copy_and_convert function. +@return number of bytes copied to 'to' */ +UNIV_INTERN +ulint +innobase_convert_string( +/*====================*/ + void* to, /*!< out: converted string */ + ulint to_length, /*!< in: number of bytes reserved + for the converted string */ + CHARSET_INFO* to_cs, /*!< in: character set to convert to */ + const void* from, /*!< in: string to convert */ + ulint from_length, /*!< in: number of bytes to convert */ + CHARSET_INFO* from_cs, /*!< in: character set to convert + from */ + uint* errors) /*!< out: number of errors encountered + during the conversion */ +{ + return(copy_and_convert( + (char*) to, (uint32) to_length, to_cs, + (const char*) from, (uint32) from_length, from_cs, + errors)); +} + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes +the result to "buf". The result is converted to "system_charset_info". +Not more than "buf_size" bytes are written to "buf". 
+The result is always NUL-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). +@return number of bytes that were written */ +UNIV_INTERN +ulint +innobase_raw_format( +/*================*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + ulint charset_coll, /*!< in: charset collation */ + char* buf, /*!< out: output buffer */ + ulint buf_size) /*!< in: output buffer size + in bytes */ +{ + /* XXX we use a hard limit instead of allocating + buf_size bytes from the heap */ + CHARSET_INFO* data_cs; + char buf_tmp[8192]; + ulint buf_tmp_used; + uint num_errors; + + /* NOTE(review): data_cs is not checked for NULL before use; + presumably charset_coll always names a loaded collation - confirm. */ + data_cs = all_charsets[charset_coll]; + + buf_tmp_used = innobase_convert_string(buf_tmp, sizeof(buf_tmp), + system_charset_info, + data, data_len, data_cs, + &num_errors); + + return(ut_str_sql_format(buf_tmp, buf_tmp_used, buf, buf_size)); +} + +/*********************************************************************//** +Compute the next autoinc value. + +For MySQL replication the autoincrement values can be partitioned among +the nodes. The offset is the start or origin of the autoincrement value +for a particular node. For n nodes the increment will be n and the offset +will be in the interval [1, n]. The formula tries to allocate the next +value for a particular node. + +Note: This function is also called with increment set to the number of +values we want to reserve for multi-value inserts e.g., + + INSERT INTO T VALUES(), (), (); + +innobase_next_autoinc() will be called with increment set to 3 where +autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for +the multi-value INSERT above.
+@return the next value */ +UNIV_INTERN +ulonglong +innobase_next_autoinc( +/*==================*/ + ulonglong current, /*!< in: Current value */ + ulonglong need, /*!< in: count of values needed */ + ulonglong step, /*!< in: AUTOINC increment step */ + ulonglong offset, /*!< in: AUTOINC offset */ + ulonglong max_value) /*!< in: max value for type */ +{ + ulonglong next_value; + ulonglong block = need * step; + + /* Should never be 0. */ + ut_a(need > 0); + ut_a(block > 0); + ut_a(max_value > 0); + + /* + Allow auto_increment to go over max_value up to max ulonglong. + This allows us to detect that all values are exhausted. + If we don't do this, we will return max_value several times + and get duplicate key errors instead of auto increment value + out of range. + */ + max_value= (~(ulonglong) 0); + + /* According to MySQL documentation, if the offset is greater than + the step then the offset is ignored. + NOTE(review): the test below compares offset with block + (need * step), not with step alone - confirm this is intended. */ + if (offset > block) { + offset = 0; + } + + /* Check for overflow. Current can be > max_value if the value is + in reality a negative value. The Visual Studio compilers convert + large double values automatically into the unsigned long long + datatype maximum value. */ + + if (block >= max_value + || offset > max_value + || current >= max_value + || max_value - offset <= offset) { + + next_value = max_value; + } else { + ut_a(max_value > current); + + ulonglong free = max_value - current; + + if (free < offset || free - offset <= block) { + next_value = max_value; + } else { + next_value = 0; + } + } + + if (next_value == 0) { + ulonglong next; + + if (current >= offset) { + next = (current - offset) / step; + } else { + next = 0; + block -= step; + } + + ut_a(max_value > next); + next_value = next * step; + /* Check for multiplication overflow.
*/ + ut_a(next_value >= next); + ut_a(max_value > next_value); + + /* Check for overflow */ + if (max_value - next_value >= block) { + + next_value += block; + + if (max_value - next_value >= offset) { + next_value += offset; + } else { + next_value = max_value; + } + } else { + next_value = max_value; + } + } + + ut_a(next_value != 0); + ut_a(next_value <= max_value); + + return(next_value); +} + +/*********************************************************************//** +Initializes some fields in an InnoDB transaction object. */ +static +void +innobase_trx_init( +/*==============*/ + THD* thd, /*!< in: user thread handle */ + trx_t* trx) /*!< in/out: InnoDB transaction handle */ +{ + DBUG_ENTER("innobase_trx_init"); + DBUG_ASSERT(thd == trx->mysql_thd); + + trx->check_foreigns = !thd_test_options( + thd, OPTION_NO_FOREIGN_KEY_CHECKS); + + trx->check_unique_secondary = !thd_test_options( + thd, OPTION_RELAXED_UNIQUE_CHECKS); + + /* A transaction caches the fake_changes state when it starts and + uses it for its complete lifetime. + Some APIs do not need an active transaction object but merely use + the transaction object as a cache object/data carrier. + Before the transaction object is used by such APIs, refresh the + state of fake_changes. */ + if (trx->state == TRX_STATE_NOT_STARTED) { + trx->fake_changes = thd_fake_changes(thd); + } + +#ifdef EXTENDED_SLOWLOG + if (thd_log_slow_verbosity(thd) & (1ULL << SLOG_V_INNODB)) { + trx->take_stats = TRUE; + } else { + trx->take_stats = FALSE; + } +#else + trx->take_stats = FALSE; +#endif + + DBUG_VOID_RETURN; +} + +/*********************************************************************//** +Allocates an InnoDB transaction for a MySQL handler object for DML.
+@return InnoDB transaction handle */ +UNIV_INTERN +trx_t* +innobase_trx_allocate( +/*==================*/ + THD* thd) /*!< in: user thread handle */ +{ + trx_t* trx; + + DBUG_ENTER("innobase_trx_allocate"); + DBUG_ASSERT(thd != NULL); + DBUG_ASSERT(EQ_CURRENT_THD(thd)); + + trx = trx_allocate_for_mysql(); + + trx->mysql_thd = thd; + + innobase_trx_init(thd, trx); + + DBUG_RETURN(trx); +} + +/*********************************************************************//** +Gets the InnoDB transaction handle for a MySQL handler object, creates +an InnoDB transaction struct if the corresponding MySQL thread struct still +lacks one. +@return InnoDB transaction handle */ +static inline +trx_t* +check_trx_exists( +/*=============*/ + THD* thd) /*!< in: user thread handle */ +{ + trx_t*& trx = thd_to_trx(thd); + + if (trx == NULL) { + trx = innobase_trx_allocate(thd); + thd_set_ha_data(thd, innodb_hton_ptr, trx); + } else if (UNIV_UNLIKELY(trx->magic_n != TRX_MAGIC_N)) { + mem_analyze_corruption(trx); + ut_error; + } + + innobase_trx_init(thd, trx); + + return(trx); +} + +/************************************************************************* +Gets current trx. +@return the InnoDB transaction attached to current_thd, or NULL when +there is no current thread */ +trx_t* +innobase_get_trx() +{ + THD *thd=current_thd; + if (likely(thd != 0)) { + trx_t*& trx = thd_to_trx(thd); + return(trx); + } else { + return(NULL); + } +} + +/** Reports the slow-log setting. +@return thd_opt_slow_log() cast to ibool when EXTENDED_SLOWLOG is defined, +FALSE otherwise */ +ibool +innobase_get_slow_log() +{ +#ifdef EXTENDED_SLOWLOG + return((ibool) thd_opt_slow_log()); +#else + return(FALSE); +#endif +} + +/*********************************************************************//** +Check whether a transaction has been registered with the MySQL 2PC +coordinator. +@return true if transaction is registered with MySQL 2PC coordinator */ +static inline +bool +trx_is_registered_for_2pc( +/*=========================*/ + const trx_t* trx) /* in: transaction */ +{ + return(trx->is_registered == 1); +} + +/*********************************************************************//** +Note that innobase_commit_ordered() was run.
*/ +static inline +void +trx_set_active_commit_ordered( +/*==============================*/ + trx_t* trx) /* in: transaction */ +{ + ut_a(trx_is_registered_for_2pc(trx)); + trx->active_commit_ordered = 1; +} + +/*********************************************************************//** +Note that a transaction has been registered with MySQL 2PC coordinator. */ +static inline +void +trx_register_for_2pc( +/*==================*/ + trx_t* trx) /* in: transaction */ +{ + trx->is_registered = 1; + ut_ad(trx->active_commit_ordered == 0); +} + +/*********************************************************************//** +Note that a transaction has been deregistered. */ +static inline +void +trx_deregister_from_2pc( +/*====================*/ + trx_t* trx) /* in: transaction */ +{ + trx->is_registered = 0; + trx->active_commit_ordered = 0; +} + +/*********************************************************************//** +Check whether a transaction has active_commit_ordered set +@return true if active_commit_ordered is set */ +static inline +bool +trx_is_active_commit_ordered( +/*=========================*/ + const trx_t* trx) /* in: transaction */ +{ + return(trx->active_commit_ordered == 1); +} + +/*********************************************************************//** +Check if transaction is started. +@return true if the transaction state is anything other than +TRX_STATE_NOT_STARTED */ +static +bool +trx_is_started( +/*===========*/ + trx_t* trx) /* in: transaction */ +{ + return(trx->state != TRX_STATE_NOT_STARTED); +} + +/****************************************************************//** +Update log_checksum_algorithm_ptr with a pointer to the function corresponding +to a given checksum algorithm.
*/ +static +void +innodb_log_checksum_func_update( +/*============================*/ + ulint algorithm) /*!< in: algorithm */ +{ + switch (algorithm) { + case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB: + case SRV_CHECKSUM_ALGORITHM_INNODB: + log_checksum_algorithm_ptr=log_block_calc_checksum_innodb; + break; + case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: + case SRV_CHECKSUM_ALGORITHM_CRC32: + log_checksum_algorithm_ptr=log_block_calc_checksum_crc32; + break; + case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: + case SRV_CHECKSUM_ALGORITHM_NONE: + log_checksum_algorithm_ptr=log_block_calc_checksum_none; + break; + default: + ut_a(0); + } +} + +/****************************************************************//** +On update hook for the innodb_log_checksum_algorithm variable. */ +static +void +innodb_log_checksum_algorithm_update( +/*=================================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + srv_checksum_algorithm_t algorithm; + + algorithm = (srv_checksum_algorithm_t) + (*static_cast<const ulong*>(save)); + + /* Make sure we are the only log user */ + mutex_enter(&log_sys->mutex); + + innodb_log_checksum_func_update(algorithm); + + srv_log_checksum_algorithm = algorithm; + + mutex_exit(&log_sys->mutex); +} + +/*********************************************************************//** +Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object. +Those flags are stored in .frm file and end up in the MySQL table object, +but are frequently used inside InnoDB so we keep their copies into the +InnoDB table object. 
*/ +UNIV_INTERN +void +innobase_copy_frm_flags_from_create_info( +/*=====================================*/ + dict_table_t* innodb_table, /*!< in/out: InnoDB table */ + const HA_CREATE_INFO* create_info) /*!< in: create info */ +{ + ibool ps_on; + ibool ps_off; + + if (dict_table_is_temporary(innodb_table)) { + /* Temp tables do not use persistent stats. */ + ps_on = FALSE; + ps_off = TRUE; + } else { + ps_on = create_info->table_options + & HA_OPTION_STATS_PERSISTENT; + ps_off = create_info->table_options + & HA_OPTION_NO_STATS_PERSISTENT; + } + + dict_stats_set_persistent(innodb_table, ps_on, ps_off); + + dict_stats_auto_recalc_set( + innodb_table, + create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON, + create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF); + + innodb_table->stats_sample_pages = create_info->stats_sample_pages; +} + +/*********************************************************************//** +Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object. +Those flags are stored in .frm file and end up in the MySQL table object, +but are frequently used inside InnoDB so we keep their copies into the +InnoDB table object. 
*/ +UNIV_INTERN +void +innobase_copy_frm_flags_from_table_share( +/*=====================================*/ + dict_table_t* innodb_table, /*!< in/out: InnoDB table */ + const TABLE_SHARE* table_share) /*!< in: table share */ +{ + ibool ps_on; + ibool ps_off; + + if (dict_table_is_temporary(innodb_table)) { + /* Temp tables do not use persistent stats */ + ps_on = FALSE; + ps_off = TRUE; + } else { + ps_on = table_share->db_create_options + & HA_OPTION_STATS_PERSISTENT; + ps_off = table_share->db_create_options + & HA_OPTION_NO_STATS_PERSISTENT; + } + + dict_stats_set_persistent(innodb_table, ps_on, ps_off); + + dict_stats_auto_recalc_set( + innodb_table, + table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON, + table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF); + + innodb_table->stats_sample_pages = table_share->stats_sample_pages; +} + +/*********************************************************************//** +Construct ha_innobase handler. */ +UNIV_INTERN +ha_innobase::ha_innobase( +/*=====================*/ + handlerton* hton, + TABLE_SHARE* table_arg) + :handler(hton, table_arg), + int_table_flags(HA_REC_NOT_IN_SEQ | + HA_NULL_IN_KEY | HA_CAN_VIRTUAL_COLUMNS | + HA_CAN_INDEX_BLOBS | + HA_CAN_SQL_HANDLER | + HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | + HA_PRIMARY_KEY_IN_READ_INDEX | + HA_BINLOG_ROW_CAPABLE | + HA_CAN_GEOMETRY | HA_PARTIAL_COLUMN_READ | + HA_TABLE_SCAN_ON_INDEX | HA_CAN_FULLTEXT | + HA_CAN_FULLTEXT_EXT | HA_CAN_EXPORT), + start_of_scan(0), + num_write_row(0) +{} + +/*********************************************************************//** +Destruct ha_innobase handler. */ +UNIV_INTERN +ha_innobase::~ha_innobase() +/*======================*/ +{ +} + +/*********************************************************************//** +Updates the user_thd field in a handle and also allocates a new InnoDB +transaction handle if needed, and updates the transaction fields in the +prebuilt struct. 
*/ +UNIV_INTERN inline +void +ha_innobase::update_thd( +/*====================*/ + THD* thd) /*!< in: thd to use the handle */ +{ + trx_t* trx; + + DBUG_ENTER("ha_innobase::update_thd"); + DBUG_PRINT("ha_innobase::update_thd", ("user_thd: %p -> %p", + user_thd, thd)); + + /* The table should have been opened in ha_innobase::open(). */ + DBUG_ASSERT(prebuilt->table->n_ref_count > 0); + + trx = check_trx_exists(thd); + + if (prebuilt->trx != trx) { + + row_update_prebuilt_trx(prebuilt, trx); + } + + user_thd = thd; + DBUG_VOID_RETURN; +} + +/*********************************************************************//** +Updates the user_thd field in a handle and also allocates a new InnoDB +transaction handle if needed, and updates the transaction fields in the +prebuilt struct. */ +UNIV_INTERN +void +ha_innobase::update_thd() +/*=====================*/ +{ + THD* thd = ha_thd(); + + ut_ad(EQ_CURRENT_THD(thd)); + update_thd(thd); +} + +/*********************************************************************//** +Registers an InnoDB transaction with the MySQL 2PC coordinator, so that +the MySQL XA code knows to call the InnoDB prepare and commit, or rollback +for the transaction. This MUST be called for every transaction for which +the user may call commit or rollback. Calling this several times to register +the same transaction is allowed, too. This function also registers the +current SQL statement. 
*/
static inline
void
innobase_register_trx(
/*==================*/
	handlerton*	hton,	/* in: Innobase handlerton */
	THD*		thd,	/* in: MySQL thd (connection) object */
	trx_t*		trx)	/* in: transaction to register */
{
	/* Statement-level registration happens on every call. */
	trans_register_ha(thd, FALSE, hton);

	/* Transaction-level registration is needed only once per
	transaction, and only when the connection is in a multi-statement
	transaction (autocommit off or an explicit BEGIN). */
	if (!trx_is_registered_for_2pc(trx)) {
		if (thd_test_options(thd,
				     OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
			trans_register_ha(thd, TRUE, hton);
		}
	}

	trx_register_for_2pc(trx);
}

/*	BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB
	------------------------------------------------------------

1) The use of the query cache for TBL is disabled when there is an
uncommitted change to TBL.

2) When a change to TBL commits, InnoDB stores the current value of
its global trx id counter, let us denote it by INV_TRX_ID, to the table object
in the InnoDB data dictionary, and only allows transactions whose
id <= INV_TRX_ID to use the query cache.

3) When InnoDB does an INSERT/DELETE/UPDATE to a table TBL, or an implicit
modification because of an ON DELETE CASCADE, we invalidate the MySQL query
cache of TBL immediately.

How this is implemented inside InnoDB:

1) Since every modification always sets an IX type table lock on the InnoDB
table, it is easy to check if there can be uncommitted modifications for a
table: just check if there are locks in the lock list of the table.

2) When a transaction inside InnoDB commits, it reads the global trx id
counter and stores the value INV_TRX_ID to the tables on which it had a lock.

3) If there is an implicit table change from ON DELETE CASCADE or SET NULL,
InnoDB calls an invalidate method for the MySQL query cache for that table.

How this is implemented inside sql_cache.cc:

1) The query cache for an InnoDB table TBL is invalidated immediately at an
INSERT/UPDATE/DELETE, just like in the case of MyISAM. No need to delay
invalidation to the transaction commit.

2) To store or retrieve a value from the query cache of an InnoDB table TBL,
any query must first ask InnoDB's permission. We must pass the thd as a
parameter because InnoDB will look at the trx id, if any, associated with
that thd. We also pass the full_name, which is used as the key to search for
the table object. The full_name is a string containing the normalized path to
the table in the canonical format.

3) Use of the query cache for InnoDB tables is now allowed also when
AUTOCOMMIT==0 or we are inside BEGIN ... COMMIT. Thus transactions no longer
put restrictions on the use of the query cache.
*/

/******************************************************************//**
The MySQL query cache uses this to check from InnoDB if the query cache at
the moment is allowed to operate on an InnoDB table. The SQL query must
be a non-locking SELECT.

The query cache is allowed to operate on certain query only if this function
returns TRUE for all tables in the query.

If thd is not in the autocommit state, this function also starts a new
transaction for thd if there is no active trx yet, and assigns a consistent
read view to it if there is no read view yet.

Why a deadlock of threads is not possible: the query cache calls this function
at the start of a SELECT processing. Then the calling thread cannot be
holding any InnoDB semaphores. The calling thread is holding the
query cache mutex, and this function will reserve the InnoDB trx_sys->mutex.
Thus, the 'rank' in sync0sync.h of the MySQL query cache mutex is above
the InnoDB trx_sys->mutex.
+@return TRUE if permitted, FALSE if not; note that the value FALSE +does not mean we should invalidate the query cache: invalidation is +called explicitly */ +static +my_bool +innobase_query_caching_of_table_permitted( +/*======================================*/ + THD* thd, /*!< in: thd of the user who is trying to + store a result to the query cache or + retrieve it */ + char* full_name, /*!< in: normalized path to the table */ + uint full_name_len, /*!< in: length of the normalized path + to the table */ + ulonglong *unused) /*!< unused for this engine */ +{ + ibool is_autocommit; + trx_t* trx; + char norm_name[1000]; + + ut_a(full_name_len < 999); + + trx = check_trx_exists(thd); + + if (trx->isolation_level == TRX_ISO_SERIALIZABLE) { + /* In the SERIALIZABLE mode we add LOCK IN SHARE MODE to every + plain SELECT if AUTOCOMMIT is not on. */ + + return((my_bool)FALSE); + } + + if (UNIV_UNLIKELY(trx->has_search_latch)) { + sql_print_error("The calling thread is holding the adaptive " + "search, latch though calling " + "innobase_query_caching_of_table_permitted."); + trx_print(stderr, trx, 1024); + } + + trx_search_latch_release_if_reserved(trx); + + innobase_srv_conc_force_exit_innodb(trx); + + if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + + is_autocommit = TRUE; + } else { + is_autocommit = FALSE; + + } + + if (is_autocommit && trx->n_mysql_tables_in_use == 0) { + /* We are going to retrieve the query result from the query + cache. This cannot be a store operation to the query cache + because then MySQL would have locks on tables already. + + TODO: if the user has used LOCK TABLES to lock the table, + then we open a transaction in the call of row_.. below. + That trx can stay open until UNLOCK TABLES. The same problem + exists even if we do not use the query cache. MySQL should be + modified so that it ALWAYS calls some cleanup function when + the processing of a query ends! 
+ + We can imagine we instantaneously serialize this consistent + read trx to the current trx id counter. If trx2 would have + changed the tables of a query result stored in the cache, and + trx2 would have already committed, making the result obsolete, + then trx2 would have already invalidated the cache. Thus we + can trust the result in the cache is ok for this query. */ + + return((my_bool)TRUE); + } + + /* Normalize the table name to InnoDB format */ + normalize_table_name(norm_name, full_name); + + innobase_register_trx(innodb_hton_ptr, thd, trx); + + if (row_search_check_if_query_cache_permitted(trx, norm_name)) { + + /* printf("Query cache for %s permitted\n", norm_name); */ + + return((my_bool)TRUE); + } + + /* printf("Query cache for %s NOT permitted\n", norm_name); */ + + return((my_bool)FALSE); +} + +/*****************************************************************//** +Invalidates the MySQL query cache for the table. */ +UNIV_INTERN +void +innobase_invalidate_query_cache( +/*============================*/ + trx_t* trx, /*!< in: transaction which + modifies the table */ + const char* full_name, /*!< in: concatenation of + database name, null char NUL, + table name, null char NUL; + NOTE that in Windows this is + always in LOWER CASE! */ + ulint full_name_len) /*!< in: full name length where + also the null chars count */ +{ + /* Note that the sync0sync.h rank of the query cache mutex is just + above the InnoDB trx_sys_t->lock. The caller of this function must + not have latches of a lower rank. */ + +#ifdef HAVE_QUERY_CACHE + char qcache_key_name[2 * (NAME_LEN + 1)]; + size_t tabname_len; + size_t dbname_len; + + /* Construct the key("db-name\0table$name\0") for the query cache using + the path name("db@002dname\0table@0024name\0") of the table in its + canonical form. 
*/ + dbname_len = filename_to_tablename(full_name, qcache_key_name, + sizeof(qcache_key_name)); + tabname_len = filename_to_tablename(full_name + strlen(full_name) + 1, + qcache_key_name + dbname_len + 1, + sizeof(qcache_key_name) + - dbname_len - 1); + + /* Argument TRUE below means we are using transactions */ + mysql_query_cache_invalidate4(trx->mysql_thd, + qcache_key_name, + (dbname_len + tabname_len + 2), + TRUE); +#endif +} + +/*****************************************************************//** +Convert an SQL identifier to the MySQL system_charset_info (UTF-8) +and quote it if needed. +@return pointer to the end of buf */ +static +char* +innobase_convert_identifier( +/*========================*/ + char* buf, /*!< out: buffer for converted identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* id, /*!< in: identifier to convert */ + ulint idlen, /*!< in: length of id, in bytes */ + THD* thd, /*!< in: MySQL connection thread, or NULL */ + ibool file_id)/*!< in: TRUE=id is a table or database name; + FALSE=id is an UTF-8 string */ +{ + char nz2[MAX_TABLE_NAME_LEN + 1]; + const char* s = id; + int q; + + if (file_id) { + + char nz[MAX_TABLE_NAME_LEN + 1]; + + /* Decode the table name. The MySQL function expects + a NUL-terminated string. The input and output strings + buffers must not be shared. */ + ut_a(idlen <= MAX_TABLE_NAME_LEN); + memcpy(nz, id, idlen); + nz[idlen] = 0; + + s = nz2; + idlen = explain_filename(thd, nz, nz2, sizeof nz2, + EXPLAIN_PARTITIONS_AS_COMMENT); + goto no_quote; + } + + /* See if the identifier needs to be quoted. */ + if (UNIV_UNLIKELY(!thd)) { + q = '"'; + } else { + q = get_quote_char_for_identifier(thd, s, (int) idlen); + } + + if (q == EOF) { +no_quote: + if (UNIV_UNLIKELY(idlen > buflen)) { + idlen = buflen; + } + memcpy(buf, s, idlen); + return(buf + idlen); + } + + /* Quote the identifier. 
*/ + if (buflen < 2) { + return(buf); + } + + *buf++ = q; + buflen--; + + for (; idlen; idlen--) { + int c = *s++; + if (UNIV_UNLIKELY(c == q)) { + if (UNIV_UNLIKELY(buflen < 3)) { + break; + } + + *buf++ = c; + *buf++ = c; + buflen -= 2; + } else { + if (UNIV_UNLIKELY(buflen < 2)) { + break; + } + + *buf++ = c; + buflen--; + } + } + + *buf++ = q; + return(buf); +} + +/*****************************************************************//** +Convert a table or index name to the MySQL system_charset_info (UTF-8) +and quote it if needed. +@return pointer to the end of buf */ +UNIV_INTERN +char* +innobase_convert_name( +/*==================*/ + char* buf, /*!< out: buffer for converted identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* id, /*!< in: identifier to convert */ + ulint idlen, /*!< in: length of id, in bytes */ + THD* thd, /*!< in: MySQL connection thread, or NULL */ + ibool table_id)/*!< in: TRUE=id is a table or database name; + FALSE=id is an index name */ +{ + char* s = buf; + const char* bufend = buf + buflen; + + if (table_id) { + const char* slash = (const char*) memchr(id, '/', idlen); + if (!slash) { + + goto no_db_name; + } + + /* Print the database name and table name separately. 
*/ + s = innobase_convert_identifier(s, bufend - s, id, slash - id, + thd, TRUE); + if (UNIV_LIKELY(s < bufend)) { + *s++ = '.'; + s = innobase_convert_identifier(s, bufend - s, + slash + 1, idlen + - (slash - id) - 1, + thd, TRUE); + } + } else if (UNIV_UNLIKELY(*id == TEMP_INDEX_PREFIX)) { + /* Temporary index name (smart ALTER TABLE) */ + const char temp_index_suffix[]= "--temporary--"; + + s = innobase_convert_identifier(buf, buflen, id + 1, idlen - 1, + thd, FALSE); + if (s - buf + (sizeof temp_index_suffix - 1) < buflen) { + memcpy(s, temp_index_suffix, + sizeof temp_index_suffix - 1); + s += sizeof temp_index_suffix - 1; + } + } else { +no_db_name: + s = innobase_convert_identifier(buf, buflen, id, idlen, + thd, table_id); + } + + return(s); +} + +/*****************************************************************//** +A wrapper function of innobase_convert_name(), convert a table or +index name to the MySQL system_charset_info (UTF-8) and quote it if needed. +@return pointer to the end of buf */ +UNIV_INTERN +void +innobase_format_name( +/*==================*/ + char* buf, /*!< out: buffer for converted identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* name, /*!< in: index or table name to format */ + ibool is_index_name) /*!< in: index name */ +{ + const char* bufend; + + bufend = innobase_convert_name(buf, buflen, name, strlen(name), + NULL, !is_index_name); + + ut_ad((ulint) (bufend - buf) < buflen); + + buf[bufend - buf] = '\0'; +} + +/**********************************************************************//** +Determines if the currently running transaction has been interrupted. 
+@return TRUE if interrupted */ +UNIV_INTERN +ibool +trx_is_interrupted( +/*===============*/ + const trx_t* trx) /*!< in: transaction */ +{ + return(trx && trx->mysql_thd && thd_kill_level((THD*) trx->mysql_thd)); +} + +/**********************************************************************//** +Determines if the currently running transaction is in strict mode. +@return TRUE if strict */ +UNIV_INTERN +ibool +trx_is_strict( +/*==========*/ + trx_t* trx) /*!< in: transaction */ +{ + return(trx && trx->mysql_thd && THDVAR(trx->mysql_thd, strict_mode)); +} + +/**************************************************************//** +Resets some fields of a prebuilt struct. The template is used in fast +retrieval of just those column values MySQL needs in its processing. */ +inline +void +ha_innobase::reset_template(void) +/*=============================*/ +{ + ut_ad(prebuilt->magic_n == ROW_PREBUILT_ALLOCATED); + ut_ad(prebuilt->magic_n2 == prebuilt->magic_n); + + /* Force table to be freed in close_thread_table(). */ + DBUG_EXECUTE_IF("free_table_in_fts_query", + if (prebuilt->in_fts_query) { + table->m_needs_reopen = true; + } + ); + + prebuilt->keep_other_fields_on_keyread = 0; + prebuilt->read_just_key = 0; + prebuilt->in_fts_query = 0; + /* Reset index condition pushdown state. */ + if (prebuilt->idx_cond) { + prebuilt->idx_cond = NULL; + prebuilt->idx_cond_n_cols = 0; + /* Invalidate prebuilt->mysql_template + in ha_innobase::write_row(). */ + prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE; + } +} + +/*****************************************************************//** +Call this when you have opened a new table handle in HANDLER, before you +call index_read_idx() etc. Actually, we can let the cursor stay open even +over a transaction commit! Then you should call this before every operation, +fetch next etc. This function inits the necessary things even after a +transaction commit. 
*/ +UNIV_INTERN +void +ha_innobase::init_table_handle_for_HANDLER(void) +/*============================================*/ +{ + /* If current thd does not yet have a trx struct, create one. + If the current handle does not yet have a prebuilt struct, create + one. Update the trx pointers in the prebuilt struct. Normally + this operation is done in external_lock. */ + + update_thd(ha_thd()); + + /* Initialize the prebuilt struct much like it would be inited in + external_lock */ + + trx_search_latch_release_if_reserved(prebuilt->trx); + + innobase_srv_conc_force_exit_innodb(prebuilt->trx); + + /* If the transaction is not started yet, start it */ + + trx_start_if_not_started_xa(prebuilt->trx); + + /* Assign a read view if the transaction does not have it yet */ + + trx_assign_read_view(prebuilt->trx); + + innobase_register_trx(ht, user_thd, prebuilt->trx); + + /* We did the necessary inits in this function, no need to repeat them + in row_search_for_mysql */ + + prebuilt->sql_stat_start = FALSE; + + /* We let HANDLER always to do the reads as consistent reads, even + if the trx isolation level would have been specified as SERIALIZABLE */ + + prebuilt->select_lock_type = LOCK_NONE; + prebuilt->stored_select_lock_type = LOCK_NONE; + + /* Always fetch all columns in the index record */ + + prebuilt->hint_need_to_fetch_extra_cols = ROW_RETRIEVE_ALL_COLS; + + /* We want always to fetch all columns in the whole row? Or do + we???? */ + + prebuilt->used_in_HANDLER = TRUE; + reset_template(); +} + +/****************************************************************//** +Gives the file extension of an InnoDB single-table tablespace. */ +static const char* ha_innobase_exts[] = { + ".ibd", + ".isl", + NullS +}; + +/*********************************************************************//** +Opens an InnoDB database. 
+@return 0 on success, error code on failure */ +static +int +innobase_init( +/*==========*/ + void *p) /*!< in: InnoDB handlerton */ +{ + static char current_dir[3]; /*!< Set if using current lib */ + int err; + bool ret; + char *default_path; + uint format_id; + ulong num_pll_degree; + + DBUG_ENTER("innobase_init"); + handlerton *innobase_hton= (handlerton*) p; + innodb_hton_ptr = innobase_hton; + + innobase_hton->state = SHOW_OPTION_YES; + innobase_hton->db_type= DB_TYPE_INNODB; + innobase_hton->savepoint_offset = sizeof(trx_named_savept_t); + innobase_hton->close_connection = innobase_close_connection; + innobase_hton->savepoint_set = innobase_savepoint; + innobase_hton->savepoint_rollback = innobase_rollback_to_savepoint; + innobase_hton->savepoint_rollback_can_release_mdl = + innobase_rollback_to_savepoint_can_release_mdl; + innobase_hton->savepoint_release = innobase_release_savepoint; + innobase_hton->commit_ordered=innobase_commit_ordered; + innobase_hton->commit = innobase_commit; + innobase_hton->rollback = innobase_rollback; + innobase_hton->prepare = innobase_xa_prepare; + innobase_hton->recover = innobase_xa_recover; + innobase_hton->commit_by_xid = innobase_commit_by_xid; + innobase_hton->rollback_by_xid = innobase_rollback_by_xid; + innobase_hton->commit_checkpoint_request=innobase_checkpoint_request; + innobase_hton->checkpoint_state= innobase_checkpoint_state; + innobase_hton->create_cursor_read_view = innobase_create_cursor_view; + innobase_hton->set_cursor_read_view = innobase_set_cursor_view; + innobase_hton->close_cursor_read_view = innobase_close_cursor_view; + innobase_hton->create = innobase_create_handler; + innobase_hton->drop_database = innobase_drop_database; + innobase_hton->panic = innobase_end; + + innobase_hton->start_consistent_snapshot = + innobase_start_trx_and_assign_read_view; + + /*innobase_hton->store_binlog_info = + innobase_store_binlog_info;*/ + + innobase_hton->flush_logs = innobase_flush_logs; + 
innobase_hton->show_status = innobase_show_status; + innobase_hton->flags = HTON_SUPPORTS_EXTENDED_KEYS | + HTON_SUPPORTS_FOREIGN_KEYS; + + innobase_hton->release_temporary_latches = + innobase_release_temporary_latches; + + innobase_hton->kill_query = innobase_kill_connection; + + if (srv_file_per_table) + innobase_hton->tablefile_extensions = ha_innobase_exts; + +#ifdef WITH_WSREP + innobase_hton->wsrep_abort_transaction=wsrep_abort_transaction; + innobase_hton->wsrep_set_checkpoint=innobase_wsrep_set_checkpoint; + innobase_hton->wsrep_get_checkpoint=innobase_wsrep_get_checkpoint; + innobase_hton->wsrep_fake_trx_id=wsrep_fake_trx_id; +#endif /* WITH_WSREP */ + + ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR); + +#ifndef DBUG_OFF + static const char test_filename[] = "-@"; + char test_tablename[sizeof test_filename + + sizeof(srv_mysql50_table_name_prefix) - 1]; + if ((sizeof(test_tablename)) - 1 + != filename_to_tablename(test_filename, + test_tablename, + sizeof(test_tablename), true) + || strncmp(test_tablename, + srv_mysql50_table_name_prefix, + sizeof(srv_mysql50_table_name_prefix) - 1) + || strcmp(test_tablename + + sizeof(srv_mysql50_table_name_prefix) - 1, + test_filename)) { + + sql_print_error("tablename encoding has been changed"); + + goto error; + } +#endif /* DBUG_OFF */ + + srv_log_block_size = 0; + if (innobase_log_block_size != (1 << 9)) { /*!=512*/ + uint n_shift; + + fprintf(stderr, + "InnoDB: Warning: innodb_log_block_size has been " + "changed from default value 512. 
(###EXPERIMENTAL### " + "operation)\n"); + for (n_shift = 9; n_shift <= UNIV_PAGE_SIZE_SHIFT_MAX; + n_shift++) { + if (innobase_log_block_size == ((ulong)1 << n_shift)) { + srv_log_block_size = (1 << n_shift); + fprintf(stderr, + "InnoDB: The log block size is set to " + ULINTPF ".\n",srv_log_block_size); + break; + } + } + } else { + srv_log_block_size = 512; + } + ut_ad (srv_log_block_size >= OS_MIN_LOG_BLOCK_SIZE); + + if (!srv_log_block_size) { + fprintf(stderr, + "InnoDB: Error: %lu is not a valid value for " + "innodb_log_block_size.\n" + "InnoDB: Error: A valid value for " + "innodb_log_block_size is\n" + "InnoDB: Error: a power of 2 from 512 to 16384.\n", + innobase_log_block_size); + goto error; + } + + /* Check that values don't overflow on 32-bit systems. */ + if (sizeof(ulint) == 4) { + if (innobase_buffer_pool_size > UINT_MAX32) { + sql_print_error( + "innobase_buffer_pool_size can't be over 4GB" + " on 32-bit systems"); + + goto error; + } + } + + os_innodb_umask = (ulint) my_umask; + + /* First calculate the default path for innodb_data_home_dir etc., + in case the user has not given any value. + + Note that when using the embedded server, the datadirectory is not + necessarily the current directory of this program. */ + + if (mysqld_embedded) { + default_path = mysql_real_data_home; + fil_path_to_mysql_datadir = mysql_real_data_home; + } else { + /* It's better to use current lib, to keep paths short */ + current_dir[0] = FN_CURLIB; + current_dir[1] = FN_LIBCHAR; + current_dir[2] = 0; + default_path = current_dir; + } + + ut_a(default_path); + + /* Set InnoDB initialization parameters according to the values + read from MySQL .cnf file */ + + /*--------------- Data files -------------------------*/ + + /* The default dir for data files is the datadir of MySQL */ + + srv_data_home = (innobase_data_home_dir ? innobase_data_home_dir : + default_path); + + /* Set default InnoDB data file size to 12 MB and let it be + auto-extending. 
Thus users can use InnoDB in >= 4.0 without having + to specify any startup options. */ + + if (!innobase_data_file_path) { + innobase_data_file_path = (char*) "ibdata1:12M:autoextend"; + } + + /* Since InnoDB edits the argument in the next call, we make another + copy of it: */ + + internal_innobase_data_file_path = my_strdup(innobase_data_file_path, + MYF(MY_FAE)); + + ret = (bool) srv_parse_data_file_paths_and_sizes( + internal_innobase_data_file_path); + if (ret == FALSE) { + sql_print_error( + "InnoDB: syntax error in innodb_data_file_path" + " or size specified is less than 1 megabyte"); +mem_free_and_error: + srv_free_paths_and_sizes(); + my_free(internal_innobase_data_file_path); + goto error; + } + + /* -------------- All log files ---------------------------*/ + + /* The default dir for log files is the datadir of MySQL */ + + if (!srv_log_group_home_dir) { + srv_log_group_home_dir = default_path; + } + +#ifdef UNIV_LOG_ARCHIVE + if (!innobase_log_arch_dir) { + innobase_log_arch_dir = srv_log_group_home_dir; + } + srv_arch_dir = innobase_log_arch_dir; +#endif /* UNIG_LOG_ARCHIVE */ + + srv_normalize_path_for_win(srv_log_group_home_dir); + + if (strchr(srv_log_group_home_dir, ';')) { + sql_print_error("syntax error in innodb_log_group_home_dir"); + goto mem_free_and_error; + } + + if (innobase_mirrored_log_groups == 1) { + sql_print_warning( + "innodb_mirrored_log_groups is an unimplemented " + "feature and the variable will be completely " + "removed in a future version."); + } + + if (innobase_mirrored_log_groups > 1) { + sql_print_error( + "innodb_mirrored_log_groups is an unimplemented feature and " + "the variable will be completely removed in a future version. " + "Using values other than 1 is not supported."); + goto mem_free_and_error; + } + + if (innobase_mirrored_log_groups == 0) { + /* To throw a deprecation warning message when the option is + passed, the default was changed to '0' (as a workaround). 
Since + the only value accepted for this option is '1', reset it to 1 */ + innobase_mirrored_log_groups = 1; + } + + /* Validate the file format by animal name */ + if (innobase_file_format_name != NULL) { + + format_id = innobase_file_format_name_lookup( + innobase_file_format_name); + + if (format_id > UNIV_FORMAT_MAX) { + + sql_print_error("InnoDB: wrong innodb_file_format."); + + goto mem_free_and_error; + } + } else { + /* Set it to the default file format id. Though this + should never happen. */ + format_id = 0; + } + + srv_file_format = format_id; + + /* Given the type of innobase_file_format_name we have little + choice but to cast away the constness from the returned name. + innobase_file_format_name is used in the MySQL set variable + interface and so can't be const. */ + + innobase_file_format_name = + (char*) trx_sys_file_format_id_to_name(format_id); + + /* Check innobase_file_format_check variable */ + if (!innobase_file_format_check) { + + /* Set the value to disable checking. */ + srv_max_file_format_at_startup = UNIV_FORMAT_MAX + 1; + + } else { + + /* Set the value to the lowest supported format. */ + srv_max_file_format_at_startup = UNIV_FORMAT_MIN; + } + + /* Did the user specify a format name that we support? 
+ As a side effect it will update the variable + srv_max_file_format_at_startup */ + if (innobase_file_format_validate_and_set( + innobase_file_format_max) < 0) { + + sql_print_error("InnoDB: invalid " + "innodb_file_format_max value: " + "should be any value up to %s or its " + "equivalent numeric id", + trx_sys_file_format_id_to_name( + UNIV_FORMAT_MAX)); + + goto mem_free_and_error; + } + + if (innobase_change_buffering) { + ulint use; + + for (use = 0; + use < UT_ARR_SIZE(innobase_change_buffering_values); + use++) { + if (!innobase_strcasecmp( + innobase_change_buffering, + innobase_change_buffering_values[use])) { + ibuf_use = (ibuf_use_t) use; + goto innobase_change_buffering_inited_ok; + } + } + + sql_print_error("InnoDB: invalid value " + "innodb_change_buffering=%s", + innobase_change_buffering); + goto mem_free_and_error; + } + +innobase_change_buffering_inited_ok: + ut_a((ulint) ibuf_use < UT_ARR_SIZE(innobase_change_buffering_values)); + innobase_change_buffering = (char*) + innobase_change_buffering_values[ibuf_use]; + + /* Check that interdependent parameters have sane values. */ + if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) { + sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm" + " cannot be set higher than" + " innodb_max_dirty_pages_pct.\n" + "InnoDB: Setting" + " innodb_max_dirty_pages_pct_lwm to %lf\n", + srv_max_buf_pool_modified_pct); + + srv_max_dirty_pages_pct_lwm = srv_max_buf_pool_modified_pct; + } + + if (srv_max_io_capacity == SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT) { + + if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) { + /* Avoid overflow. */ + srv_max_io_capacity = SRV_MAX_IO_CAPACITY_LIMIT; + } else { + /* The user has not set the value. We should + set it based on innodb_io_capacity. 
*/ + srv_max_io_capacity = static_cast<ulong>( + ut_max(2 * srv_io_capacity, 2000)); + } + + } else if (srv_max_io_capacity < srv_io_capacity) { + sql_print_warning("InnoDB: innodb_io_capacity" + " cannot be set higher than" + " innodb_io_capacity_max.\n" + "InnoDB: Setting" + " innodb_io_capacity to %lu\n", + srv_max_io_capacity); + + srv_io_capacity = srv_max_io_capacity; + } + + if (!is_filename_allowed(srv_buf_dump_filename, + strlen(srv_buf_dump_filename), FALSE)) { + sql_print_error("InnoDB: innodb_buffer_pool_filename" + " cannot have colon (:) in the file name."); + goto mem_free_and_error; + } + + /* --------------------------------------------------*/ + + srv_file_flush_method_str = innobase_file_flush_method; + + srv_log_file_size = (ib_uint64_t) innobase_log_file_size; + +#ifdef UNIV_LOG_ARCHIVE + srv_log_archive_on = (ulint) innobase_log_archive; +#endif /* UNIV_LOG_ARCHIVE */ + + /* Check that the value of system variable innodb_page_size was + set correctly. Its value was put into srv_page_size. 
If valid, + return the associated srv_page_size_shift.*/ + srv_page_size_shift = innodb_page_size_validate(srv_page_size); + if (!srv_page_size_shift) { + sql_print_error("InnoDB: Invalid page size=%lu.\n", + srv_page_size); + goto mem_free_and_error; + } + if (UNIV_PAGE_SIZE_DEF != srv_page_size) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: innodb-page-size has been changed" + " from the default value %d to %lu.\n", + UNIV_PAGE_SIZE_DEF, srv_page_size); + } + + srv_log_buffer_size = (ulint) innobase_log_buffer_size; + + if (innobase_buffer_pool_instances == 0) { + innobase_buffer_pool_instances = 8; + +#if defined(__WIN__) && !defined(_WIN64) + if (innobase_buffer_pool_size > 1331 * 1024 * 1024) { + innobase_buffer_pool_instances + = ut_min(MAX_BUFFER_POOLS, + (long) (innobase_buffer_pool_size + / (128 * 1024 * 1024))); + } +#endif /* defined(__WIN__) && !defined(_WIN64) */ + } + srv_buf_pool_size = (ulint) innobase_buffer_pool_size; + srv_buf_pool_instances = (ulint) innobase_buffer_pool_instances; + + srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size; + + if (innobase_additional_mem_pool_size + != 8*1024*1024L /* the default */ ) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: Using " + "innodb_additional_mem_pool_size is DEPRECATED. " + "This option may be removed in future releases, " + "together with the option innodb_use_sys_malloc " + "and with the InnoDB's internal memory " + "allocator.\n"); + } + + if (!srv_use_sys_malloc ) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: Setting " + "innodb_use_sys_malloc to FALSE is DEPRECATED. 
" + "This option may be removed in future releases, " + "together with the InnoDB's internal memory " + "allocator.\n"); + } + + srv_n_file_io_threads = (ulint) innobase_file_io_threads; + srv_n_read_io_threads = (ulint) innobase_read_io_threads; + srv_n_write_io_threads = (ulint) innobase_write_io_threads; + + srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite; + + if (!innobase_use_checksums) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: Setting " + "innodb_checksums to OFF is DEPRECATED. " + "This option may be removed in future releases. " + "You should set innodb_checksum_algorithm=NONE " + "instead.\n"); + srv_checksum_algorithm = SRV_CHECKSUM_ALGORITHM_NONE; + } + + innodb_log_checksum_func_update(srv_log_checksum_algorithm); + +#ifdef HAVE_LARGE_PAGES + if ((os_use_large_pages = (ibool) my_use_large_pages)) { + os_large_page_size = (ulint) opt_large_page_size; + } +#endif + + row_rollback_on_timeout = (ibool) innobase_rollback_on_timeout; + + srv_locks_unsafe_for_binlog = (ibool) innobase_locks_unsafe_for_binlog; + if (innobase_locks_unsafe_for_binlog) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: Using " + "innodb_locks_unsafe_for_binlog is DEPRECATED. " + "This option may be removed in future releases. " + "Please use READ COMMITTED transaction isolation " + "level instead, see " REFMAN "set-transaction.html.\n"); + } + + if (innobase_open_files < 10) { + innobase_open_files = 300; + if (srv_file_per_table && tc_size > 300) { + innobase_open_files = tc_size; + } + } + + if (innobase_open_files > (long) open_files_limit) { + fprintf(stderr, + "innodb_open_files should not be greater" + " than the open_files_limit.\n"); + if (innobase_open_files > (long) tc_size) { + innobase_open_files = tc_size; + } + } + + srv_max_n_open_files = (ulint) innobase_open_files; + srv_innodb_status = (ibool) innobase_create_status_file; + + srv_print_verbose_log = mysqld_embedded ? 
0 : 1; + + /* Round up fts_sort_pll_degree to nearest power of 2 number */ + for (num_pll_degree = 1; + num_pll_degree < fts_sort_pll_degree; + num_pll_degree <<= 1) { + + /* No op */ + } + + fts_sort_pll_degree = num_pll_degree; + + /* Store the default charset-collation number of this MySQL + installation */ + + data_mysql_default_charset_coll = (ulint) default_charset_info->number; + + ut_a(DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL == + my_charset_latin1.number); + ut_a(DATA_MYSQL_BINARY_CHARSET_COLL == my_charset_bin.number); + + /* Store the latin1_swedish_ci character ordering table to InnoDB. For + non-latin1_swedish_ci charsets we use the MySQL comparison functions, + and consequently we do not need to know the ordering internally in + InnoDB. */ + + srv_latin1_ordering = my_charset_latin1.sort_order; + + innobase_commit_concurrency_init_default(); + +#ifdef HAVE_POSIX_FALLOCATE + srv_use_posix_fallocate = (ibool) innobase_use_fallocate; +#endif + /* Do not enable backoff algorithm for small buffer pool. */ + if (!innodb_empty_free_list_algorithm_allowed( + static_cast<srv_empty_free_list_t>( + srv_empty_free_list_algorithm))) { + sql_print_information( + "InnoDB: innodb_empty_free_list_algorithm " + "has been changed to legacy " + "because of small buffer pool size. " + "In order to use backoff, " + "increase buffer pool at least up to 20MB.\n"); + srv_empty_free_list_algorithm + = SRV_EMPTY_FREE_LIST_LEGACY; + } + + srv_use_atomic_writes = (ibool) innobase_use_atomic_writes; + if (innobase_use_atomic_writes) { + ib_logf(IB_LOG_LEVEL_INFO, "using atomic writes."); + + /* Force doublewrite buffer off, atomic writes replace it. 
*/ + if (srv_use_doublewrite_buf) { + ib_logf(IB_LOG_LEVEL_INFO, "switching off doublewrite " + "buffer because of atomic writes."); + innobase_use_doublewrite = FALSE; + srv_use_doublewrite_buf = FALSE; + } + + /* Force O_DIRECT on Unixes (on Windows writes are always + unbuffered)*/ +#ifndef _WIN32 + if(!innobase_file_flush_method || + !strstr(innobase_file_flush_method, "O_DIRECT")) { + innobase_file_flush_method = + srv_file_flush_method_str = (char*)"O_DIRECT"; + ib_logf(IB_LOG_LEVEL_INFO, + "using O_DIRECT due to atomic writes."); + } +#endif +#ifdef HAVE_POSIX_FALLOCATE + /* Due to a bug in directFS, using atomics needs + posix_fallocate() to extend the file, because pwrite() past the + end of the file won't work */ + srv_use_posix_fallocate = TRUE; +#endif + } + +#ifdef HAVE_PSI_INTERFACE + /* Register keys with MySQL performance schema */ + int count; + + count = array_elements(all_pthread_mutexes); + mysql_mutex_register("innodb", all_pthread_mutexes, count); + +# ifdef UNIV_PFS_MUTEX + count = array_elements(all_innodb_mutexes); + mysql_mutex_register("innodb", all_innodb_mutexes, count); +# endif /* UNIV_PFS_MUTEX */ + +# ifdef UNIV_PFS_RWLOCK + count = array_elements(all_innodb_rwlocks); + mysql_rwlock_register("innodb", all_innodb_rwlocks, count); +# endif /* UNIV_PFS_MUTEX */ + +# ifdef UNIV_PFS_THREAD + count = array_elements(all_innodb_threads); + mysql_thread_register("innodb", all_innodb_threads, count); +# endif /* UNIV_PFS_THREAD */ + +# ifdef UNIV_PFS_IO + count = array_elements(all_innodb_files); + mysql_file_register("innodb", all_innodb_files, count); +# endif /* UNIV_PFS_IO */ + + count = array_elements(all_innodb_conds); + mysql_cond_register("innodb", all_innodb_conds, count); +#endif /* HAVE_PSI_INTERFACE */ + + /* Since we in this module access directly the fields of a trx + struct, and due to different headers and flags it might happen that + ib_mutex_t has a different size in this module and in InnoDB + modules, we check at run time 
that the size is the same in + these compilation modules. */ + + err = innobase_start_or_create_for_mysql(); + + if (err != DB_SUCCESS) { + goto mem_free_and_error; + } + + /* Adjust the innodb_undo_logs config object */ + innobase_undo_logs_init_default_max(); + + innobase_old_blocks_pct = static_cast<uint>( + buf_LRU_old_ratio_update(innobase_old_blocks_pct, TRUE)); + + ibuf_max_size_update(innobase_change_buffer_max_size); + + innobase_open_tables = hash_create(200); + mysql_mutex_init(innobase_share_mutex_key, + &innobase_share_mutex, + MY_MUTEX_INIT_FAST); + mysql_mutex_init(commit_cond_mutex_key, + &commit_cond_m, MY_MUTEX_INIT_FAST); + mysql_cond_init(commit_cond_key, &commit_cond, NULL); + mysql_mutex_init(pending_checkpoint_mutex_key, + &pending_checkpoint_mutex, + MY_MUTEX_INIT_FAST); + innodb_inited= 1; +#ifdef MYSQL_DYNAMIC_PLUGIN + if (innobase_hton != p) { + innobase_hton = reinterpret_cast<handlerton*>(p); + *innobase_hton = *innodb_hton_ptr; + } +#endif /* MYSQL_DYNAMIC_PLUGIN */ + + /* Get the current high water mark format. */ + innobase_file_format_max = (char*) trx_sys_file_format_max_get(); + + /* Currently, monitor counter information are not persistent. */ + memset(monitor_set_tbl, 0, sizeof monitor_set_tbl); + + memset(innodb_counter_value, 0, sizeof innodb_counter_value); + + /* Do this as late as possible so server is fully starts up, + since we might get some initial stats if user choose to turn + on some counters from start up */ + if (innobase_enable_monitor_counter) { + innodb_enable_monitor_at_startup( + innobase_enable_monitor_counter); + } + + /* Turn on monitor counters that are default on */ + srv_mon_default_on(); + + DBUG_RETURN(FALSE); +error: + DBUG_RETURN(TRUE); +} + +/** Shut down the InnoDB storage engine. 
+@return 0 */ +static +int +innobase_end(handlerton*, ha_panic_function) { - if (thd) { - thd_storage_lock_wait((THD*)thd, value); + DBUG_ENTER("innobase_end"); + + if (innodb_inited) { + + THD *thd= current_thd; + if (thd) { // may be UNINSTALL PLUGIN statement + trx_t* trx = thd_to_trx(thd); + if (trx) { + trx_free_for_mysql(trx); + } + } + + srv_fast_shutdown = (ulint) innobase_fast_shutdown; + + innodb_inited = 0; + hash_table_free(innobase_open_tables); + innobase_open_tables = NULL; + innodb_shutdown(); + srv_free_paths_and_sizes(); + my_free(internal_innobase_data_file_path); + mysql_mutex_destroy(&innobase_share_mutex); + mysql_mutex_destroy(&commit_cond_m); + mysql_cond_destroy(&commit_cond); + mysql_mutex_destroy(&pending_checkpoint_mutex); } + + DBUG_RETURN(0); } -/******************************************************************//** -*/ -extern "C" UNIV_INTERN -ulong -thd_flush_log_at_trx_commit( -/*================================*/ - void* thd) +/****************************************************************//** +Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes +the logs, and the name of this function should be innobase_checkpoint. +@return TRUE if error */ +static +bool +innobase_flush_logs( +/*================*/ + handlerton* hton) /*!< in/out: InnoDB handlerton */ { - return(THDVAR((THD*) thd, flush_log_at_trx_commit)); + bool result = 0; + + DBUG_ENTER("innobase_flush_logs"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + if (!srv_read_only_mode) { + log_buffer_flush_to_disk(); + } + + DBUG_RETURN(result); } -/******************************************************************//** -Returns the merge-sort block size used for the secondary index creation -for the current connection. 
-@return the merge-sort block size, in bytes */ -extern "C" UNIV_INTERN -ulong -thd_merge_sort_block_size( -/*================================*/ - void* thd) /*!< in: thread handle (THD*), or NULL to query -+ the global merge_sort_block_size */ +/************************************************************//** +Synchronously read and parse the redo log up to the last +checkpoint to write the changed page bitmap. +@return 0 to indicate success. Current implementation cannot fail. */ +static +my_bool +innobase_flush_changed_page_bitmaps() +/*=================================*/ { - return(THDVAR((THD*) thd, merge_sort_block_size)); + if (srv_track_changed_pages) { + os_event_reset(srv_checkpoint_completed_event); + log_online_follow_redo_log(); + } + return FALSE; } -/********************************************************************//** -Obtain the InnoDB transaction of a MySQL thread. -@return reference to transaction pointer */ -static inline -trx_t*& -thd_to_trx( -/*=======*/ - THD* thd) /*!< in: MySQL thread */ +/************************************************************//** +Delete all the bitmap files for data less than the specified LSN. +If called with lsn == IB_ULONGLONG_MAX (i.e. set by RESET request), +restart the bitmap file sequence, otherwise continue it. +@return 0 to indicate success, 1 for failure. */ +static +my_bool +innobase_purge_changed_page_bitmaps( +/*================================*/ + ulonglong lsn) /*!< in: LSN to purge files up to */ { - return(*(trx_t**) thd_ha_data(thd, innodb_hton_ptr)); + return (my_bool)log_online_purge_changed_page_bitmaps(lsn); } -#ifdef WITH_WSREP -ulonglong -thd_to_trx_id( -/*=======*/ - THD* thd) /*!< in: MySQL thread */ + +/*****************************************************************//** +Commits a transaction in an InnoDB database. 
*/ +static +void +innobase_commit_low( +/*================*/ + trx_t* trx) /*!< in: transaction handle */ { - return(thd_to_trx(thd)->id); +#ifdef WITH_WSREP + THD* thd = (THD*)trx->mysql_thd; + const char* tmp = 0; + if (wsrep_on((void*)thd)) { +#ifdef WSREP_PROC_INFO + char info[64]; + info[sizeof(info) - 1] = '\0'; + snprintf(info, sizeof(info) - 1, + "innobase_commit_low():trx_commit_for_mysql(%lld)", + (long long) wsrep_thd_trx_seqno(thd)); + tmp = thd_proc_info(thd, info); + +#else + tmp = thd_proc_info(thd, "innobase_commit_low()"); +#endif /* WSREP_PROC_INFO */ + } +#endif /* WITH_WSREP */ + if (trx_is_started(trx)) { + + trx_commit_for_mysql(trx); + } +#ifdef WITH_WSREP + if (wsrep_on((void*)thd)) { thd_proc_info(thd, tmp); } +#endif /* WITH_WSREP */ } -#endif -my_bool -ha_innobase::is_fake_change_enabled(THD* thd) +#if NOT_USED +/*****************************************************************//** +Stores the current binlog coordinates in the trx system header. */ +static +int +innobase_store_binlog_info( +/*=======================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + THD* thd) /*!< in: MySQL thread handle */ + { - trx_t* trx = thd_to_trx(thd); - return(trx && UNIV_UNLIKELY(trx->fake_changes)); + const char* file_name; + unsigned long long pos; + mtr_t mtr; + + DBUG_ENTER("innobase_store_binlog_info"); + + thd_binlog_pos(thd, &file_name, &pos); + + mtr_start(&mtr); + + trx_sys_update_mysql_binlog_offset(file_name, pos, + TRX_SYS_MYSQL_LOG_INFO, &mtr); + + mtr_commit(&mtr); + + innobase_flush_logs(hton); + + DBUG_RETURN(0); } +#endif -/********************************************************************//** -Call this function when mysqld passes control to the client. That is to -avoid deadlocks on the adaptive hash S-latch possibly held by thd. For more -documentation, see handler.cc. +/*****************************************************************//** +Creates an InnoDB transaction struct for the thd if it does not yet have one. 
+Starts a new InnoDB transaction if a transaction is not yet started. And +assigns a new snapshot for a consistent read if the transaction does not yet +have one. @return 0 */ static int @@@ -8000,2081 -4788,1847 +8004,2079 @@@ ha_innobase::build_template } } - dict_table_autoinc_initialize(prebuilt->table, auto_inc); -} + clust_index = dict_table_get_first_index(prebuilt->table); -/*****************************************************************//** -Creates and opens a handle to a table which already exists in an InnoDB -database. -@return 1 if error, 0 if success */ -UNIV_INTERN -int -ha_innobase::open( -/*==============*/ - const char* name, /*!< in: table name */ - int mode, /*!< in: not used */ - uint test_if_locked) /*!< in: not used */ -{ - dict_table_t* ib_table; - char norm_name[1000]; - THD* thd; - char* is_part = NULL; - ibool par_case_name_set = FALSE; - char par_case_name[MAX_FULL_NAME_LEN + 1]; - dict_err_ignore_t ignore_err = DICT_ERR_IGNORE_NONE; + index = whole_row ? clust_index : prebuilt->index; - DBUG_ENTER("ha_innobase::open"); + prebuilt->need_to_access_clustered = (index == clust_index); - UT_NOT_USED(mode); - UT_NOT_USED(test_if_locked); + /* Either prebuilt->index should be a secondary index, or it + should be the clustered index. */ + ut_ad(dict_index_is_clust(index) == (index == clust_index)); - thd = ha_thd(); + /* Below we check column by column if we need to access + the clustered index. */ + + n_stored_fields= (ulint)table->s->stored_fields; /* number of stored columns */ - /* Under some cases MySQL seems to call this function while - holding btr_search_latch. This breaks the latching order as - we acquire dict_sys->mutex below and leads to a deadlock. 
*/ - if (thd != NULL) { - innobase_release_temporary_latches(ht, thd); + if (!prebuilt->mysql_template) { + prebuilt->mysql_template = (mysql_row_templ_t*) + mem_alloc(n_stored_fields * sizeof(mysql_row_templ_t)); } - normalize_table_name(norm_name, name); + prebuilt->template_type = whole_row + ? ROW_MYSQL_WHOLE_ROW : ROW_MYSQL_REC_FIELDS; + prebuilt->null_bitmap_len = table->s->null_bytes; - user_thd = NULL; + /* Prepare to build prebuilt->mysql_template[]. */ + prebuilt->templ_contains_blob = FALSE; + prebuilt->mysql_prefix_len = 0; + prebuilt->n_template = 0; + prebuilt->idx_cond_n_cols = 0; - if (!(share=get_share(name))) { + /* Note that in InnoDB, i is the column number in the table. + MySQL calls columns 'fields'. */ - DBUG_RETURN(1); - } + if (active_index != MAX_KEY && active_index == pushed_idx_cond_keyno) { + /* Push down an index condition or an end_range check. */ + for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) { - if (UNIV_UNLIKELY(share->ib_table && - share->ib_table->is_corrupt && - srv_pass_corrupt_table <= 1)) { - free_share(share); + while (!table->field[sql_idx]->stored_in_db) { + sql_idx++; + } - DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE); - } + const ibool index_contains + = dict_index_contains_col_or_prefix(index, i); - /* Will be allocated if it is needed in ::update_row() */ - upd_buf = NULL; - upd_buf_size = 0; + /* Test if an end_range or an index condition + refers to the field. Note that "index" and + "index_contains" may refer to the clustered index. + Index condition pushdown is relative to prebuilt->index + (the index that is being looked up first). */ - /* We look for pattern #P# to see if the table is partitioned - MySQL table. */ -#ifdef __WIN__ - is_part = strstr(norm_name, "#p#"); -#else - is_part = strstr(norm_name, "#P#"); -#endif /* __WIN__ */ + /* When join_read_always_key() invokes this + code via handler::ha_index_init() and + ha_innobase::index_init(), end_range is not + yet initialized. 
Because of that, we must + always check for index_contains, instead of + the subset + field->part_of_key.is_set(active_index) + which would be acceptable if end_range==NULL. */ + if (build_template_needs_field_in_icp( + index, prebuilt, index_contains, i)) { + /* Needed in ICP */ + const Field* field; + mysql_row_templ_t* templ; - /* Check whether FOREIGN_KEY_CHECKS is set to 0. If so, the table - can be opened even if some FK indexes are missing. If not, the table - can't be opened in the same situation */ - if (thd_test_options(thd, OPTION_NO_FOREIGN_KEY_CHECKS)) { - ignore_err = DICT_ERR_IGNORE_FK_NOKEY; - } + if (whole_row) { + field = table->field[sql_idx]; + } else { + field = build_template_needs_field( + index_contains, + prebuilt->read_just_key, + fetch_all_in_key, + fetch_primary_key_cols, + index, table, i, sql_idx); + if (!field) { + continue; + } + } + + templ = build_template_field( + prebuilt, clust_index, index, + table, field, i); + prebuilt->idx_cond_n_cols++; + ut_ad(prebuilt->idx_cond_n_cols + == prebuilt->n_template); + + if (index == prebuilt->index) { + templ->icp_rec_field_no + = templ->rec_field_no; + } else { + templ->icp_rec_field_no + = dict_index_get_nth_col_pos( + prebuilt->index, i, + NULL); + } + + if (dict_index_is_clust(prebuilt->index)) { + ut_ad(templ->icp_rec_field_no + != ULINT_UNDEFINED); + /* If the primary key includes + a column prefix, use it in + index condition pushdown, + because the condition is + evaluated before fetching any + off-page (externally stored) + columns. */ + if (templ->icp_rec_field_no + < prebuilt->index->n_uniq) { + /* This is a key column; + all set. */ + continue; + } + } else if (templ->icp_rec_field_no + != ULINT_UNDEFINED) { + continue; + } + + /* This is a column prefix index. + The column prefix can be used in + an end_range comparison. 
*/ + + templ->icp_rec_field_no + = dict_index_get_nth_col_or_prefix_pos( + prebuilt->index, i, TRUE, NULL); + ut_ad(templ->icp_rec_field_no + != ULINT_UNDEFINED); + + /* Index condition pushdown can be used on + all columns of a secondary index, and on + the PRIMARY KEY columns. On the clustered + index, it must never be used on other than + PRIMARY KEY columns, because those columns + may be stored off-page, and we will not + fetch externally stored columns before + checking the index condition. */ + /* TODO: test the above with an assertion + like this. Note that index conditions are + currently pushed down as part of the + "optimizer phase" while end_range is done + as part of the execution phase. Therefore, + we were unable to use an accurate condition + for end_range in the "if" condition above, + and the following assertion would fail. + ut_ad(!dict_index_is_clust(prebuilt->index) + || templ->rec_field_no + < prebuilt->index->n_uniq); + */ + } + } + + ut_ad(prebuilt->idx_cond_n_cols > 0); + ut_ad(prebuilt->idx_cond_n_cols == prebuilt->n_template); - /* Get pointer to a table object in InnoDB dictionary cache */ - ib_table = dict_table_get(norm_name, TRUE, ignore_err); + /* Include the fields that are not needed in index condition + pushdown. */ + for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) { - if (UNIV_UNLIKELY(ib_table && - ib_table->is_corrupt && - srv_pass_corrupt_table <= 1)) { - free_share(share); - my_free(upd_buf); - upd_buf = NULL; - upd_buf_size = 0; + while (!table->field[sql_idx]->stored_in_db) { + sql_idx++; + } - DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE); - } + const ibool index_contains + = dict_index_contains_col_or_prefix(index, i); - share->ib_table = ib_table; + if (!build_template_needs_field_in_icp( + index, prebuilt, index_contains, i)) { + /* Not needed in ICP */ + const Field* field; - if (NULL == ib_table) { - if (is_part) { - /* MySQL partition engine hard codes the file name - separator as "#P#". 
The text case is fixed even if - lower_case_table_names is set to 1 or 2. This is true - for sub-partition names as well. InnoDB always - normalises file names to lower case on Windows, this - can potentially cause problems when copying/moving - tables between platforms. + if (whole_row) { + field = table->field[sql_idx]; + } else { + field = build_template_needs_field( + index_contains, + prebuilt->read_just_key, + fetch_all_in_key, + fetch_primary_key_cols, + index, table, i, sql_idx); + if (!field) { + continue; + } + } - 1) If boot against an installation from Windows - platform, then its partition table name could - be all be in lower case in system tables. So we - will need to check lower case name when load table. + build_template_field(prebuilt, + clust_index, index, + table, field, i); + } + } - 2) If we boot an installation from other case - sensitive platform in Windows, we might need to - check the existence of table name without lowering - case them in the system table. */ - if (innobase_get_lower_case_table_names() == 1) { + prebuilt->idx_cond = this; + } else { + /* No index condition pushdown */ + prebuilt->idx_cond = NULL; - if (!par_case_name_set) { -#ifndef __WIN__ - /* Check for the table using lower - case name, including the partition - separator "P" */ - memcpy(par_case_name, norm_name, - strlen(norm_name)); - par_case_name[strlen(norm_name)] = 0; - innobase_casedn_str(par_case_name); -#else - /* On Windows platfrom, check - whether there exists table name in - system table whose name is - not being normalized to lower case */ - normalize_table_name_low( - par_case_name, name, FALSE); -#endif - par_case_name_set = TRUE; - } + for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) { + const Field* field; - ib_table = dict_table_get( - par_case_name, TRUE, ignore_err); + while (!table->field[sql_idx]->stored_in_db) { + sql_idx++; } - if (ib_table) { -#ifndef __WIN__ - sql_print_warning("Partition table %s opened " - "after converting to 
lower " - "case. The table may have " - "been moved from a case " - "in-sensitive file system. " - "Please recreate table in " - "the current file system\n", - norm_name); -#else - sql_print_warning("Partition table %s opened " - "after skipping the step to " - "lower case the table name. " - "The table may have been " - "moved from a case sensitive " - "file system. Please " - "recreate table in the " - "current file system\n", - norm_name); -#endif - /* We allow use of table if it is found. - this is consistent to current behavior - to innodb_plugin */ - share->ib_table = ib_table; - goto table_opened; + if (whole_row) { + field = table->field[sql_idx]; + } else { + field = build_template_needs_field( + dict_index_contains_col_or_prefix( + index, i), + prebuilt->read_just_key, + fetch_all_in_key, + fetch_primary_key_cols, + index, table, i, sql_idx); + if (!field) { + continue; + } } - } - if (is_part) { - sql_print_error("Failed to open table %s.\n", - norm_name); + build_template_field(prebuilt, clust_index, index, + table, field, i); } - - sql_print_error("Cannot find or open table %s from\n" - "the internal data dictionary of InnoDB " - "though the .frm file for the\n" - "table exists. 
Maybe you have deleted and " - "recreated InnoDB data\n" - "files but have forgotten to delete the " - "corresponding .frm files\n" - "of InnoDB tables, or you have moved .frm " - "files to another database?\n" - "or, the table contains indexes that this " - "version of the engine\n" - "doesn't support.\n" - "See " REFMAN "innodb-troubleshooting.html\n" - "how you can resolve the problem.\n", - norm_name); - free_share(share); - my_errno = ENOENT; - - DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); } -table_opened: + if (index != clust_index && prebuilt->need_to_access_clustered) { + /* Change rec_field_no's to correspond to the clustered index + record */ + for (i = 0; i < prebuilt->n_template; i++) { - if (ib_table->ibd_file_missing && !thd_tablespace_op(thd)) { - sql_print_error("MySQL is trying to open a table handle but " - "the .ibd file for\ntable %s does not exist.\n" - "Have you deleted the .ibd file from the " - "database directory under\nthe MySQL datadir, " - "or have you used DISCARD TABLESPACE?\n" - "See " REFMAN "innodb-troubleshooting.html\n" - "how you can resolve the problem.\n", - norm_name); - free_share(share); - my_errno = ENOENT; + mysql_row_templ_t* templ + = &prebuilt->mysql_template[i]; - dict_table_decrement_handle_count(ib_table, FALSE); - DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); + templ->rec_field_no = templ->clust_rec_field_no; + } } +} - prebuilt = row_create_prebuilt(ib_table, table->s->stored_rec_length); - - prebuilt->default_rec = table->s->default_values; - ut_ad(prebuilt->default_rec); +/********************************************************************//** +This special handling is really to overcome the limitations of MySQL's +binlogging. We need to eliminate the non-determinism that will arise in +INSERT ... SELECT type of statements, since MySQL binlog only stores the +min value of the autoinc interval. Once that is fixed we can get rid of +the special lock handling. 
+@return DB_SUCCESS if all OK else error code */ +UNIV_INTERN +dberr_t +ha_innobase::innobase_lock_autoinc(void) +/*====================================*/ +{ + DBUG_ENTER("ha_innobase::innobase_lock_autoinc"); + dberr_t error = DB_SUCCESS; - /* Looks like MySQL-3.23 sometimes has primary key number != 0 */ + ut_ad(!srv_read_only_mode); - primary_key = table->s->primary_key; - key_used_on_scan = primary_key; + switch (innobase_autoinc_lock_mode) { + case AUTOINC_NO_LOCKING: + /* Acquire only the AUTOINC mutex. */ + dict_table_autoinc_lock(prebuilt->table); + break; - if (!innobase_build_index_translation(table, ib_table, share)) { - sql_print_error("Build InnoDB index translation table for" - " Table %s failed", name); - } + case AUTOINC_NEW_STYLE_LOCKING: + /* For simple (single/multi) row INSERTs/REPLACEs and RBR + events, we fallback to the old style only if another + transaction has already acquired the AUTOINC lock on + behalf of a LOAD FILE or INSERT ... SELECT etc. type of + statement. */ + if (thd_sql_command(user_thd) == SQLCOM_INSERT + || thd_sql_command(user_thd) == SQLCOM_REPLACE + || thd_sql_command(user_thd) == SQLCOM_END // RBR event + ) { + dict_table_t* ib_table = prebuilt->table; - /* Allocate a buffer for a 'row reference'. A row reference is - a string of bytes of length ref_length which uniquely specifies - a row in our table. Note that MySQL may also compare two row - references for equality by doing a simple memcmp on the strings - of length ref_length! */ + /* Acquire the AUTOINC mutex. */ + dict_table_autoinc_lock(ib_table); - if (!row_table_got_default_clust_index(ib_table)) { + /* We need to check that another transaction isn't + already holding the AUTOINC lock on the table. */ + if (ib_table->n_waiting_or_granted_auto_inc_locks) { + /* Release the mutex to avoid deadlocks and + fall back to old style locking. */ + dict_table_autoinc_unlock(ib_table); + } else { + /* Do not fall back to old style locking. 
*/ + break; + } + } + /* Use old style locking. */ + /* fall through */ + case AUTOINC_OLD_STYLE_LOCKING: + DBUG_EXECUTE_IF("die_if_autoinc_old_lock_style_used", + ut_ad(0);); + error = row_lock_table_autoinc_for_mysql(prebuilt); - prebuilt->clust_index_was_generated = FALSE; + if (error == DB_SUCCESS) { - if (UNIV_UNLIKELY(primary_key >= MAX_KEY)) { - sql_print_error("Table %s has a primary key in " - "InnoDB data dictionary, but not " - "in MySQL!", name); + /* Acquire the AUTOINC mutex. */ + dict_table_autoinc_lock(prebuilt->table); + } + break; - /* This mismatch could cause further problems - if not attended, bring this to the user's attention - by printing a warning in addition to log a message - in the errorlog */ - push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN, - ER_NO_SUCH_INDEX, - "InnoDB: Table %s has a " - "primary key in InnoDB data " - "dictionary, but not in " - "MySQL!", name); + default: + ut_error; + } - /* If primary_key >= MAX_KEY, its (primary_key) - value could be out of bound if continue to index - into key_info[] array. Find InnoDB primary index, - and assign its key_length to ref_length. - In addition, since MySQL indexes are sorted starting - with primary index, unique index etc., initialize - ref_length to the first index key length in - case we fail to find InnoDB cluster index. + DBUG_RETURN(error); +} - Please note, this will not resolve the primary - index mismatch problem, other side effects are - possible if users continue to use the table. - However, we allow this table to be opened so - that user can adopt necessary measures for the - mismatch while still being accessible to the table - date. */ - ref_length = table->key_info[0].key_length; +/********************************************************************//** +Reset the autoinc value in the table. 
+@return DB_SUCCESS if all went well else error code */ +UNIV_INTERN +dberr_t +ha_innobase::innobase_reset_autoinc( +/*================================*/ + ulonglong autoinc) /*!< in: value to store */ +{ + dberr_t error; - /* Find correspoinding cluster index - key length in MySQL's key_info[] array */ - for (ulint i = 0; i < table->s->keys; i++) { - dict_index_t* index; - index = innobase_get_index(i); - if (dict_index_is_clust(index)) { - ref_length = - table->key_info[i].key_length; - } - } - } else { - /* MySQL allocates the buffer for ref. - key_info->key_length includes space for all key - columns + one byte for each column that may be - NULL. ref_length must be as exact as possible to - save space, because all row reference buffers are - allocated based on ref_length. */ + error = innobase_lock_autoinc(); - ref_length = table->key_info[primary_key].key_length; - } - } else { - if (primary_key != MAX_KEY) { - sql_print_error( - "Table %s has no primary key in InnoDB data " - "dictionary, but has one in MySQL! If you " - "created the table with a MySQL version < " - "3.23.54 and did not define a primary key, " - "but defined a unique key with all non-NULL " - "columns, then MySQL internally treats that " - "key as the primary key. 
You can fix this " - "error by dump + DROP + CREATE + reimport " - "of the table.", name); + if (error == DB_SUCCESS) { - /* This mismatch could cause further problems - if not attended, bring this to the user attention - by printing a warning in addition to log a message - in the errorlog */ - push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN, - ER_NO_SUCH_INDEX, - "InnoDB: Table %s has no " - "primary key in InnoDB data " - "dictionary, but has one in " - "MySQL!", name); - } + dict_table_autoinc_initialize(prebuilt->table, autoinc); - prebuilt->clust_index_was_generated = TRUE; + dict_table_autoinc_unlock(prebuilt->table); + } - ref_length = DATA_ROW_ID_LEN; + return(error); +} - /* If we automatically created the clustered index, then - MySQL does not know about it, and MySQL must NOT be aware - of the index used on scan, to make it avoid checking if we - update the column of the index. That is why we assert below - that key_used_on_scan is the undefined value MAX_KEY. - The column is the row id in the automatical generation case, - and it will never be updated anyway. */ +/********************************************************************//** +Store the autoinc value in the table. The autoinc value is only set if +it's greater than the existing autoinc value in the table. 
+@return DB_SUCCESS if all went well else error code */ +UNIV_INTERN +dberr_t +ha_innobase::innobase_set_max_autoinc( +/*==================================*/ + ulonglong auto_inc) /*!< in: value to store */ +{ + dberr_t error; - if (key_used_on_scan != MAX_KEY) { - sql_print_warning( - "Table %s key_used_on_scan is %lu even " - "though there is no primary key inside " - "InnoDB.", name, (ulong) key_used_on_scan); - } + error = innobase_lock_autoinc(); + + if (error == DB_SUCCESS) { + + dict_table_autoinc_update_if_greater(prebuilt->table, auto_inc); + + dict_table_autoinc_unlock(prebuilt->table); } - /* Index block size in InnoDB: used by MySQL in query optimization */ - stats.block_size = 16 * 1024; + return(error); +} - /* Init table lock structure */ - thr_lock_data_init(&share->lock,&lock,(void*) 0); +/********************************************************************//** +Stores a row in an InnoDB database, to the table specified in this +handle. +@return error code */ +UNIV_INTERN +int +ha_innobase::write_row( +/*===================*/ + uchar* record) /*!< in: a row in MySQL format */ +{ + dberr_t error; + int error_result= 0; + ibool auto_inc_used= FALSE; +#ifdef WITH_WSREP + ibool auto_inc_inserted= FALSE; /* if NULL was inserted */ +#endif + ulint sql_command; + trx_t* trx = thd_to_trx(user_thd); - if (prebuilt->table) { - /* We update the highest file format in the system table - space, if this table has higher file format setting. 
*/ + DBUG_ENTER("ha_innobase::write_row"); - trx_sys_file_format_max_upgrade( - (const char**) &innobase_file_format_max, - dict_table_get_format(prebuilt->table)); + if (high_level_read_only) { + ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } else if (prebuilt->trx != trx) { + sql_print_error("The transaction object for the table handle " + "is at %p, but for the current thread it is at " + "%p", + (const void*) prebuilt->trx, (const void*) trx); + + fputs("InnoDB: Dump of 200 bytes around prebuilt: ", stderr); + ut_print_buf(stderr, ((const byte*) prebuilt) - 100, 200); + fputs("\n" + "InnoDB: Dump of 200 bytes around ha_data: ", + stderr); + ut_print_buf(stderr, ((const byte*) trx) - 100, 200); + putc('\n', stderr); + ut_error; + } else if (!trx_is_started(trx)) { + ++trx->will_lock; } - /* Only if the table has an AUTOINC column. */ - if (prebuilt->table != NULL && table->found_next_number_field != NULL) { - dict_table_autoinc_lock(prebuilt->table); + ha_statistic_increment(&SSV::ha_write_count); - /* Since a table can already be "open" in InnoDB's internal - data dictionary, we only init the autoinc counter once, the - first time the table is loaded. We can safely reuse the - autoinc value from a previous MySQL open. 
*/ - if (dict_table_autoinc_read(prebuilt->table) == 0) { + if (share->ib_table != prebuilt->table) { + fprintf(stderr, + "InnoDB: Warning: share->ib_table %p prebuilt->table %p table %s is_corrupt %lu.", + share->ib_table, prebuilt->table, prebuilt->table->name, prebuilt->table->is_corrupt); + } - innobase_initialize_autoinc(); + if (UNIV_UNLIKELY(share->ib_table && share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } + + sql_command = thd_sql_command(user_thd); + + if ((sql_command == SQLCOM_ALTER_TABLE + || sql_command == SQLCOM_OPTIMIZE + || sql_command == SQLCOM_CREATE_INDEX +#ifdef WITH_WSREP + || (wsrep_on(user_thd) && wsrep_load_data_splitting && + sql_command == SQLCOM_LOAD && + !thd_test_options( + user_thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) +#endif /* WITH_WSREP */ + || sql_command == SQLCOM_DROP_INDEX) + && num_write_row >= 10000) { +#ifdef WITH_WSREP + if (wsrep_on(user_thd) && sql_command == SQLCOM_LOAD) { + WSREP_DEBUG("forced trx split for LOAD: %s", + wsrep_thd_query(user_thd)); } +#endif /* WITH_WSREP */ + /* ALTER TABLE is COMMITted at every 10000 copied rows. + The IX table lock for the original table has to be re-issued. + As this method will be called on a temporary table where the + contents of the original table is being copied to, it is + a bit tricky to determine the source table. The cursor + position in the source table need not be adjusted after the + intermediate COMMIT, since writes by other transactions are + being blocked by a MySQL table lock TL_WRITE_ALLOW_READ. */ - dict_table_autoinc_unlock(prebuilt->table); - } + dict_table_t* src_table; + enum lock_mode mode; - info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST); + num_write_row = 0; - DBUG_RETURN(0); -} + /* Commit the transaction. This will release the table + locks, so they have to be acquired again. 
*/ -UNIV_INTERN -handler* -ha_innobase::clone( -/*===============*/ - const char* name, /*!< in: table name */ - MEM_ROOT* mem_root) /*!< in: memory context */ -{ - ha_innobase* new_handler; + /* Altering an InnoDB table */ + /* Get the source table. */ + src_table = lock_get_src_table( + prebuilt->trx, prebuilt->table, &mode); + if (!src_table) { +no_commit: + /* Unknown situation: do not commit */ + /* + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: ALTER TABLE is holding lock" + " on %lu tables!\n", + prebuilt->trx->mysql_n_tables_locked); + */ + ; + } else if (src_table == prebuilt->table) { +#ifdef WITH_WSREP + if (wsrep_on(user_thd) && + wsrep_load_data_splitting && + sql_command == SQLCOM_LOAD && + !thd_test_options(user_thd, + OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) + { + switch (wsrep_run_wsrep_commit(user_thd, wsrep_hton, 1)) + { + case WSREP_TRX_OK: + break; + case WSREP_TRX_SIZE_EXCEEDED: + case WSREP_TRX_CERT_FAIL: + case WSREP_TRX_ERROR: + DBUG_RETURN(1); + } - DBUG_ENTER("ha_innobase::clone"); + if (binlog_hton->commit(binlog_hton, user_thd, 1)) + DBUG_RETURN(1); + wsrep_post_commit(user_thd, TRUE); + } +#endif /* WITH_WSREP */ + /* Source table is not in InnoDB format: + no need to re-acquire locks on it. */ - new_handler = static_cast<ha_innobase*>(handler::clone(name, - mem_root)); - if (new_handler) { - new_handler->prebuilt->select_lock_type - = prebuilt->select_lock_type; + /* Altering to InnoDB format */ + innobase_commit(ht, user_thd, 1); + /* Note that this transaction is still active. */ + trx_register_for_2pc(prebuilt->trx); + /* We will need an IX lock on the destination table. 
*/ + prebuilt->sql_stat_start = TRUE; + } else { +#ifdef WITH_WSREP + if (wsrep_on(user_thd) && + wsrep_load_data_splitting && + sql_command == SQLCOM_LOAD && + !thd_test_options(user_thd, + OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) + { + switch (wsrep_run_wsrep_commit(user_thd, wsrep_hton, 1)) + { + case WSREP_TRX_OK: + break; + case WSREP_TRX_SIZE_EXCEEDED: + case WSREP_TRX_CERT_FAIL: + case WSREP_TRX_ERROR: + DBUG_RETURN(1); + } + + if (binlog_hton->commit(binlog_hton, user_thd, 1)) + DBUG_RETURN(1); + wsrep_post_commit(user_thd, TRUE); + } +#endif /* WITH_WSREP */ + /* Ensure that there are no other table locks than + LOCK_IX and LOCK_AUTO_INC on the destination table. */ + + if (!lock_is_table_exclusive(prebuilt->table, + prebuilt->trx)) { + goto no_commit; + } + + /* Commit the transaction. This will release the table + locks, so they have to be acquired again. */ + innobase_commit(ht, user_thd, 1); + /* Note that this transaction is still active. */ + trx_register_for_2pc(prebuilt->trx); + /* Re-acquire the table lock on the source table. */ + row_lock_table_for_mysql(prebuilt, src_table, mode); + /* We will need an IX lock on the destination table. */ + prebuilt->sql_stat_start = TRUE; + } } - DBUG_RETURN(new_handler); -} + num_write_row++; -UNIV_INTERN -uint -ha_innobase::max_supported_key_part_length() const -{ - /* A table format specific index column length check will be performed - at ha_innobase::add_index() and row_create_index_for_mysql() */ - return(innobase_large_prefix - ? REC_VERSION_56_MAX_INDEX_COL_LEN - : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1); -} + /* This is the case where the table has an auto-increment column */ + if (table->next_number_field && record == table->record[0]) { -/******************************************************************//** -Closes a handle to an InnoDB table. 
-@return 0 */ -UNIV_INTERN -int -ha_innobase::close(void) -/*====================*/ -{ - THD* thd; + /* Reset the error code before calling + innobase_get_auto_increment(). */ + prebuilt->autoinc_error = DB_SUCCESS; + +#ifdef WITH_WSREP + auto_inc_inserted= (table->next_number_field->val_int() == 0); +#endif + + if ((error_result = update_auto_increment())) { + /* We don't want to mask autoinc overflow errors. */ - DBUG_ENTER("ha_innobase::close"); + /* Handle the case where the AUTOINC sub-system + failed during initialization. */ + if (prebuilt->autoinc_error == DB_UNSUPPORTED) { + error_result = ER_AUTOINC_READ_FAILED; + /* Set the error message to report too. */ + my_error(ER_AUTOINC_READ_FAILED, MYF(0)); + goto func_exit; + } else if (prebuilt->autoinc_error != DB_SUCCESS) { + error = prebuilt->autoinc_error; + goto report_error; + } - thd = ha_thd(); - if (thd != NULL) { - innobase_release_temporary_latches(ht, thd); + /* MySQL errors are passed straight back. except for + ER_AUTOINC_READ_FAILED. This can only happen + for values out of range. 
+ */ + goto func_exit; + } + + auto_inc_used = TRUE; } - row_prebuilt_free(prebuilt, FALSE); + if (prebuilt->mysql_template == NULL + || prebuilt->template_type != ROW_MYSQL_WHOLE_ROW) { - if (upd_buf != NULL) { - ut_ad(upd_buf_size != 0); - my_free(upd_buf); - upd_buf = NULL; - upd_buf_size = 0; + /* Build the template used in converting quickly between + the two database formats */ + + build_template(true); } - free_share(share); + innobase_srv_conc_enter_innodb(prebuilt->trx); - /* Tell InnoDB server that there might be work for - utility threads: */ + error = row_insert_for_mysql((byte*) record, prebuilt); + DEBUG_SYNC(user_thd, "ib_after_row_insert"); - srv_active_wake_master_thread(); + /* Handle duplicate key errors */ + if (auto_inc_used) { + ulonglong auto_inc; + ulonglong col_max_value; - DBUG_RETURN(0); -} + /* Note the number of rows processed for this statement, used + by get_auto_increment() to determine the number of AUTO-INC + values to reserve. This is only useful for a mult-value INSERT + and is a statement level counter.*/ + if (trx->n_autoinc_rows > 0) { + --trx->n_autoinc_rows; + } -/* The following accessor functions should really be inside MySQL code! */ + /* We need the upper limit of the col type to check for + whether we update the table autoinc counter or not. */ + col_max_value = innobase_get_int_col_max_value( + table->next_number_field); -/**************************************************************//** -Gets field offset for a field in a table. -@return offset */ -static inline -uint -get_field_offset( -/*=============*/ - const TABLE* table, /*!< in: MySQL table object */ - const Field* field) /*!< in: MySQL field object */ -{ - return((uint) (field->ptr - table->record[0])); -} + /* Get the value that MySQL attempted to store in the table.*/ + auto_inc = table->next_number_field->val_uint(); -/**************************************************************//** -Checks if a field in a record is SQL NULL. 
Uses the record format -information in table to track the null bit in record. -@return 1 if NULL, 0 otherwise */ -static inline -uint -field_in_record_is_null( -/*====================*/ - TABLE* table, /*!< in: MySQL table object */ - Field* field, /*!< in: MySQL field object */ - char* record) /*!< in: a row in MySQL format */ -{ - int null_offset; + switch (error) { + case DB_DUPLICATE_KEY: - if (!field->null_ptr) { + /* A REPLACE command and LOAD DATA INFILE REPLACE + handle a duplicate key error themselves, but we + must update the autoinc counter if we are performing + those statements. */ - return(0); - } + switch (sql_command) { + case SQLCOM_LOAD: + if (trx->duplicates) { - null_offset = (uint) ((char*) field->null_ptr - - (char*) table->record[0]); + goto set_max_autoinc; + } + break; - if (record[null_offset] & field->null_bit) { + case SQLCOM_REPLACE: + case SQLCOM_INSERT_SELECT: + case SQLCOM_REPLACE_SELECT: + goto set_max_autoinc; - return(1); - } +#ifdef WITH_WSREP + /* workaround for LP bug #355000, retrying the insert */ + case SQLCOM_INSERT: - return(0); -} + WSREP_DEBUG("DUPKEY error for autoinc\n" + "THD %ld, value %llu, off %llu inc %llu", + wsrep_thd_thread_id(current_thd), + auto_inc, + prebuilt->autoinc_offset, + prebuilt->autoinc_increment); -/*************************************************************//** -InnoDB uses this function to compare two data fields for which the data type -is such that we must use MySQL code to compare them. NOTE that the prototype -of this function is in rem0cmp.c in InnoDB source code! If you change this -function, remember to update the prototype there! 
-@return 1, 0, -1, if a is greater, equal, less than b, respectively */ -extern "C" UNIV_INTERN -int -innobase_mysql_cmp( -/*===============*/ - int mysql_type, /*!< in: MySQL type */ - uint charset_number, /*!< in: number of the charset */ - const unsigned char* a, /*!< in: data field */ - unsigned int a_length, /*!< in: data field length, - not UNIV_SQL_NULL */ - const unsigned char* b, /*!< in: data field */ - unsigned int b_length) /*!< in: data field length, - not UNIV_SQL_NULL */ -{ - CHARSET_INFO* charset; - enum_field_types mysql_tp; - int ret; + if (wsrep_on(current_thd) && + auto_inc_inserted && + wsrep_drupal_282555_workaround && + wsrep_thd_retry_counter(current_thd) == 0 && + !thd_test_options(current_thd, + OPTION_NOT_AUTOCOMMIT | + OPTION_BEGIN)) { + WSREP_DEBUG( + "retrying insert: %s", + (*wsrep_thd_query(current_thd)) ? + wsrep_thd_query(current_thd) : + (char *)"void"); + error= DB_SUCCESS; + wsrep_thd_set_conflict_state( + current_thd, MUST_ABORT); + innobase_srv_conc_exit_innodb(prebuilt->trx); + /* jump straight to func exit over + * later wsrep hooks */ + goto func_exit; + } + break; +#endif /* WITH_WSREP */ - DBUG_ASSERT(a_length != UNIV_SQL_NULL); - DBUG_ASSERT(b_length != UNIV_SQL_NULL); + default: + break; + } - mysql_tp = (enum_field_types) mysql_type; + break; - switch (mysql_tp) { + case DB_SUCCESS: + /* If the actual value inserted is greater than + the upper limit of the interval, then we try and + update the table upper limit. Note: last_value + will be 0 if get_auto_increment() was not called.*/ - case MYSQL_TYPE_BIT: - case MYSQL_TYPE_STRING: - case MYSQL_TYPE_VAR_STRING: - case MYSQL_TYPE_TINY_BLOB: - case MYSQL_TYPE_MEDIUM_BLOB: - case MYSQL_TYPE_BLOB: - case MYSQL_TYPE_LONG_BLOB: - case MYSQL_TYPE_VARCHAR: - /* Use the charset number to pick the right charset struct for - the comparison. 
Since the MySQL function get_charset may be - slow before Bar removes the mutex operation there, we first - look at 2 common charsets directly. */ + if (auto_inc >= prebuilt->autoinc_last_value) { +set_max_autoinc: + /* This should filter out the negative + values set explicitly by the user. */ + if (auto_inc <= col_max_value) { + ut_a(prebuilt->autoinc_increment > 0); - if (charset_number == default_charset_info->number) { - charset = default_charset_info; - } else if (charset_number == my_charset_latin1.number) { - charset = &my_charset_latin1; - } else { - charset = get_charset(charset_number, MYF(MY_WME)); + ulonglong offset; + ulonglong increment; + dberr_t err; - if (charset == NULL) { - sql_print_error("InnoDB needs charset %lu for doing " - "a comparison, but MySQL cannot " - "find that charset.", - (ulong) charset_number); - ut_a(0); + offset = prebuilt->autoinc_offset; + increment = prebuilt->autoinc_increment; + + auto_inc = innobase_next_autoinc( + auto_inc, + 1, increment, offset, + col_max_value); + + err = innobase_set_max_autoinc( + auto_inc); + + if (err != DB_SUCCESS) { + error = err; + } + } } + break; + default: + break; } + } - /* Starting from 4.1.3, we use strnncollsp() in comparisons of - non-latin1_swedish_ci strings. NOTE that the collation order - changes then: 'b\0\0...' is ordered BEFORE 'b ...'. Users - having indexes on such data need to rebuild their tables! 
*/ + innobase_srv_conc_exit_innodb(prebuilt->trx); - ret = charset->coll->strnncollsp(charset, - a, a_length, - b, b_length, 0); - if (ret < 0) { - return(-1); - } else if (ret > 0) { - return(1); - } else { - return(0); - } - default: - ut_error; +report_error: + if (error == DB_TABLESPACE_DELETED) { + ib_senderrf( + trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); } - return(0); -} + error_result = convert_error_code_to_mysql(error, + prebuilt->table->flags, + user_thd); + #ifdef WITH_WSREP - if (!error_result && - wsrep_thd_exec_mode(user_thd) == LOCAL_STATE && - wsrep_on(user_thd) && - !wsrep_consistency_check(user_thd) && - !wsrep_thd_skip_append_keys(user_thd)) - { - if (wsrep_append_keys(user_thd, false, record, NULL)) - { -extern "C" UNIV_INTERN -int -wsrep_innobase_mysql_sort( -/*===============*/ - /* out: str contains sort string */ - int mysql_type, /* in: MySQL type */ - uint charset_number, /* in: number of the charset */ - unsigned char* str, /* in: data field */ - unsigned int str_length, /* in: data field length, - not UNIV_SQL_NULL */ - unsigned int buf_length) /* in: total str buffer length */ ++ if (!error_result ++ && wsrep_on(user_thd) ++ && wsrep_thd_exec_mode(user_thd) == LOCAL_STATE ++ && !wsrep_consistency_check(user_thd) ++ && !wsrep_thd_skip_append_keys(user_thd)) { ++ if (wsrep_append_keys(user_thd, false, record, NULL)) { + DBUG_PRINT("wsrep", ("row key failed")); + error_result = HA_ERR_INTERNAL_ERROR; + goto wsrep_error; + } + } +wsrep_error: +#endif /* WITH_WSREP */ + + if (error_result == HA_FTS_INVALID_DOCID) { + my_error(HA_FTS_INVALID_DOCID, MYF(0)); + } + +func_exit: + innobase_active_small(); + + if (share->ib_table != prebuilt->table) { + fprintf(stderr, + "InnoDB: Warning: share->ib_table %p prebuilt->table %p table %s is_corrupt %lu.", + share->ib_table, prebuilt->table, prebuilt->table->name, prebuilt->table->is_corrupt); + } + + if (UNIV_UNLIKELY(share->ib_table && 
share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } + + DBUG_RETURN(error_result); +} +/**********************************************************************//** +Checks which fields have changed in a row and stores information +of them to an update vector. +@return DB_SUCCESS or error code */ +static +dberr_t +calc_row_difference( +/*================*/ + upd_t* uvect, /*!< in/out: update vector */ + uchar* old_row, /*!< in: old row in MySQL format */ + uchar* new_row, /*!< in: new row in MySQL format */ + TABLE* table, /*!< in: table in MySQL data + dictionary */ + uchar* upd_buff, /*!< in: buffer to use */ + ulint buff_len, /*!< in: buffer length */ + row_prebuilt_t* prebuilt, /*!< in: InnoDB prebuilt struct */ + THD* thd) /*!< in: user thread */ { - CHARSET_INFO* charset; - enum_field_types mysql_tp; - int ret_length = str_length; + uchar* original_upd_buff = upd_buff; + Field* field; + enum_field_types field_mysql_type; + uint n_fields; + ulint o_len; + ulint n_len; + ulint col_pack_len; + const byte* new_mysql_row_col; + const byte* o_ptr; + const byte* n_ptr; + byte* buf; + upd_field_t* ufield; + ulint col_type; + ulint n_changed = 0; + dfield_t dfield; + dict_index_t* clust_index; + uint sql_idx, innodb_idx= 0; + ibool changes_fts_column = FALSE; + ibool changes_fts_doc_col = FALSE; + trx_t* trx = thd_to_trx(thd); + doc_id_t doc_id = FTS_NULL_DOC_ID; - DBUG_ASSERT(str_length != UNIV_SQL_NULL); + ut_ad(!srv_read_only_mode); - mysql_tp = (enum_field_types) mysql_type; + n_fields = table->s->fields; + clust_index = dict_table_get_first_index(prebuilt->table); - switch (mysql_tp) { + /* We use upd_buff to convert changed fields */ + buf = (byte*) upd_buff; - case MYSQL_TYPE_BIT: - case MYSQL_TYPE_STRING: - case MYSQL_TYPE_VAR_STRING: - case MYSQL_TYPE_TINY_BLOB: - case MYSQL_TYPE_MEDIUM_BLOB: - case MYSQL_TYPE_BLOB: - case MYSQL_TYPE_LONG_BLOB: - case MYSQL_TYPE_VARCHAR: - { - uchar tmp_str[REC_VERSION_56_MAX_INDEX_COL_LEN]; - uint tmp_length = 
REC_VERSION_56_MAX_INDEX_COL_LEN; + for (sql_idx = 0; sql_idx < n_fields; sql_idx++) { + field = table->field[sql_idx]; + if (!field->stored_in_db) + continue; - /* Use the charset number to pick the right charset struct for - the comparison. Since the MySQL function get_charset may be - slow before Bar removes the mutex operation there, we first - look at 2 common charsets directly. */ + o_ptr = (const byte*) old_row + get_field_offset(table, field); + n_ptr = (const byte*) new_row + get_field_offset(table, field); - if (charset_number == default_charset_info->number) { - charset = default_charset_info; - } else if (charset_number == my_charset_latin1.number) { - charset = &my_charset_latin1; - } else { - charset = get_charset(charset_number, MYF(MY_WME)); + /* Use new_mysql_row_col and col_pack_len save the values */ - if (charset == NULL) { - sql_print_error("InnoDB needs charset %lu for doing " - "a comparison, but MySQL cannot " - "find that charset.", - (ulong) charset_number); - ut_a(0); - } - } + new_mysql_row_col = n_ptr; + col_pack_len = field->pack_length(); - ut_a(str_length <= tmp_length); - memcpy(tmp_str, str, str_length); + o_len = col_pack_len; + n_len = col_pack_len; - if (wsrep_protocol_version < 3) { - tmp_length = charset->coll->strnxfrm( - charset, str, str_length, - tmp_str, str_length); - DBUG_ASSERT(tmp_length <= str_length); - } else { - /* strnxfrm will expand the destination string, - protocols < 3 truncated the sorted sring - protocols > 3 gets full sorted sring - */ - /* 5.5 strnxfrm pads the tail with spaces and - always returns the full destination buffer lenght - we cannot know how many characters were converted - using 2 * str length here as best guess - */ - uint dst_length = (str_length * 2 < tmp_length) ? 
- (str_length * 2) : tmp_length; - tmp_length = charset->coll->strnxfrm( - charset, str, dst_length, - tmp_str, str_length); - DBUG_ASSERT(tmp_length <= buf_length); - ret_length = tmp_length; - } - - break; - } - case MYSQL_TYPE_DECIMAL : - case MYSQL_TYPE_TINY : - case MYSQL_TYPE_SHORT : - case MYSQL_TYPE_LONG : - case MYSQL_TYPE_FLOAT : - case MYSQL_TYPE_DOUBLE : - case MYSQL_TYPE_NULL : - case MYSQL_TYPE_TIMESTAMP : - case MYSQL_TYPE_LONGLONG : - case MYSQL_TYPE_INT24 : - case MYSQL_TYPE_DATE : - case MYSQL_TYPE_TIME : - case MYSQL_TYPE_DATETIME : - case MYSQL_TYPE_YEAR : - case MYSQL_TYPE_NEWDATE : - case MYSQL_TYPE_NEWDECIMAL : - case MYSQL_TYPE_ENUM : - case MYSQL_TYPE_SET : - case MYSQL_TYPE_GEOMETRY : - break; - default: - break; - } + /* We use o_ptr and n_ptr to dig up the actual data for + comparison. */ - return ret_length; -} -#endif // WITH_WSREP -/**************************************************************//** -Converts a MySQL type to an InnoDB type. Note that this function returns -the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1 -VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'. -@return DATA_BINARY, DATA_VARCHAR, ... 
*/ -extern "C" UNIV_INTERN -ulint -get_innobase_type_from_mysql_type( -/*==============================*/ - ulint* unsigned_flag, /*!< out: DATA_UNSIGNED if an - 'unsigned type'; - at least ENUM and SET, - and unsigned integer - types are 'unsigned types' */ - const void* f) /*!< in: MySQL Field */ -{ - const class Field* field = reinterpret_cast<const class Field*>(f); + field_mysql_type = field->type(); - /* The following asserts try to check that the MySQL type code fits in - 8 bits: this is used in ibuf and also when DATA_NOT_NULL is ORed to - the type */ + col_type = prebuilt->table->cols[innodb_idx].mtype; - DBUG_ASSERT((ulint)MYSQL_TYPE_STRING < 256); - DBUG_ASSERT((ulint)MYSQL_TYPE_VAR_STRING < 256); - DBUG_ASSERT((ulint)MYSQL_TYPE_DOUBLE < 256); - DBUG_ASSERT((ulint)MYSQL_TYPE_FLOAT < 256); - DBUG_ASSERT((ulint)MYSQL_TYPE_DECIMAL < 256); + switch (col_type) { - if (field->flags & UNSIGNED_FLAG) { + case DATA_BLOB: + /* Do not compress blob column while comparing*/ + o_ptr = row_mysql_read_blob_ref(&o_len, o_ptr, o_len); + n_ptr = row_mysql_read_blob_ref(&n_len, n_ptr, n_len); - *unsigned_flag = DATA_UNSIGNED; - } else { - *unsigned_flag = 0; - } + break; - if (field->real_type() == MYSQL_TYPE_ENUM - || field->real_type() == MYSQL_TYPE_SET) { + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_VARMYSQL: + if (field_mysql_type == MYSQL_TYPE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR where + the real payload data length is stored in + 1 or 2 bytes */ - /* MySQL has field->type() a string type for these, but the - data is actually internally stored as an unsigned integer - code! 
*/ + o_ptr = row_mysql_read_true_varchar( + &o_len, o_ptr, + (ulint) + (((Field_varstring*) field)->length_bytes)); - *unsigned_flag = DATA_UNSIGNED; /* MySQL has its own unsigned - flag set to zero, even though - internally this is an unsigned - integer type */ - return(DATA_INT); - } + n_ptr = row_mysql_read_true_varchar( + &n_len, n_ptr, + (ulint) + (((Field_varstring*) field)->length_bytes)); + } - switch (field->type()) { - /* NOTE that we only allow string types in DATA_MYSQL and - DATA_VARMYSQL */ - case MYSQL_TYPE_VAR_STRING: /* old <= 4.1 VARCHAR */ - case MYSQL_TYPE_VARCHAR: /* new >= 5.0.3 true VARCHAR */ - if (field->binary()) { - return(DATA_BINARY); - } else if (strcmp( - field->charset()->name, - "latin1_swedish_ci") == 0) { - return(DATA_VARCHAR); - } else { - return(DATA_VARMYSQL); + break; + default: + ; } - case MYSQL_TYPE_BIT: - case MYSQL_TYPE_STRING: if (field->binary()) { - return(DATA_FIXBINARY); - } else if (strcmp( - field->charset()->name, - "latin1_swedish_ci") == 0) { - return(DATA_CHAR); - } else { - return(DATA_MYSQL); + if (field_mysql_type == MYSQL_TYPE_LONGLONG + && prebuilt->table->fts + && innobase_strcasecmp( + field->field_name, FTS_DOC_ID_COL_NAME) == 0) { + doc_id = (doc_id_t) mach_read_from_n_little_endian( + n_ptr, 8); + if (doc_id == 0) { + return(DB_FTS_INVALID_DOCID); + } } - case MYSQL_TYPE_NEWDECIMAL: - return(DATA_FIXBINARY); - case MYSQL_TYPE_LONG: - case MYSQL_TYPE_LONGLONG: - case MYSQL_TYPE_TINY: - case MYSQL_TYPE_SHORT: - case MYSQL_TYPE_INT24: - case MYSQL_TYPE_DATE: - case MYSQL_TYPE_YEAR: - case MYSQL_TYPE_NEWDATE: - return(DATA_INT); - - case MYSQL_TYPE_TIME: - case MYSQL_TYPE_DATETIME: - case MYSQL_TYPE_TIMESTAMP: - /* - XtraDB should ideally just check field->keytype() and never - field->type(). The following check is here to only - change the new hires datetime/timestamp/time fields to - use DATA_FIXBINARY. 
We can't convert this function to - just test for field->keytype() as then the check if a - table is compatible will fail for old tables. - */ - if (field->key_type() == HA_KEYTYPE_BINARY) - return(DATA_FIXBINARY); - return(DATA_INT); - case MYSQL_TYPE_FLOAT: - return(DATA_FLOAT); - case MYSQL_TYPE_DOUBLE: - return(DATA_DOUBLE); - case MYSQL_TYPE_DECIMAL: - return(DATA_DECIMAL); - case MYSQL_TYPE_GEOMETRY: - case MYSQL_TYPE_TINY_BLOB: - case MYSQL_TYPE_MEDIUM_BLOB: - case MYSQL_TYPE_BLOB: - case MYSQL_TYPE_LONG_BLOB: - return(DATA_BLOB); - case MYSQL_TYPE_NULL: - return(DATA_FIXBINARY); - default: - ut_error; - } - - return(0); -} - -/*******************************************************************//** -Writes an unsigned integer value < 64k to 2 bytes, in the little-endian -storage format. */ -static inline -void -innobase_write_to_2_little_endian( -/*==============================*/ - byte* buf, /*!< in: where to store */ - ulint val) /*!< in: value to write, must be < 64k */ -{ - ut_a(val < 256 * 256); - buf[0] = (byte)(val & 0xFF); - buf[1] = (byte)(val / 256); -} -/*******************************************************************//** -Reads an unsigned integer value < 64k from 2 bytes, in the little-endian -storage format. -@return value */ -static inline -uint -innobase_read_from_2_little_endian( -/*===============================*/ - const uchar* buf) /*!< in: from where to read */ -{ - return (uint) ((ulint)(buf[0]) + 256 * ((ulint)(buf[1]))); -} + if (field->real_maybe_null()) { + if (field->is_null_in_record(old_row)) { + o_len = UNIV_SQL_NULL; + } -/*******************************************************************//** -Stores a key value for a row to a buffer. 
-@return key value length as stored in buff */ -#ifdef WITH_WSREP -UNIV_INTERN -uint -wsrep_store_key_val_for_row( -/*===============================*/ - TABLE* table, - uint keynr, /*!< in: key number */ - char* buff, /*!< in/out: buffer for the key value (in MySQL - format) */ - uint buff_len,/*!< in: buffer length */ - const uchar* record, - ibool* key_is_null)/*!< out: full key was null */ -{ - KEY* key_info = table->key_info + keynr; - KEY_PART_INFO* key_part = key_info->key_part; - KEY_PART_INFO* end = key_part + key_info->key_parts; - char* buff_start = buff; - enum_field_types mysql_type; - Field* field; - - DBUG_ENTER("store_key_val_for_row"); + if (field->is_null_in_record(new_row)) { + n_len = UNIV_SQL_NULL; + } + } - bzero(buff, buff_len); - *key_is_null = TRUE; + if (o_len != n_len || (o_len != 0 && o_len != UNIV_SQL_NULL + && 0 != memcmp(o_ptr, n_ptr, o_len))) { + /* The field has changed */ - for (; key_part != end; key_part++) { - uchar sorted[REC_VERSION_56_MAX_INDEX_COL_LEN] = {'\0'}; - ibool part_is_null = FALSE; + ufield = uvect->fields + n_changed; + UNIV_MEM_INVALID(ufield, sizeof *ufield); - if (key_part->null_bit) { - if (record[key_part->null_offset] & - key_part->null_bit) { - *buff = 1; - part_is_null = TRUE; + /* Let us use a dummy dfield to make the conversion + from the MySQL column format to the InnoDB format */ + + if (n_len != UNIV_SQL_NULL) { + dict_col_copy_type(prebuilt->table->cols + innodb_idx, + dfield_get_type(&dfield)); + + buf = row_mysql_store_col_in_innobase_format( + &dfield, + (byte*) buf, + TRUE, + new_mysql_row_col, + col_pack_len, + dict_table_is_comp(prebuilt->table)); + dfield_copy(&ufield->new_val, &dfield); } else { - *buff = 0; + dfield_set_null(&ufield->new_val); } - buff++; - } - if (!part_is_null) *key_is_null = FALSE; - field = key_part->field; - mysql_type = field->type(); + ufield->exp = NULL; + ufield->orig_len = 0; + ufield->field_no = dict_col_get_clust_pos( + &prebuilt->table->cols[innodb_idx], 
clust_index); + n_changed++; - if (mysql_type == MYSQL_TYPE_VARCHAR) { - /* >= 5.0.3 true VARCHAR */ - ulint lenlen; - ulint len; - const byte* data; - ulint key_len; - ulint true_len; - CHARSET_INFO* cs; - int error=0; + /* If an FTS indexed column was changed by this + UPDATE then we need to inform the FTS sub-system. - key_len = key_part->length; + NOTE: Currently we re-index all FTS indexed columns + even if only a subset of the FTS indexed columns + have been updated. That is the reason we are + checking only once here. Later we will need to + note which columns have been updated and do + selective processing. */ + if (prebuilt->table->fts != NULL) { + ulint offset; + dict_table_t* innodb_table; - if (part_is_null) { - buff += key_len + 2; + innodb_table = prebuilt->table; - continue; + if (!changes_fts_column) { + offset = row_upd_changes_fts_column( + innodb_table, ufield); + + if (offset != ULINT_UNDEFINED) { + changes_fts_column = TRUE; + } + } + + if (!changes_fts_doc_col) { + changes_fts_doc_col = + row_upd_changes_doc_id( + innodb_table, ufield); + } + } + } + if (field->stored_in_db) + innodb_idx++; + } + + /* If the update changes a column with an FTS index on it, we + then add an update column node with a new document id to the + other changes. We piggy back our changes on the normal UPDATE + to reduce processing and IO overhead. */ + if (!prebuilt->table->fts) { + trx->fts_next_doc_id = 0; + } else if (changes_fts_column || changes_fts_doc_col) { + dict_table_t* innodb_table = prebuilt->table; + + ufield = uvect->fields + n_changed; + + if (!DICT_TF2_FLAG_IS_SET( + innodb_table, DICT_TF2_FTS_HAS_DOC_ID)) { + + /* If Doc ID is managed by user, and if any + FTS indexed column has been updated, its corresponding + Doc ID must also be updated. 
Otherwise, return + error */ + if (changes_fts_column && !changes_fts_doc_col) { + ut_print_timestamp(stderr); + fprintf(stderr, " InnoDB: A new Doc ID" + " must be supplied while updating" + " FTS indexed columns.\n"); + return(DB_FTS_INVALID_DOCID); } - cs = field->charset(); - lenlen = (ulint) - (((Field_varstring*)field)->length_bytes); + /* Doc ID must monotonically increase */ + ut_ad(innodb_table->fts->cache); + if (doc_id < prebuilt->table->fts->cache->next_doc_id) { + fprintf(stderr, + "InnoDB: FTS Doc ID must be larger than" + " " IB_ID_FMT " for table", + innodb_table->fts->cache->next_doc_id + - 1); + ut_print_name(stderr, trx, + TRUE, innodb_table->name); + putc('\n', stderr); + + return(DB_FTS_INVALID_DOCID); + } else if ((doc_id + - prebuilt->table->fts->cache->next_doc_id) + >= FTS_DOC_ID_MAX_STEP) { + fprintf(stderr, + "InnoDB: Doc ID " UINT64PF " is too" + " big. Its difference with largest" + " Doc ID used " UINT64PF " cannot" + " exceed or equal to %d\n", + doc_id, + prebuilt->table->fts->cache->next_doc_id - 1, + FTS_DOC_ID_MAX_STEP); + } - data = row_mysql_read_true_varchar(&len, - (byte*) (record - + (ulint)get_field_offset(table, field)), - lenlen); - true_len = len; + trx->fts_next_doc_id = doc_id; + } else { + /* If the Doc ID is a hidden column, it can't be + changed by user */ + ut_ad(!changes_fts_doc_col); - /* For multi byte character sets we need to calculate - the true length of the key */ + /* Doc ID column is hidden, a new Doc ID will be + generated by following fts_update_doc_id() call */ + trx->fts_next_doc_id = 0; + } - if (len > 0 && cs->mbmaxlen > 1) { - true_len = (ulint) cs->cset->well_formed_len(cs, - (const char *) data, - (const char *) data + len, - (uint) (key_len / - cs->mbmaxlen), - &error); - } + fts_update_doc_id( + innodb_table, ufield, &trx->fts_next_doc_id); - /* In a column prefix index, we may need to truncate - the stored value: */ + ++n_changed; + } else { + /* We have a Doc ID column, but none of FTS indexed 
+ columns are touched, nor the Doc ID column, so set + fts_next_doc_id to UINT64_UNDEFINED, which means do not + update the Doc ID column */ + trx->fts_next_doc_id = UINT64_UNDEFINED; + } - if (true_len > key_len) { - true_len = key_len; - } + uvect->n_fields = n_changed; + uvect->info_bits = 0; - memcpy(sorted, data, true_len); - true_len = wsrep_innobase_mysql_sort( - mysql_type, cs->number, sorted, true_len, - REC_VERSION_56_MAX_INDEX_COL_LEN); + ut_a(buf <= (byte*) original_upd_buff + buff_len); - if (wsrep_protocol_version > 1) { - memcpy(buff, sorted, true_len); - /* Note that we always reserve the maximum possible - length of the true VARCHAR in the key value, though - only len first bytes after the 2 length bytes contain - actual data. The rest of the space was reset to zero - in the bzero() call above. */ - buff += true_len; - } else { - buff += key_len; - } - } else if (mysql_type == MYSQL_TYPE_TINY_BLOB - || mysql_type == MYSQL_TYPE_MEDIUM_BLOB - || mysql_type == MYSQL_TYPE_BLOB - || mysql_type == MYSQL_TYPE_LONG_BLOB - /* MYSQL_TYPE_GEOMETRY data is treated - as BLOB data in innodb. 
*/ - || mysql_type == MYSQL_TYPE_GEOMETRY) { + return(DB_SUCCESS); +} - CHARSET_INFO* cs; - ulint key_len; - ulint true_len; - int error=0; - ulint blob_len; - const byte* blob_data; +#ifdef WITH_WSREP +static +int +wsrep_calc_row_hash( +/*================*/ + byte* digest, /*!< in/out: md5 sum */ + const uchar* row, /*!< in: row in MySQL format */ + TABLE* table, /*!< in: table in MySQL data + dictionary */ + row_prebuilt_t* prebuilt, /*!< in: InnoDB prebuilt struct */ + THD* thd) /*!< in: user thread */ +{ + Field* field; + enum_field_types field_mysql_type; + uint n_fields; + ulint len; + const byte* ptr; + ulint col_type; + uint i; - ut_a(key_part->key_part_flag & HA_PART_KEY_SEG); + void *ctx = wsrep_md5_init(); - key_len = key_part->length; + n_fields = table->s->fields; - if (part_is_null) { - buff += key_len + 2; + for (i = 0; i < n_fields; i++) { + byte null_byte=0; + byte true_byte=1; - continue; - } + field = table->field[i]; - cs = field->charset(); + ptr = (const byte*) row + get_field_offset(table, field); + len = field->pack_length(); - blob_data = row_mysql_read_blob_ref(&blob_len, - (byte*) (record - + (ulint)get_field_offset(table, field)), - (ulint) field->pack_length()); + field_mysql_type = field->type(); - true_len = blob_len; + col_type = prebuilt->table->cols[i].mtype; - ut_a(get_field_offset(table, field) - == key_part->offset); + switch (col_type) { - /* For multi byte character sets we need to calculate - the true length of the key */ + case DATA_BLOB: + ptr = row_mysql_read_blob_ref(&len, ptr, len); + break; - if (blob_len > 0 && cs->mbmaxlen > 1) { - true_len = (ulint) cs->cset->well_formed_len(cs, - (const char *) blob_data, - (const char *) blob_data - + blob_len, - (uint) (key_len / - cs->mbmaxlen), - &error); - } + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_VARMYSQL: + if (field_mysql_type == MYSQL_TYPE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR where + the real payload data length is stored in + 1 or 2 bytes */ - 
/* All indexes on BLOB and TEXT are column prefix - indexes, and we may need to truncate the data to be - stored in the key value: */ + ptr = row_mysql_read_true_varchar( + &len, ptr, + (ulint) + (((Field_varstring*)field)->length_bytes)); - if (true_len > key_len) { - true_len = key_len; } - memcpy(sorted, blob_data, true_len); - true_len = wsrep_innobase_mysql_sort( - mysql_type, cs->number, sorted, true_len, - REC_VERSION_56_MAX_INDEX_COL_LEN); + break; + default: + ; + } + /* + if (field->null_ptr && + field_in_record_is_null(table, field, (char*) row)) { + */ + + if (field->is_null_in_record(row)) { + wsrep_md5_update(ctx, (char*)&null_byte, 1); + } else { + wsrep_md5_update(ctx, (char*)&true_byte, 1); + wsrep_md5_update(ctx, (char*)ptr, len); + } + } + + wsrep_compute_md5_hash((char*)digest, ctx); + + return(0); +} +#endif /* WITH_WSREP */ +/**********************************************************************//** +Updates a row given as a parameter to a new value. Note that we are given +whole rows, not just the fields which are updated: this incurs some +overhead for CPU when we check which fields are actually updated. +TODO: currently InnoDB does not prevent the 'Halloween problem': +in a searched update a single row can get updated several times +if its index columns are updated! +@return error number or 0 */ +UNIV_INTERN +int +ha_innobase::update_row( +/*====================*/ + const uchar* old_row, /*!< in: old row in MySQL format */ + uchar* new_row) /*!< in: new row in MySQL format */ +{ + upd_t* uvect; + dberr_t error; + trx_t* trx = thd_to_trx(user_thd); - memcpy(buff, sorted, true_len); + DBUG_ENTER("ha_innobase::update_row"); - /* Note that we always reserve the maximum possible - length of the BLOB prefix in the key value. */ - if (wsrep_protocol_version > 1) { - buff += true_len; - } else { - buff += key_len; - } - } else { - /* Here we handle all other data types except the - true VARCHAR, BLOB and TEXT. 
Note that the column - value we store may be also in a column prefix - index. */ + ut_a(prebuilt->trx == trx); - CHARSET_INFO* cs; - ulint true_len; - ulint key_len; - const uchar* src_start; - int error=0; - enum_field_types real_type; + if (high_level_read_only) { + ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } else if (!trx_is_started(trx)) { + ++trx->will_lock; + } - key_len = key_part->length; + if (upd_buf == NULL) { + ut_ad(upd_buf_size == 0); - if (part_is_null) { - buff += key_len; + /* Create a buffer for packing the fields of a record. Why + table->stored_rec_length did not work here? Obviously, because char + fields when packed actually became 1 byte longer, when we also + stored the string length as the first byte. */ - continue; - } + upd_buf_size = table->s->stored_rec_length + table->s->max_key_length + + MAX_REF_PARTS * 3; + upd_buf = (uchar*) my_malloc(upd_buf_size, MYF(MY_WME)); + if (upd_buf == NULL) { + upd_buf_size = 0; + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + } + } - src_start = record + key_part->offset; - real_type = field->real_type(); - true_len = key_len; + ha_statistic_increment(&SSV::ha_update_count); - /* Character set for the field is defined only - to fields whose type is string and real field - type is not enum or set. For these fields check - if character set is multi byte. 
*/ + if (share->ib_table != prebuilt->table) { + fprintf(stderr, + "InnoDB: Warning: share->ib_table %p prebuilt->table %p table %s is_corrupt %lu.", + share->ib_table, prebuilt->table, prebuilt->table->name, prebuilt->table->is_corrupt); + } - if (real_type != MYSQL_TYPE_ENUM - && real_type != MYSQL_TYPE_SET - && ( mysql_type == MYSQL_TYPE_VAR_STRING - || mysql_type == MYSQL_TYPE_STRING)) { + if (UNIV_UNLIKELY(share->ib_table && share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } - cs = field->charset(); + if (prebuilt->upd_node) { + uvect = prebuilt->upd_node->update; + } else { + uvect = row_get_prebuilt_update_vector(prebuilt); + } - /* For multi byte character sets we need to - calculate the true length of the key */ + /* Build an update vector from the modified fields in the rows + (uses upd_buf of the handle) */ - if (key_len > 0 && cs->mbmaxlen > 1) { + error = calc_row_difference(uvect, (uchar*) old_row, new_row, table, + upd_buf, upd_buf_size, prebuilt, user_thd); - true_len = (ulint) - cs->cset->well_formed_len(cs, - (const char *)src_start, - (const char *)src_start - + key_len, - (uint) (key_len / - cs->mbmaxlen), - &error); - } - memcpy(sorted, src_start, true_len); - true_len = wsrep_innobase_mysql_sort( - mysql_type, cs->number, sorted, true_len, - REC_VERSION_56_MAX_INDEX_COL_LEN); + if (error != DB_SUCCESS) { + goto func_exit; + } - memcpy(buff, sorted, true_len); - } else { - memcpy(buff, src_start, true_len); - } - buff += true_len; + /* This is not a delete */ + prebuilt->upd_node->is_delete = FALSE; - /* Pad the unused space with spaces. 
*/ + ut_a(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW); -#ifdef REMOVED - if (true_len < key_len) { - ulint pad_len = key_len - true_len; - ut_a(!(pad_len % cs->mbminlen)); + innobase_srv_conc_enter_innodb(trx); - cs->cset->fill(cs, buff, pad_len, - 0x20 /* space */); - buff += pad_len; - } -#endif /* REMOVED */ - } - } + error = row_update_for_mysql((byte*) old_row, prebuilt); - ut_a(buff <= buff_start + buff_len); + /* We need to do some special AUTOINC handling for the following case: - DBUG_RETURN((uint)(buff - buff_start)); -} -#endif /* WITH_WSREP */ -UNIV_INTERN -uint -ha_innobase::store_key_val_for_row( -/*===============================*/ - uint keynr, /*!< in: key number */ - char* buff, /*!< in/out: buffer for the key value (in MySQL - format) */ - uint buff_len,/*!< in: buffer length */ - const uchar* record)/*!< in: row in MySQL format */ -{ - KEY* key_info = table->key_info + keynr; - KEY_PART_INFO* key_part = key_info->key_part; - KEY_PART_INFO* end = key_part + key_info->key_parts; - char* buff_start = buff; - enum_field_types mysql_type; - Field* field; - ibool is_null; + INSERT INTO t (c1,c2) VALUES(x,y) ON DUPLICATE KEY UPDATE ... - DBUG_ENTER("store_key_val_for_row"); + We need to use the AUTOINC counter that was actually used by + MySQL in the UPDATE statement, which can be different from the + value used in the INSERT statement.*/ - /* The format for storing a key field in MySQL is the following: + if (error == DB_SUCCESS + && table->next_number_field + && new_row == table->record[0] + && thd_sql_command(user_thd) == SQLCOM_INSERT + && trx->duplicates) { - 1. If the column can be NULL, then in the first byte we put 1 if the - field value is NULL, 0 otherwise. + ulonglong auto_inc; + ulonglong col_max_value; - 2. If the column is of a BLOB type (it must be a column prefix field - in this case), then we put the length of the data in the field to the - next 2 bytes, in the little-endian format. 
If the field is SQL NULL, - then these 2 bytes are set to 0. Note that the length of data in the - field is <= column prefix length. + auto_inc = table->next_number_field->val_uint(); - 3. In a column prefix field, prefix_len next bytes are reserved for - data. In a normal field the max field length next bytes are reserved - for data. For a VARCHAR(n) the max field length is n. If the stored - value is the SQL NULL then these data bytes are set to 0. + /* We need the upper limit of the col type to check for + whether we update the table autoinc counter or not. */ + col_max_value = innobase_get_int_col_max_value( + table->next_number_field); - 4. We always use a 2 byte length for a true >= 5.0.3 VARCHAR. Note that - in the MySQL row format, the length is stored in 1 or 2 bytes, - depending on the maximum allowed length. But in the MySQL key value - format, the length always takes 2 bytes. + if (auto_inc <= col_max_value && auto_inc != 0) { - We have to zero-fill the buffer so that MySQL is able to use a - simple memcmp to compare two key values to determine if they are - equal. MySQL does this to compare contents of two 'ref' values. 
*/ + ulonglong offset; + ulonglong increment; - bzero(buff, buff_len); + offset = prebuilt->autoinc_offset; + increment = prebuilt->autoinc_increment; - for (; key_part != end; key_part++) { - is_null = FALSE; + auto_inc = innobase_next_autoinc( + auto_inc, 1, increment, offset, col_max_value); - if (key_part->null_bit) { - if (record[key_part->null_offset] - & key_part->null_bit) { - *buff = 1; - is_null = TRUE; - } else { - *buff = 0; - } - buff++; + error = innobase_set_max_autoinc(auto_inc); } + } - field = key_part->field; - mysql_type = field->type(); + innobase_srv_conc_exit_innodb(trx); - if (mysql_type == MYSQL_TYPE_VARCHAR) { - /* >= 5.0.3 true VARCHAR */ - ulint lenlen; - ulint len; - const byte* data; - ulint key_len; - ulint true_len; - CHARSET_INFO* cs; - int error=0; +func_exit: + int err = convert_error_code_to_mysql(error, + prebuilt->table->flags, user_thd); - key_len = key_part->length; + /* If success and no columns were updated. */ + if (err == 0 && uvect->n_fields == 0) { - if (is_null) { - buff += key_len + 2; + /* This is the same as success, but instructs + MySQL that the row is not really updated and it + should not increase the count of updated rows. 
+ This is fix for http://bugs.mysql.com/29157 */ + err = HA_ERR_RECORD_IS_THE_SAME; + } else if (err == HA_FTS_INVALID_DOCID) { + my_error(HA_FTS_INVALID_DOCID, MYF(0)); + } - continue; - } - cs = field->charset(); + /* Tell InnoDB server that there might be work for + utility threads: */ - lenlen = (ulint) - (((Field_varstring*)field)->length_bytes); + innobase_active_small(); - data = row_mysql_read_true_varchar(&len, - (byte*) (record - + (ulint)get_field_offset(table, field)), - lenlen); +#ifdef WITH_WSREP + if (error == DB_SUCCESS && + wsrep_thd_exec_mode(user_thd) == LOCAL_STATE && + wsrep_on(user_thd) && + !wsrep_thd_skip_append_keys(user_thd)) + { + DBUG_PRINT("wsrep", ("update row key")); - true_len = len; + if (wsrep_append_keys(user_thd, false, old_row, new_row)) { + WSREP_DEBUG("WSREP: UPDATE_ROW_KEY FAILED"); + DBUG_PRINT("wsrep", ("row key failed")); + err = HA_ERR_INTERNAL_ERROR; + goto wsrep_error; + } + } +wsrep_error: +#endif /* WITH_WSREP */ - /* For multi byte character sets we need to calculate - the true length of the key */ + if (share->ib_table != prebuilt->table) { + fprintf(stderr, + "InnoDB: Warning: share->ib_table %p prebuilt->table %p table %s is_corrupt %lu.", + share->ib_table, prebuilt->table, prebuilt->table->name, prebuilt->table->is_corrupt); + } - if (len > 0 && cs->mbmaxlen > 1) { - true_len = (ulint) cs->cset->well_formed_len(cs, - (const char *) data, - (const char *) data + len, - (uint) (key_len / - cs->mbmaxlen), - &error); - } + if (UNIV_UNLIKELY(share->ib_table && share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } - /* In a column prefix index, we may need to truncate - the stored value: */ + DBUG_RETURN(err); +} - if (true_len > key_len) { - true_len = key_len; - } +/**********************************************************************//** +Deletes a row given as the parameter. 
+@return error number or 0 */ +UNIV_INTERN +int +ha_innobase::delete_row( +/*====================*/ + const uchar* record) /*!< in: a row in MySQL format */ +{ + dberr_t error; + trx_t* trx = thd_to_trx(user_thd); - /* The length in a key value is always stored in 2 - bytes */ + DBUG_ENTER("ha_innobase::delete_row"); - row_mysql_store_true_var_len((byte*)buff, true_len, 2); - buff += 2; + ut_a(prebuilt->trx == trx); - memcpy(buff, data, true_len); + if (high_level_read_only) { + ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } else if (!trx_is_started(trx)) { + ++trx->will_lock; + } - /* Note that we always reserve the maximum possible - length of the true VARCHAR in the key value, though - only len first bytes after the 2 length bytes contain - actual data. The rest of the space was reset to zero - in the bzero() call above. */ + ha_statistic_increment(&SSV::ha_delete_count); - buff += key_len; + if (UNIV_UNLIKELY(share && share->ib_table + && share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } - } else if (mysql_type == MYSQL_TYPE_TINY_BLOB - || mysql_type == MYSQL_TYPE_MEDIUM_BLOB - || mysql_type == MYSQL_TYPE_BLOB - || mysql_type == MYSQL_TYPE_LONG_BLOB - /* MYSQL_TYPE_GEOMETRY data is treated - as BLOB data in innodb. 
*/ - || mysql_type == MYSQL_TYPE_GEOMETRY) { + if (!prebuilt->upd_node) { + row_get_prebuilt_update_vector(prebuilt); + } - CHARSET_INFO* cs; - ulint key_len; - ulint true_len; - int error=0; - ulint blob_len; - const byte* blob_data; + /* This is a delete */ - ut_a(key_part->key_part_flag & HA_PART_KEY_SEG); + prebuilt->upd_node->is_delete = TRUE; - key_len = key_part->length; + innobase_srv_conc_enter_innodb(trx); - if (is_null) { - buff += key_len + 2; + error = row_update_for_mysql((byte*) record, prebuilt); - continue; - } + innobase_srv_conc_exit_innodb(trx); - cs = field->charset(); + /* Tell the InnoDB server that there might be work for + utility threads: */ - blob_data = row_mysql_read_blob_ref(&blob_len, - (byte*) (record - + (ulint)get_field_offset(table, field)), - (ulint) field->pack_length()); + innobase_active_small(); - true_len = blob_len; +#ifdef WITH_WSREP + if (error == DB_SUCCESS && + wsrep_thd_exec_mode(user_thd) == LOCAL_STATE && + wsrep_on(user_thd) && + !wsrep_thd_skip_append_keys(user_thd)) + { + if (wsrep_append_keys(user_thd, false, record, NULL)) { + DBUG_PRINT("wsrep", ("delete fail")); + error = DB_ERROR; + goto wsrep_error; + } + } +wsrep_error: +#endif /* WITH_WSREP */ - ut_a(get_field_offset(table, field) - == key_part->offset); + if (UNIV_UNLIKELY(share && share->ib_table + && share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } - /* For multi byte character sets we need to calculate - the true length of the key */ + DBUG_RETURN(convert_error_code_to_mysql( + error, prebuilt->table->flags, user_thd)); +} - if (blob_len > 0 && cs->mbmaxlen > 1) { - true_len = (ulint) cs->cset->well_formed_len(cs, - (const char *) blob_data, - (const char *) blob_data - + blob_len, - (uint) (key_len / - cs->mbmaxlen), - &error); - } +/**********************************************************************//** +Removes a new lock set on a row, if it was not read optimistically. 
This can +be called after a row has been read in the processing of an UPDATE or a DELETE +query, if the option innodb_locks_unsafe_for_binlog is set. */ +UNIV_INTERN +void +ha_innobase::unlock_row(void) +/*=========================*/ +{ + DBUG_ENTER("ha_innobase::unlock_row"); - /* All indexes on BLOB and TEXT are column prefix - indexes, and we may need to truncate the data to be - stored in the key value: */ + /* Consistent read does not take any locks, thus there is + nothing to unlock. */ - if (true_len > key_len) { - true_len = key_len; - } + if (prebuilt->select_lock_type == LOCK_NONE) { + DBUG_VOID_RETURN; + } - /* MySQL reserves 2 bytes for the length and the - storage of the number is little-endian */ + /* Ideally, this assert must be in the beginning of the function. + But there are some calls to this function from the SQL layer when the + transaction is in state TRX_STATE_NOT_STARTED. The check on + prebuilt->select_lock_type above gets around this issue. */ + ut_ad(trx_state_eq(prebuilt->trx, TRX_STATE_ACTIVE)); - innobase_write_to_2_little_endian( - (byte*)buff, true_len); - buff += 2; + switch (prebuilt->row_read_type) { + case ROW_READ_WITH_LOCKS: + if (!srv_locks_unsafe_for_binlog + && prebuilt->trx->isolation_level + > TRX_ISO_READ_COMMITTED) { + break; + } + /* fall through */ + case ROW_READ_TRY_SEMI_CONSISTENT: + row_unlock_for_mysql(prebuilt, FALSE); + break; + case ROW_READ_DID_SEMI_CONSISTENT: + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + break; + } - memcpy(buff, blob_data, true_len); + DBUG_VOID_RETURN; +} - /* Note that we always reserve the maximum possible - length of the BLOB prefix in the key value. */ +/* See handler.h and row0mysql.h for docs on this function. 
*/ +UNIV_INTERN +bool +ha_innobase::was_semi_consistent_read(void) +/*=======================================*/ +{ + return(prebuilt->row_read_type == ROW_READ_DID_SEMI_CONSISTENT); +} - buff += key_len; - } else { - /* Here we handle all other data types except the - true VARCHAR, BLOB and TEXT. Note that the column - value we store may be also in a column prefix - index. */ +/* See handler.h and row0mysql.h for docs on this function. */ +UNIV_INTERN +void +ha_innobase::try_semi_consistent_read(bool yes) +/*===========================================*/ +{ + ut_a(prebuilt->trx == thd_to_trx(ha_thd())); - CHARSET_INFO* cs; - ulint true_len; - ulint key_len; - const uchar* src_start; - int error=0; - enum_field_types real_type; + /* Row read type is set to semi consistent read if this was + requested by the MySQL and either innodb_locks_unsafe_for_binlog + option is used or this session is using READ COMMITTED isolation + level. */ - key_len = key_part->length; + if (yes + && (srv_locks_unsafe_for_binlog + || prebuilt->trx->isolation_level <= TRX_ISO_READ_COMMITTED)) { + prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + } else { + prebuilt->row_read_type = ROW_READ_WITH_LOCKS; + } +} - if (is_null) { - buff += key_len; +/******************************************************************//** +Initializes a handle to use an index. +@return 0 or error number */ +UNIV_INTERN +int +ha_innobase::index_init( +/*====================*/ + uint keynr, /*!< in: key (index) number */ + bool sorted) /*!< in: 1 if result MUST be sorted according to index */ +{ + DBUG_ENTER("index_init"); - continue; - } + DBUG_RETURN(change_active_index(keynr)); +} - src_start = record + key_part->offset; - real_type = field->real_type(); - true_len = key_len; +/******************************************************************//** +Currently does nothing. 
+@return 0 */ +UNIV_INTERN +int +ha_innobase::index_end(void) +/*========================*/ +{ + int error = 0; + DBUG_ENTER("index_end"); + active_index = MAX_KEY; + in_range_check_pushed_down = FALSE; + ds_mrr.dsmrr_close(); + DBUG_RETURN(error); +} - /* Character set for the field is defined only - to fields whose type is string and real field - type is not enum or set. For these fields check - if character set is multi byte. */ +/*********************************************************************//** +Converts a search mode flag understood by MySQL to a flag understood +by InnoDB. */ +static inline +ulint +convert_search_mode_to_innobase( +/*============================*/ + enum ha_rkey_function find_flag) +{ + switch (find_flag) { + case HA_READ_KEY_EXACT: + /* this does not require the index to be UNIQUE */ + return(PAGE_CUR_GE); + case HA_READ_KEY_OR_NEXT: + return(PAGE_CUR_GE); + case HA_READ_KEY_OR_PREV: + return(PAGE_CUR_LE); + case HA_READ_AFTER_KEY: + return(PAGE_CUR_G); + case HA_READ_BEFORE_KEY: + return(PAGE_CUR_L); + case HA_READ_PREFIX: + return(PAGE_CUR_GE); + case HA_READ_PREFIX_LAST: + return(PAGE_CUR_LE); + case HA_READ_PREFIX_LAST_OR_PREV: + return(PAGE_CUR_LE); + /* In MySQL-4.0 HA_READ_PREFIX and HA_READ_PREFIX_LAST always + pass a complete-field prefix of a key value as the search + tuple. I.e., it is not allowed that the last field would + just contain n first bytes of the full field value. + MySQL uses a 'padding' trick to convert LIKE 'abc%' + type queries so that it can use as a search tuple + a complete-field-prefix of a key value. Thus, the InnoDB + search mode PAGE_CUR_LE_OR_EXTENDS is never used. + TODO: when/if MySQL starts to use also partial-field + prefixes, we have to deal with stripping of spaces + and comparison of non-latin1 char type fields in + innobase_mysql_cmp() to get PAGE_CUR_LE_OR_EXTENDS to + work correctly. 
*/ + case HA_READ_MBR_CONTAIN: + case HA_READ_MBR_INTERSECT: + case HA_READ_MBR_WITHIN: + case HA_READ_MBR_DISJOINT: + case HA_READ_MBR_EQUAL: + return(PAGE_CUR_UNSUPP); + /* do not use "default:" in order to produce a gcc warning: + enumeration value '...' not handled in switch + (if -Wswitch or -Wall is used) */ + } - if (real_type != MYSQL_TYPE_ENUM - && real_type != MYSQL_TYPE_SET - && ( mysql_type == MYSQL_TYPE_VAR_STRING - || mysql_type == MYSQL_TYPE_STRING)) { + my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "this functionality"); - cs = field->charset(); + return(PAGE_CUR_UNSUPP); +} + +/* + BACKGROUND INFO: HOW A SELECT SQL QUERY IS EXECUTED + --------------------------------------------------- +The following does not cover all the details, but explains how we determine +the start of a new SQL statement, and what is associated with it. - /* For multi byte character sets we need to - calculate the true length of the key */ +For each table in the database the MySQL interpreter may have several +table handle instances in use, also in a single SQL query. For each table +handle instance there is an InnoDB 'prebuilt' struct which contains most +of the InnoDB data associated with this table handle instance. - if (key_len > 0 && cs->mbmaxlen > 1) { + A) if the user has not explicitly set any MySQL table level locks: - true_len = (ulint) - cs->cset->well_formed_len(cs, - (const char *)src_start, - (const char *)src_start - + key_len, - (uint) (key_len / - cs->mbmaxlen), - &error); - } - } + 1) MySQL calls ::external_lock to set an 'intention' table level lock on +the table of the handle instance. There we set +prebuilt->sql_stat_start = TRUE. The flag sql_stat_start should be set +true if we are taking this table handle instance to use in a new SQL +statement issued by the user. We also increment trx->n_mysql_tables_in_use. 
- memcpy(buff, src_start, true_len); - buff += true_len; + 2) If prebuilt->sql_stat_start == TRUE we 'pre-compile' the MySQL search +instructions to prebuilt->template of the table handle instance in +::index_read. The template is used to save CPU time in large joins. - /* Pad the unused space with spaces. */ + 3) In row_search_for_mysql, if prebuilt->sql_stat_start is true, we +allocate a new consistent read view for the trx if it does not yet have one, +or in the case of a locking read, set an InnoDB 'intention' table level +lock on the table. - if (true_len < key_len) { - ulint pad_len = key_len - true_len; - ut_a(!(pad_len % cs->mbminlen)); + 4) We do the SELECT. MySQL may repeatedly call ::index_read for the +same table handle instance, if it is a join. - cs->cset->fill(cs, buff, pad_len, - 0x20 /* space */); - buff += pad_len; - } - } - } + 5) When the SELECT ends, MySQL removes its intention table level locks +in ::external_lock. When trx->n_mysql_tables_in_use drops to zero, + (a) we execute a COMMIT there if the autocommit is on, + (b) we also release possible 'SQL statement level resources' InnoDB may +have for this SQL statement. The MySQL interpreter does NOT execute +autocommit for pure read transactions, though it should. That is why the +table handler in that case has to execute the COMMIT in ::external_lock. - ut_a(buff <= buff_start + buff_len); + B) If the user has explicitly set MySQL table level locks, then MySQL +does NOT call ::external_lock at the start of the statement. To determine +when we are at the start of a new SQL statement we at the start of +::index_read also compare the query id to the latest query id where the +table handle instance was used. If it has changed, we know we are at the +start of a new SQL statement. Since the query id can theoretically +overwrap, we use this test only as a secondary way of determining the +start of a new SQL statement. 
*/ - DBUG_RETURN((uint)(buff - buff_start)); -} -/**************************************************************//** -Determines if a field is needed in a prebuilt struct 'template'. -@return field to use, or NULL if the field is not needed */ -static -const Field* -build_template_needs_field( -/*=======================*/ - ibool index_contains, /*!< in: - dict_index_contains_col_or_prefix( - index, i) */ - ibool read_just_key, /*!< in: TRUE when MySQL calls - ha_innobase::extra with the - argument HA_EXTRA_KEYREAD; it is enough - to read just columns defined in - the index (i.e., no read of the - clustered index record necessary) */ - ibool fetch_all_in_key, - /*!< in: true=fetch all fields in - the index */ - ibool fetch_primary_key_cols, - /*!< in: true=fetch the - primary key columns */ - dict_index_t* index, /*!< in: InnoDB index to use */ - const TABLE* table, /*!< in: MySQL table object */ - ulint i, /*!< in: field index in InnoDB table */ - ulint sql_idx) /*!< in: field index in SQL table */ +/**********************************************************************//** +Positions an index cursor to the index specified in the handle. Fetches the +row if any. 
+@return 0, HA_ERR_KEY_NOT_FOUND, or error number */ +UNIV_INTERN +int +ha_innobase::index_read( +/*====================*/ + uchar* buf, /*!< in/out: buffer for the returned + row */ + const uchar* key_ptr, /*!< in: key value; if this is NULL + we position the cursor at the + start or end of index; this can + also contain an InnoDB row id, in + which case key_len is the InnoDB + row id length; the key value can + also be a prefix of a full key value, + and the last column can be a prefix + of a full column */ + uint key_len,/*!< in: key value length */ + enum ha_rkey_function find_flag)/*!< in: search flags from my_base.h */ { - const Field* field = table->field[sql_idx]; + ulint mode; + dict_index_t* index; + ulint match_mode = 0; + int error; + dberr_t ret; - ut_ad(index_contains == dict_index_contains_col_or_prefix(index, i)); + DBUG_ENTER("index_read"); + DEBUG_SYNC_C("ha_innobase_index_read_begin"); - if (!index_contains) { - if (read_just_key) { - /* If this is a 'key read', we do not need - columns that are not in the key */ + ut_a(prebuilt->trx == thd_to_trx(user_thd)); + ut_ad(key_len != 0 || find_flag != HA_READ_KEY_EXACT); - return(NULL); - } - } else if (fetch_all_in_key) { - /* This field is needed in the query */ + ha_statistic_increment(&SSV::ha_read_key_count); - return(field); + if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share + && share->ib_table && share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); } - if (bitmap_is_set(table->read_set, sql_idx) - || bitmap_is_set(table->write_set, sql_idx)) { - /* This field is needed in the query */ + index = prebuilt->index; - return(field); + if (UNIV_UNLIKELY(index == NULL) || dict_index_is_corrupted(index)) { + prebuilt->index_usable = FALSE; + DBUG_RETURN(HA_ERR_CRASHED); + } + if (UNIV_UNLIKELY(!prebuilt->index_usable)) { + DBUG_RETURN(dict_index_is_corrupted(index) + ? 
HA_ERR_INDEX_CORRUPT + : HA_ERR_TABLE_DEF_CHANGED); } - if (fetch_primary_key_cols - && dict_table_col_in_clustered_key(index->table, i)) { - /* This field is needed in the query */ + if (index->type & DICT_FTS) { + DBUG_RETURN(HA_ERR_KEY_NOT_FOUND); + } - return(field); + /* Note that if the index for which the search template is built is not + necessarily prebuilt->index, but can also be the clustered index */ + + if (prebuilt->sql_stat_start) { + build_template(false); } - /* This field is not needed in the query, skip it */ + if (key_ptr) { + /* Convert the search key value to InnoDB format into + prebuilt->search_tuple */ - return(NULL); -} + row_sel_convert_mysql_key_to_innobase( + prebuilt->search_tuple, + prebuilt->srch_key_val1, + prebuilt->srch_key_val_len, + index, + (byte*) key_ptr, + (ulint) key_len, + prebuilt->trx); + DBUG_ASSERT(prebuilt->search_tuple->n_fields > 0); + } else { + /* We position the cursor to the last or the first entry + in the index */ -/**************************************************************//** -Adds a field is to a prebuilt struct 'template'. 
-@return the field template */ -static -mysql_row_templ_t* -build_template_field( -/*=================*/ - row_prebuilt_t* prebuilt, /*!< in/out: template */ - dict_index_t* clust_index, /*!< in: InnoDB clustered index */ - dict_index_t* index, /*!< in: InnoDB index to use */ - TABLE* table, /*!< in: MySQL table object */ - const Field* field, /*!< in: field in MySQL table */ - ulint i) /*!< in: field index in InnoDB table */ -{ - mysql_row_templ_t* templ; - const dict_col_t* col; + dtuple_set_n_fields(prebuilt->search_tuple, 0); + } - //ut_ad(field == table->field[i]); - ut_ad(clust_index->table == index->table); + mode = convert_search_mode_to_innobase(find_flag); - col = dict_table_get_nth_col(index->table, i); + match_mode = 0; - templ = prebuilt->mysql_template + prebuilt->n_template++; - UNIV_MEM_INVALID(templ, sizeof *templ); - templ->col_no = i; - templ->clust_rec_field_no = dict_col_get_clust_pos(col, clust_index); - ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED); + if (find_flag == HA_READ_KEY_EXACT) { - if (dict_index_is_clust(index)) { - templ->rec_field_no = templ->clust_rec_field_no; - } else { - templ->rec_field_no = dict_index_get_nth_col_pos(index, i); - } + match_mode = ROW_SEL_EXACT; - if (field->null_ptr) { - templ->mysql_null_byte_offset = - (ulint) ((char*) field->null_ptr - - (char*) table->record[0]); + } else if (find_flag == HA_READ_PREFIX + || find_flag == HA_READ_PREFIX_LAST) { - templ->mysql_null_bit_mask = (ulint) field->null_bit; - } else { - templ->mysql_null_bit_mask = 0; + match_mode = ROW_SEL_EXACT_PREFIX; } - templ->mysql_col_offset = (ulint) get_field_offset(table, field); + last_match_mode = (uint) match_mode; - templ->mysql_col_len = (ulint) field->pack_length(); - templ->type = col->mtype; - templ->mysql_type = (ulint)field->type(); + if (mode != PAGE_CUR_UNSUPP) { - if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) { - templ->mysql_length_bytes = (ulint) - (((Field_varstring*)field)->length_bytes); - } + 
innobase_srv_conc_enter_innodb(prebuilt->trx); - templ->charset = dtype_get_charset_coll(col->prtype); - templ->mbminlen = col->mbminlen; - templ->mbmaxlen = col->mbmaxlen; - templ->is_unsigned = col->prtype & DATA_UNSIGNED; + ret = row_search_for_mysql((byte*) buf, mode, prebuilt, + match_mode, 0); - if (!dict_index_is_clust(index) - && templ->rec_field_no == ULINT_UNDEFINED) { - prebuilt->need_to_access_clustered = TRUE; + innobase_srv_conc_exit_innodb(prebuilt->trx); + } else { + + ret = DB_UNSUPPORTED; } - if (prebuilt->mysql_prefix_len < templ->mysql_col_offset - + templ->mysql_col_len) { - prebuilt->mysql_prefix_len = templ->mysql_col_offset - + templ->mysql_col_len; + if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share + && share->ib_table && share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); } - if (templ->type == DATA_BLOB) { - prebuilt->templ_contains_blob = TRUE; + switch (ret) { + case DB_SUCCESS: + error = 0; + table->status = 0; + if (prebuilt->table->is_system_db) { + srv_stats.n_system_rows_read.add( + (size_t) prebuilt->trx->id, 1); + } else { + srv_stats.n_rows_read.add( + (size_t) prebuilt->trx->id, 1); + } + break; + case DB_RECORD_NOT_FOUND: + error = HA_ERR_KEY_NOT_FOUND; + table->status = STATUS_NOT_FOUND; + break; + case DB_END_OF_INDEX: + error = HA_ERR_KEY_NOT_FOUND; + table->status = STATUS_NOT_FOUND; + break; + case DB_TABLESPACE_DELETED: + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_NO_SUCH_TABLE; + break; + case DB_TABLESPACE_NOT_FOUND: + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, MYF(0), + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_NO_SUCH_TABLE; + break; + default: + error = convert_error_code_to_mysql( + ret, prebuilt->table->flags, user_thd); + + table->status = STATUS_NOT_FOUND; + break; } - return(templ); 
+ DBUG_RETURN(error); } -/**************************************************************//** -Builds a 'template' to the prebuilt struct. The template is used in fast -retrieval of just those column values MySQL needs in its processing. */ +/*******************************************************************//** +The following functions works like index_read, but it find the last +row with the current key value or prefix. +@return 0, HA_ERR_KEY_NOT_FOUND, or an error code */ UNIV_INTERN -void -ha_innobase::build_template( -/*========================*/ - bool whole_row) /*!< in: true=ROW_MYSQL_WHOLE_ROW, - false=ROW_MYSQL_REC_FIELDS */ +int +ha_innobase::index_read_last( +/*=========================*/ + uchar* buf, /*!< out: fetched row */ + const uchar* key_ptr,/*!< in: key value, or a prefix of a full + key value */ + uint key_len)/*!< in: length of the key val or prefix + in bytes */ { - dict_index_t* index; - dict_index_t* clust_index; - ulint n_stored_fields; - ibool fetch_all_in_key = FALSE; - ibool fetch_primary_key_cols = FALSE; - ulint i, sql_idx; - - if (prebuilt->select_lock_type == LOCK_X) { - /* We always retrieve the whole clustered index record if we - use exclusive row level locks, for example, if the read is - done in an UPDATE statement. */ + return(index_read(buf, key_ptr, key_len, HA_READ_PREFIX_LAST)); +} - whole_row = true; - } else if (!whole_row) { - if (prebuilt->hint_need_to_fetch_extra_cols - == ROW_RETRIEVE_ALL_COLS) { +/********************************************************************//** +Get the index for a handle. Does not change active index. +@return NULL or index instance. 
*/ +UNIV_INTERN +dict_index_t* +ha_innobase::innobase_get_index( +/*============================*/ + uint keynr) /*!< in: use this index; MAX_KEY means always + clustered index, even if it was internally + generated by InnoDB */ +{ + KEY* key = 0; + dict_index_t* index = 0; - /* We know we must at least fetch all columns in the - key, or all columns in the table */ + DBUG_ENTER("innobase_get_index"); - if (prebuilt->read_just_key) { - /* MySQL has instructed us that it is enough - to fetch the columns in the key; looks like - MySQL can set this flag also when there is - only a prefix of the column in the key: in - that case we retrieve the whole column from - the clustered index */ + if (keynr != MAX_KEY && table->s->keys > 0) { + key = table->key_info + keynr; - fetch_all_in_key = TRUE; - } else { - whole_row = true; - } - } else if (prebuilt->hint_need_to_fetch_extra_cols - == ROW_RETRIEVE_PRIMARY_KEY) { - /* We must at least fetch all primary key cols. Note - that if the clustered index was internally generated - by InnoDB on the row id (no primary key was - defined), then row_search_for_mysql() will always - retrieve the row id to a special buffer in the - prebuilt struct. */ + index = innobase_index_lookup(share, keynr); - fetch_primary_key_cols = TRUE; - } - } + if (index) { - clust_index = dict_table_get_first_index(prebuilt->table); + if (!key || ut_strcmp(index->name, key->name) != 0) { + fprintf(stderr, "InnoDB: [Error] Index for key no %u" + " mysql name %s , InnoDB name %s for table %s\n", + keynr, key ? key->name : "NULL", + index->name, + prebuilt->table->name); - index = whole_row ? clust_index : prebuilt->index; + for(ulint i=0; i < table->s->keys; i++) { + index = innobase_index_lookup(share, i); + key = table->key_info + keynr; - prebuilt->need_to_access_clustered = (index == clust_index); + if (index) { - /* Below we check column by column if we need to access - the clustered index. 
*/ + fprintf(stderr, "InnoDB: [Note] Index for key no %u" + " mysql name %s , InnoDB name %s for table %s\n", + keynr, key ? key->name : "NULL", + index->name, + prebuilt->table->name); + } + } + } - n_stored_fields= (ulint)table->s->stored_fields; /* number of stored columns */ + ut_a(ut_strcmp(index->name, key->name) == 0); + } else { + /* Can't find index with keynr in the translation + table. Only print message if the index translation + table exists */ + if (share->idx_trans_tbl.index_mapping) { + sql_print_warning("InnoDB could not find " + "index %s key no %u for " + "table %s through its " + "index translation table", + key ? key->name : "NULL", + keynr, + prebuilt->table->name); + } - if (!prebuilt->mysql_template) { - prebuilt->mysql_template = (mysql_row_templ_t*) - mem_alloc(n_stored_fields * sizeof(mysql_row_templ_t)); + index = dict_table_get_index_on_name(prebuilt->table, + key->name); + } + } else { + index = dict_table_get_first_index(prebuilt->table); } - prebuilt->template_type = whole_row - ? ROW_MYSQL_WHOLE_ROW : ROW_MYSQL_REC_FIELDS; - prebuilt->null_bitmap_len = table->s->null_bytes; - - /* Prepare to build prebuilt->mysql_template[]. */ - prebuilt->templ_contains_blob = FALSE; - prebuilt->mysql_prefix_len = 0; - prebuilt->n_template = 0; - prebuilt->idx_cond_n_cols = 0; + if (!index) { + sql_print_error( + "Innodb could not find key n:o %u with name %s " + "from dict cache for table %s", + keynr, key ? key->name : "NULL", + prebuilt->table->name); + } - /* Note that in InnoDB, i is the column number in the table. - MySQL calls columns 'fields'. */ + DBUG_RETURN(index); +} - if (active_index != MAX_KEY && active_index == pushed_idx_cond_keyno) { - /* Push down an index condition or an end_range check. */ - for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) { +/********************************************************************//** +Changes the active index of a handle. 
+@return 0 or error code */ +UNIV_INTERN +int +ha_innobase::change_active_index( +/*=============================*/ + uint keynr) /*!< in: use this index; MAX_KEY means always clustered + index, even if it was internally generated by + InnoDB */ +{ + DBUG_ENTER("change_active_index"); - while (!table->field[sql_idx]->stored_in_db) { - sql_idx++; - } - - const ibool index_contains - = dict_index_contains_col_or_prefix(index, i); + if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share + && share->ib_table && share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } - /* Test if an end_range or an index condition - refers to the field. Note that "index" and - "index_contains" may refer to the clustered index. - Index condition pushdown is relative to prebuilt->index - (the index that is being looked up first). */ + ut_ad(user_thd == ha_thd()); + ut_a(prebuilt->trx == thd_to_trx(user_thd)); - /* When join_read_always_key() invokes this - code via handler::ha_index_init() and - ha_innobase::index_init(), end_range is not - yet initialized. Because of that, we must - always check for index_contains, instead of - the subset - field->part_of_key.is_set(active_index) - which would be acceptable if end_range==NULL. */ - if (index == prebuilt->index - ? 
index_contains - : dict_index_contains_col_or_prefix( - prebuilt->index, i)) { - /* Needed in ICP */ - const Field* field; - mysql_row_templ_t* templ; + active_index = keynr; - if (whole_row) { - field = table->field[sql_idx]; - } else { - field = build_template_needs_field( - index_contains, - prebuilt->read_just_key, - fetch_all_in_key, - fetch_primary_key_cols, - index, table, i, sql_idx); - if (!field) { - continue; - } - } + prebuilt->index = innobase_get_index(keynr); - templ = build_template_field( - prebuilt, clust_index, index, - table, field, i); - prebuilt->idx_cond_n_cols++; - ut_ad(prebuilt->idx_cond_n_cols - == prebuilt->n_template); + if (UNIV_UNLIKELY(!prebuilt->index)) { + sql_print_warning("InnoDB: change_active_index(%u) failed", + keynr); + prebuilt->index_usable = FALSE; + DBUG_RETURN(1); + } - if (index == prebuilt->index) { - templ->icp_rec_field_no - = templ->rec_field_no; - } else { - templ->icp_rec_field_no - = dict_index_get_nth_col_pos( - prebuilt->index, i); - } + prebuilt->index_usable = row_merge_is_index_usable(prebuilt->trx, + prebuilt->index); - if (dict_index_is_clust(prebuilt->index)) { - ut_ad(templ->icp_rec_field_no - != ULINT_UNDEFINED); - /* If the primary key includes - a column prefix, use it in - index condition pushdown, - because the condition is - evaluated before fetching any - off-page (externally stored) - columns. */ - if (templ->icp_rec_field_no - < prebuilt->index->n_uniq) { - /* This is a key column; - all set. */ - continue; - } - } else if (templ->icp_rec_field_no - != ULINT_UNDEFINED) { - continue; - } + if (UNIV_UNLIKELY(!prebuilt->index_usable)) { + if (dict_index_is_corrupted(prebuilt->index)) { + char index_name[MAX_FULL_NAME_LEN + 1]; + char table_name[MAX_FULL_NAME_LEN + 1]; - /* This is a column prefix index. - The column prefix can be used in - an end_range comparison. 
*/ + innobase_format_name( + index_name, sizeof index_name, + prebuilt->index->name, TRUE); - templ->icp_rec_field_no - = dict_index_get_nth_col_or_prefix_pos( - prebuilt->index, i, TRUE); - ut_ad(templ->icp_rec_field_no - != ULINT_UNDEFINED); + innobase_format_name( + table_name, sizeof table_name, + prebuilt->index->table->name, FALSE); - /* Index condition pushdown can be used on - all columns of a secondary index, and on - the PRIMARY KEY columns. */ - /* TODO: enable this assertion - (but first ensure that end_range is - valid here and use an accurate condition - for end_range) - ut_ad(!dict_index_is_clust(prebuilt->index) - || templ->rec_field_no - < prebuilt->index->n_uniq); - */ - } + push_warning_printf( + user_thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_INDEX_CORRUPT, + "InnoDB: Index %s for table %s is" + " marked as corrupted", + index_name, table_name); + DBUG_RETURN(HA_ERR_INDEX_CORRUPT); + } else { + push_warning_printf( + user_thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_TABLE_DEF_CHANGED, + "InnoDB: insufficient history for index %u", + keynr); } - ut_ad(prebuilt->idx_cond_n_cols > 0); - ut_ad(prebuilt->idx_cond_n_cols == prebuilt->n_template); - - /* Include the fields that are not needed in index condition - pushdown. */ - for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) { - - while (!table->field[sql_idx]->stored_in_db) { - sql_idx++; - } - - const ibool index_contains - = dict_index_contains_col_or_prefix(index, i); - - if (index == prebuilt->index - ? !index_contains - : !dict_index_contains_col_or_prefix( - prebuilt->index, i)) { - /* Not needed in ICP */ - const Field* field; + /* The caller seems to ignore this. Thus, we must check + this again in row_search_for_mysql(). 
*/ + DBUG_RETURN(convert_error_code_to_mysql(DB_MISSING_HISTORY, + 0, NULL)); + } - if (whole_row) { - field = table->field[sql_idx]; - } else { - field = build_template_needs_field( - index_contains, - prebuilt->read_just_key, - fetch_all_in_key, - fetch_primary_key_cols, - index, table, i, sql_idx); - if (!field) { - continue; - } - } + ut_a(prebuilt->search_tuple != 0); - build_template_field(prebuilt, - clust_index, index, - table, field, i); - } - } + dtuple_set_n_fields(prebuilt->search_tuple, prebuilt->index->n_fields); - prebuilt->idx_cond = this; - } else { - /* No index condition pushdown */ - prebuilt->idx_cond = NULL; + dict_index_copy_types(prebuilt->search_tuple, prebuilt->index, + prebuilt->index->n_fields); - for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) { - const Field* field; + /* MySQL changes the active index for a handle also during some + queries, for example SELECT MAX(a), SUM(a) first retrieves the MAX() + and then calculates the sum. Previously we played safe and used + the flag ROW_MYSQL_WHOLE_ROW below, but that caused unnecessary + copying. Starting from MySQL-4.1 we use a more efficient flag here. */ - while (!table->field[sql_idx]->stored_in_db) { - sql_idx++; - } + build_template(false); - if (whole_row) { - field = table->field[sql_idx]; - } else { - field = build_template_needs_field( - dict_index_contains_col_or_prefix( - index, i), - prebuilt->read_just_key, - fetch_all_in_key, - fetch_primary_key_cols, - index, table, i, sql_idx); - if (!field) { - continue; - } - } + DBUG_RETURN(0); +} - build_template_field(prebuilt, clust_index, index, - table, field, i); - } - } +/**********************************************************************//** +Positions an index cursor to the index specified in keynr. Fetches the +row if any. +??? This is only used to read whole keys ??? 
+@return error number or 0 */ +UNIV_INTERN +int +ha_innobase::index_read_idx( +/*========================*/ + uchar* buf, /*!< in/out: buffer for the returned + row */ + uint keynr, /*!< in: use this index */ + const uchar* key, /*!< in: key value; if this is NULL + we position the cursor at the + start or end of index */ + uint key_len, /*!< in: key value length */ + enum ha_rkey_function find_flag)/*!< in: search flags from my_base.h */ +{ + if (change_active_index(keynr)) { - if (index != clust_index && prebuilt->need_to_access_clustered) { - /* Change rec_field_no's to correspond to the clustered index - record */ - for (i = 0; i < prebuilt->n_template; i++) { - mysql_row_templ_t* templ - = &prebuilt->mysql_template[i]; - templ->rec_field_no = templ->clust_rec_field_no; - } + return(1); } + + return(index_read(buf, key, key_len, find_flag)); } -/********************************************************************//** -This special handling is really to overcome the limitations of MySQL's -binlogging. We need to eliminate the non-determinism that will arise in -INSERT ... SELECT type of statements, since MySQL binlog only stores the -min value of the autoinc interval. Once that is fixed we can get rid of -the special lock handling. -@return DB_SUCCESS if all OK else error code */ +/***********************************************************************//** +Reads the next or previous row from a cursor, which must have previously been +positioned using index_read. 
+@return 0, HA_ERR_END_OF_FILE, or error number */ UNIV_INTERN -ulint -ha_innobase::innobase_lock_autoinc(void) -/*====================================*/ +int +ha_innobase::general_fetch( +/*=======================*/ + uchar* buf, /*!< in/out: buffer for next row in MySQL + format */ + uint direction, /*!< in: ROW_SEL_NEXT or ROW_SEL_PREV */ + uint match_mode) /*!< in: 0, ROW_SEL_EXACT, or + ROW_SEL_EXACT_PREFIX */ { - ulint error = DB_SUCCESS; + dberr_t ret; + int error; - switch (innobase_autoinc_lock_mode) { - case AUTOINC_NO_LOCKING: - /* Acquire only the AUTOINC mutex. */ - dict_table_autoinc_lock(prebuilt->table); - break; + DBUG_ENTER("general_fetch"); - case AUTOINC_NEW_STYLE_LOCKING: - /* For simple (single/multi) row INSERTs/REPLACEs and RBR - events, we fallback to the old style only if another - transaction has already acquired the AUTOINC lock on - behalf of a LOAD FILE or INSERT ... SELECT etc. type of - statement. */ - if (thd_sql_command(user_thd) == SQLCOM_INSERT - || thd_sql_command(user_thd) == SQLCOM_REPLACE - || thd_sql_command(user_thd) == SQLCOM_END // RBR event - ) { - dict_table_t* table = prebuilt->table; + /* If transaction is not startted do not continue, instead return a error code. */ + if(!(prebuilt->sql_stat_start || (prebuilt->trx && prebuilt->trx->state == 1))) { + DBUG_RETURN(HA_ERR_END_OF_FILE); + } - /* Acquire the AUTOINC mutex. */ - dict_table_autoinc_lock(table); + if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share + && share->ib_table && share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } - /* We need to check that another transaction isn't - already holding the AUTOINC lock on the table. */ - if (table->n_waiting_or_granted_auto_inc_locks) { - /* Release the mutex to avoid deadlocks and - fall back to old style locking. */ - dict_table_autoinc_unlock(table); - } else { - /* Do not fall back to old style locking. 
*/ - break; - } - } - /* fall through */ + ut_a(prebuilt->trx == thd_to_trx(user_thd)); - case AUTOINC_OLD_STYLE_LOCKING: - error = row_lock_table_autoinc_for_mysql(prebuilt); + innobase_srv_conc_enter_innodb(prebuilt->trx); - if (error == DB_SUCCESS) { + ret = row_search_for_mysql( + (byte*) buf, 0, prebuilt, match_mode, direction); - /* Acquire the AUTOINC mutex. */ - dict_table_autoinc_lock(prebuilt->table); - } + innobase_srv_conc_exit_innodb(prebuilt->trx); + + if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share + && share->ib_table && share->ib_table->is_corrupt)) { + DBUG_RETURN(HA_ERR_CRASHED); + } + + switch (ret) { + case DB_SUCCESS: + error = 0; + table->status = 0; + srv_stats.n_rows_read.add((size_t) prebuilt->trx->id, 1); + break; + case DB_RECORD_NOT_FOUND: + error = HA_ERR_END_OF_FILE; + table->status = STATUS_NOT_FOUND; + break; + case DB_END_OF_INDEX: + error = HA_ERR_END_OF_FILE; + table->status = STATUS_NOT_FOUND; + break; + case DB_TABLESPACE_DELETED: + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_NO_SUCH_TABLE; break; + case DB_TABLESPACE_NOT_FOUND: + + ib_senderrf( + prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, + table->s->table_name.str); + table->status = STATUS_NOT_FOUND; + error = HA_ERR_NO_SUCH_TABLE; + break; default: - ut_error; + error = convert_error_code_to_mysql( + ret, prebuilt->table->flags, user_thd); + + table->status = STATUS_NOT_FOUND; + break; } - return(ulong(error)); + DBUG_RETURN(error); +} + +/***********************************************************************//** +Reads the next row from a cursor, which must have previously been +positioned using index_read. 
+@return 0, HA_ERR_END_OF_FILE, or error number */ +UNIV_INTERN +int +ha_innobase::index_next( +/*====================*/ + uchar* buf) /*!< in/out: buffer for next row in MySQL + format */ +{ + return(general_fetch(buf, ROW_SEL_NEXT, 0)); +} + +/*******************************************************************//** +Reads the next row matching to the key value given as the parameter. +@return 0, HA_ERR_END_OF_FILE, or error number */ +UNIV_INTERN +int +ha_innobase::index_next_same( +/*=========================*/ + uchar* buf, /*!< in/out: buffer for the row */ + const uchar* key, /*!< in: key value */ + uint keylen) /*!< in: key value length */ +{ + return(general_fetch(buf, ROW_SEL_NEXT, last_match_mode)); +} + +/***********************************************************************//** +Reads the previous row from a cursor, which must have previously been +positioned using index_read. +@return 0, HA_ERR_END_OF_FILE, or error number */ +UNIV_INTERN +int +ha_innobase::index_prev( +/*====================*/ + uchar* buf) /*!< in/out: buffer for previous row in MySQL format */ +{ + return(general_fetch(buf, ROW_SEL_PREV, 0)); } /********************************************************************//** diff --cc storage/xtradb/log/log0online.cc index 1a30501f266,00000000000..1e373c8345f mode 100644,000000..100644 --- a/storage/xtradb/log/log0online.cc +++ b/storage/xtradb/log/log0online.cc @@@ -1,1924 -1,0 +1,1921 @@@ +/***************************************************************************** + +Copyright (c) 2011-2012 Percona Inc. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 51 Franklin +Street, Fifth Floor, Boston, MA 02110-1301, USA + +*****************************************************************************/ + +/**************************************************//** +@file log/log0online.cc +Online database log parsing for changed page tracking + +*******************************************************/ + +#include "log0online.h" + +#include "my_dbug.h" + +#include "log0recv.h" +#include "mach0data.h" +#include "mtr0log.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "trx0sys.h" +#include "ut0rbt.h" + +enum { FOLLOW_SCAN_SIZE = 4 * (UNIV_PAGE_SIZE_MAX) }; + +#ifdef UNIV_PFS_MUTEX +/* Key to register log_bmp_sys->mutex with PFS */ +UNIV_INTERN mysql_pfs_key_t log_bmp_sys_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/** Log parsing and bitmap output data structure */ +struct log_bitmap_struct { + byte* read_buf_ptr; /*!< Unaligned log read buffer */ + byte* read_buf; /*!< log read buffer */ + byte parse_buf[RECV_PARSING_BUF_SIZE]; + /*!< log parse buffer */ + byte* parse_buf_end; /*!< parse buffer position where the + next read log data should be copied to. + If the previous log records were fully + parsed, it points to the start, + otherwise points immediatelly past the + end of the incomplete log record. */ + char bmp_file_home[FN_REFLEN]; + /*!< directory for bitmap files */ + log_online_bitmap_file_t out; /*!< The current bitmap file */ + ulint out_seq_num; /*!< the bitmap file sequence number */ + lsn_t start_lsn; /*!< the LSN of the next unparsed + record and the start of the next LSN + interval to be parsed. 
*/ + lsn_t end_lsn; /*!< the end of the LSN interval to be + parsed, equal to the next checkpoint + LSN at the time of parse */ + lsn_t next_parse_lsn; /*!< the LSN of the next unparsed + record in the current parse */ + ib_rbt_t* modified_pages; /*!< the current modified page set, + organized as the RB-tree with the keys + of (space, 4KB-block-start-page-id) + pairs */ + ib_rbt_node_t* page_free_list; /*!< Singly-linked list of freed nodes + of modified_pages tree for later + reuse. Nodes are linked through + ib_rbt_node_t.left as this field has + both the correct type and the tree does + not mind its overwrite during + rbt_next() tree traversal. */ +}; + +/* The log parsing and bitmap output struct instance */ +static struct log_bitmap_struct* log_bmp_sys; + +/* Mutex protecting log_bmp_sys */ +static ib_mutex_t log_bmp_sys_mutex; + +/** File name stem for bitmap files. */ +static const char* bmp_file_name_stem = "ib_modified_log_"; + +/** File name template for bitmap files. The 1st format tag is a directory +name, the 2nd tag is the stem, the 3rd tag is a file sequence number, the 4th +tag is the start LSN for the file. */ +static const char* bmp_file_name_template = "%s%s%lu_%llu.xdb"; + +/* On server startup with empty database srv_start_lsn == 0, in +which case the first LSN of actual log records will be this. */ +#define MIN_TRACKED_LSN ((LOG_START_LSN) + (LOG_BLOCK_HDR_SIZE)) + +/* Tests if num bit of bitmap is set */ +#define IS_BIT_SET(bitmap, num) \ + (*((bitmap) + ((num) >> 3)) & (1UL << ((num) & 7UL))) + +/** The bitmap file block size in bytes. All writes will be multiples of this. + */ +enum { + MODIFIED_PAGE_BLOCK_SIZE = 4096 +}; + + +/** Offsets in a file bitmap block */ +enum { + MODIFIED_PAGE_IS_LAST_BLOCK = 0,/* 1 if last block in the current + write, 0 otherwise. 
*/ + MODIFIED_PAGE_START_LSN = 4, /* The starting tracked LSN of this and + other blocks in the same write */ + MODIFIED_PAGE_END_LSN = 12, /* The ending tracked LSN of this and + other blocks in the same write */ + MODIFIED_PAGE_SPACE_ID = 20, /* The space ID of tracked pages in + this block */ + MODIFIED_PAGE_1ST_PAGE_ID = 24, /* The page ID of the first tracked + page in this block */ + MODIFIED_PAGE_BLOCK_UNUSED_1 = 28,/* Unused in order to align the start + of bitmap at 8 byte boundary */ + MODIFIED_PAGE_BLOCK_BITMAP = 32,/* Start of the bitmap itself */ + MODIFIED_PAGE_BLOCK_UNUSED_2 = MODIFIED_PAGE_BLOCK_SIZE - 8, + /* Unused in order to align the end of + bitmap at 8 byte boundary */ + MODIFIED_PAGE_BLOCK_CHECKSUM = MODIFIED_PAGE_BLOCK_SIZE - 4 + /* The checksum of the current block */ +}; + +/** Length of the bitmap data in a block in bytes */ +enum { MODIFIED_PAGE_BLOCK_BITMAP_LEN + = MODIFIED_PAGE_BLOCK_UNUSED_2 - MODIFIED_PAGE_BLOCK_BITMAP }; + +/** Length of the bitmap data in a block in page ids */ +enum { MODIFIED_PAGE_BLOCK_ID_COUNT = MODIFIED_PAGE_BLOCK_BITMAP_LEN * 8 }; + +/****************************************************************//** +Provide a comparisson function for the RB-tree tree (space, +block_start_page) pairs. Actual implementation does not matter as +long as the ordering is full. +@return -1 if p1 < p2, 0 if p1 == p2, 1 if p1 > p2 +*/ +static +int +log_online_compare_bmp_keys( +/*========================*/ + const void* p1, /*!<in: 1st key to compare */ + const void* p2) /*!<in: 2nd key to compare */ +{ + const byte *k1 = (const byte *)p1; + const byte *k2 = (const byte *)p2; + + ulint k1_space = mach_read_from_4(k1 + MODIFIED_PAGE_SPACE_ID); + ulint k2_space = mach_read_from_4(k2 + MODIFIED_PAGE_SPACE_ID); + if (k1_space == k2_space) { + ulint k1_start_page + = mach_read_from_4(k1 + MODIFIED_PAGE_1ST_PAGE_ID); + ulint k2_start_page + = mach_read_from_4(k2 + MODIFIED_PAGE_1ST_PAGE_ID); + return k1_start_page < k2_start_page + ? 
-1 : k1_start_page > k2_start_page ? 1 : 0; + } + return k1_space < k2_space ? -1 : 1; +} + +/****************************************************************//** +Set a bit for tracked page in the bitmap. Expand the bitmap tree as +necessary. */ +static +void +log_online_set_page_bit( +/*====================*/ + ulint space, /*!<in: log record space id */ + ulint page_no)/*!<in: log record page id */ +{ + ut_ad(mutex_own(&log_bmp_sys_mutex)); + + ut_a(space != ULINT_UNDEFINED); + ut_a(page_no != ULINT_UNDEFINED); + + ulint block_start_page = page_no / MODIFIED_PAGE_BLOCK_ID_COUNT + * MODIFIED_PAGE_BLOCK_ID_COUNT; + ulint block_pos = block_start_page ? (page_no % block_start_page / 8) + : (page_no / 8); + uint bit_pos = page_no % 8; + + byte search_page[MODIFIED_PAGE_BLOCK_SIZE]; + mach_write_to_4(search_page + MODIFIED_PAGE_SPACE_ID, space); + mach_write_to_4(search_page + MODIFIED_PAGE_1ST_PAGE_ID, + block_start_page); + + byte *page_ptr; + ib_rbt_bound_t tree_search_pos; + if (!rbt_search(log_bmp_sys->modified_pages, &tree_search_pos, + search_page)) { + page_ptr = rbt_value(byte, tree_search_pos.last); + } + else { + ib_rbt_node_t *new_node; + + if (log_bmp_sys->page_free_list) { + new_node = log_bmp_sys->page_free_list; + log_bmp_sys->page_free_list = new_node->left; + } + else { + new_node = static_cast<ib_rbt_node_t *> + (ut_malloc + (SIZEOF_NODE(log_bmp_sys->modified_pages))); + } + memset(new_node, 0, SIZEOF_NODE(log_bmp_sys->modified_pages)); + + page_ptr = rbt_value(byte, new_node); + mach_write_to_4(page_ptr + MODIFIED_PAGE_SPACE_ID, space); + mach_write_to_4(page_ptr + MODIFIED_PAGE_1ST_PAGE_ID, + block_start_page); + + rbt_add_preallocated_node(log_bmp_sys->modified_pages, + &tree_search_pos, new_node); + } + page_ptr[MODIFIED_PAGE_BLOCK_BITMAP + block_pos] |= (1U << bit_pos); +} + +/****************************************************************//** +Calculate a bitmap block checksum. Algorithm borrowed from +log_block_calc_checksum. 
+@return checksum */ +UNIV_INLINE +ulint +log_online_calc_checksum( +/*=====================*/ + const byte* block) /*!<in: bitmap block */ +{ + ulint sum; + ulint sh; + ulint i; + + sum = 1; + sh = 0; + + for (i = 0; i < MODIFIED_PAGE_BLOCK_CHECKSUM; i++) { + + ulint b = block[i]; + sum &= 0x7FFFFFFFUL; + sum += b; + sum += b << sh; + sh++; + if (sh > 24) { + sh = 0; + } + } + + return sum; +} + +/****************************************************************//** +Read one bitmap data page and check it for corruption. + +@return TRUE if page read OK, FALSE if I/O error */ +static +ibool +log_online_read_bitmap_page( +/*========================*/ + log_online_bitmap_file_t *bitmap_file, /*!<in/out: bitmap + file */ + byte *page, /*!<out: read page. + Must be at least + MODIFIED_PAGE_BLOCK_SIZE + bytes long */ + ibool *checksum_ok) /*!<out: TRUE if page + checksum OK */ +{ + ulint checksum; + ulint actual_checksum; + ibool success; + + ut_a(bitmap_file->size >= MODIFIED_PAGE_BLOCK_SIZE); + ut_a(bitmap_file->offset + <= bitmap_file->size - MODIFIED_PAGE_BLOCK_SIZE); + ut_a(bitmap_file->offset % MODIFIED_PAGE_BLOCK_SIZE == 0); + + success = os_file_read(bitmap_file->file, page, bitmap_file->offset, + MODIFIED_PAGE_BLOCK_SIZE); + + if (UNIV_UNLIKELY(!success)) { + + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + ib_logf(IB_LOG_LEVEL_WARN, + "failed reading changed page bitmap file \'%s\'", + bitmap_file->name); + return FALSE; + } + + bitmap_file->offset += MODIFIED_PAGE_BLOCK_SIZE; + ut_ad(bitmap_file->offset <= bitmap_file->size); + + checksum = mach_read_from_4(page + MODIFIED_PAGE_BLOCK_CHECKSUM); + actual_checksum = log_online_calc_checksum(page); + *checksum_ok = (checksum == actual_checksum); + + return TRUE; +} + +/****************************************************************//** +Get the last tracked fully LSN from the bitmap file by reading +backwards untile a correct end page is found. 
Detects incomplete +writes and corrupted data. Sets the start output position for the +written bitmap data. + +Multiple bitmap files are handled using the following assumptions: +1) Only the last file might be corrupted. In case where no good data was found +in the last file, assume that the next to last file is OK. This assumption +does not limit crash recovery capability in any way. +2) If the whole of the last file was corrupted, assume that the start LSN in +its name is correct and use it for (re-)tracking start. + +@return the last fully tracked LSN */ +static +lsn_t +log_online_read_last_tracked_lsn(void) +/*==================================*/ +{ + byte page[MODIFIED_PAGE_BLOCK_SIZE]; + ibool is_last_page = FALSE; + ibool checksum_ok = FALSE; + lsn_t result; + os_offset_t read_offset = log_bmp_sys->out.offset; + + while ((!checksum_ok || !is_last_page) && read_offset > 0) + { + read_offset -= MODIFIED_PAGE_BLOCK_SIZE; + log_bmp_sys->out.offset = read_offset; + + if (!log_online_read_bitmap_page(&log_bmp_sys->out, page, + &checksum_ok)) { + checksum_ok = FALSE; + result = 0; + break; + } + + if (checksum_ok) { + is_last_page + = mach_read_from_4 + (page + MODIFIED_PAGE_IS_LAST_BLOCK); + } else { + + ib_logf(IB_LOG_LEVEL_WARN, + "corruption detected in \'%s\' at offset " + UINT64PF, + log_bmp_sys->out.name, read_offset); + } + }; + + result = (checksum_ok && is_last_page) + ? mach_read_from_8(page + MODIFIED_PAGE_END_LSN) : 0; + + /* Truncate the output file to discard the corrupted bitmap data, if + any */ + if (!os_file_set_eof_at(log_bmp_sys->out.file, + log_bmp_sys->out.offset)) { + ib_logf(IB_LOG_LEVEL_WARN, + "failed truncating changed page bitmap file \'%s\' to " + UINT64PF " bytes", + log_bmp_sys->out.name, log_bmp_sys->out.offset); + result = 0; + } + return result; +} + +/****************************************************************//** +Safely write the log_sys->tracked_lsn value. 
Uses atomic operations +if available, otherwise this field is protected with the log system +mutex. The reader counterpart function is log_get_tracked_lsn() in +log0log.c. */ +UNIV_INLINE +void +log_set_tracked_lsn( +/*================*/ + lsn_t tracked_lsn) /*!<in: new value */ +{ + log_sys->tracked_lsn = tracked_lsn; + os_wmb; +} + +/*********************************************************************//** +Check if missing, if any, LSN interval can be read and tracked using the +current LSN value, the LSN value where the tracking stopped, and the log group +capacity. + +@return TRUE if the missing interval can be tracked or if there's no missing +data. */ +static +ibool +log_online_can_track_missing( +/*=========================*/ + lsn_t last_tracked_lsn, /*!<in: last tracked LSN */ + lsn_t tracking_start_lsn) /*!<in: current LSN */ +{ + /* last_tracked_lsn might be < MIN_TRACKED_LSN in the case of empty + bitmap file, handle this too. */ + last_tracked_lsn = ut_max(last_tracked_lsn, MIN_TRACKED_LSN); + + if (last_tracked_lsn > tracking_start_lsn) { + ib_logf(IB_LOG_LEVEL_ERROR, + "last tracked LSN " LSN_PF " is ahead of tracking " + "start LSN " LSN_PF ". This can be caused by " + "mismatched bitmap files.", + last_tracked_lsn, tracking_start_lsn); + exit(1); + } + + return (last_tracked_lsn == tracking_start_lsn) + || (log_sys->lsn - last_tracked_lsn + <= log_sys->log_group_capacity); +} + + +/****************************************************************//** +Diagnose a gap in tracked LSN range on server startup due to crash or +very fast shutdown and try to close it by tracking the data +immediatelly, if possible. 
*/ +static +void +log_online_track_missing_on_startup( +/*================================*/ + lsn_t last_tracked_lsn, /*!<in: last tracked LSN read from the + bitmap file */ + lsn_t tracking_start_lsn) /*!<in: last checkpoint LSN of the + current server startup */ +{ + ut_ad(last_tracked_lsn != tracking_start_lsn); + ut_ad(srv_track_changed_pages); + + ib_logf(IB_LOG_LEVEL_WARN, "last tracked LSN in \'%s\' is " LSN_PF + ", but the last checkpoint LSN is " LSN_PF ". This might be " + "due to a server crash or a very fast shutdown.", + log_bmp_sys->out.name, last_tracked_lsn, tracking_start_lsn); + + /* See if we can fully recover the missing interval */ + if (log_online_can_track_missing(last_tracked_lsn, + tracking_start_lsn)) { + + ib_logf(IB_LOG_LEVEL_INFO, + "reading the log to advance the last tracked LSN."); + + log_bmp_sys->start_lsn = ut_max(last_tracked_lsn, + MIN_TRACKED_LSN); + log_set_tracked_lsn(log_bmp_sys->start_lsn); + if (!log_online_follow_redo_log()) { + exit(1); + } + ut_ad(log_bmp_sys->end_lsn >= tracking_start_lsn); + + ib_logf(IB_LOG_LEVEL_INFO, + "continuing tracking changed pages from LSN " LSN_PF, + log_bmp_sys->end_lsn); + } + else { + ib_logf(IB_LOG_LEVEL_WARN, + "the age of last tracked LSN exceeds log capacity, " + "tracking-based incremental backups will work only " + "from the higher LSN!"); + + log_bmp_sys->end_lsn = log_bmp_sys->start_lsn + = tracking_start_lsn; + log_set_tracked_lsn(log_bmp_sys->start_lsn); + + ib_logf(IB_LOG_LEVEL_INFO, + "starting tracking changed pages from LSN " LSN_PF, + log_bmp_sys->end_lsn); + } +} + +/*********************************************************************//** +Format a bitmap output file name to log_bmp_sys->out.name. 
*/ +static +void +log_online_make_bitmap_name( +/*=========================*/ + lsn_t start_lsn) /*!< in: the start LSN name part */ +{ + ut_snprintf(log_bmp_sys->out.name, sizeof(log_bmp_sys->out.name), + bmp_file_name_template, log_bmp_sys->bmp_file_home, + bmp_file_name_stem, log_bmp_sys->out_seq_num, start_lsn); +} + - /*********************************************************************//** - } - +/*********************************************************************//** +Check if an old file that has the name of a new bitmap file we are about to +create should be overwritten. */ +static +ibool +log_online_should_overwrite( +/*========================*/ + const char *path) /*!< in: path to file */ +{ + dberr_t err; + os_file_stat_t file_info; + + /* Currently, it's OK to overwrite 0-sized files only */ + err = os_file_get_status(path, &file_info, false); + return err == DB_SUCCESS && file_info.type == OS_FILE_TYPE_FILE + && file_info.size == 0LL; +} + +/*********************************************************************//** +Create a new empty bitmap output file. 
+ +@return TRUE if operation succeeded, FALSE if I/O error */ +static +ibool +log_online_start_bitmap_file(void) +/*==============================*/ +{ + ibool success = TRUE; + + /* Check for an old file that should be deleted first */ + if (log_online_should_overwrite(log_bmp_sys->out.name)) { + + success = static_cast<ibool>( + os_file_delete_if_exists(innodb_file_bmp_key, + log_bmp_sys->out.name)); + } + + if (UNIV_LIKELY(success)) { + log_bmp_sys->out.file + = os_file_create_simple_no_error_handling( + innodb_file_bmp_key, + log_bmp_sys->out.name, + OS_FILE_CREATE, + OS_FILE_READ_WRITE_CACHED, + &success); + } + if (UNIV_UNLIKELY(!success)) { + + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + ib_logf(IB_LOG_LEVEL_ERROR, + "cannot create \'%s\'", log_bmp_sys->out.name); + return FALSE; + } + + log_bmp_sys->out.offset = 0; + return TRUE; +} + +/*********************************************************************//** +Close the current bitmap output file and create the next one. + +@return TRUE if operation succeeded, FALSE if I/O error */ +static +ibool +log_online_rotate_bitmap_file( +/*===========================*/ + lsn_t next_file_start_lsn) /*!<in: the start LSN name + part */ +{ + if (!os_file_is_invalid(log_bmp_sys->out.file)) { + os_file_close(log_bmp_sys->out.file); + os_file_mark_invalid(&log_bmp_sys->out.file); + } + log_bmp_sys->out_seq_num++; + log_online_make_bitmap_name(next_file_start_lsn); + return log_online_start_bitmap_file(); +} + +/*********************************************************************//** +Check the name of a given file if it's a changed page bitmap file and +return file sequence and start LSN name components if it is. If is not, +the values of output parameters are undefined. + +@return TRUE if a given file is a changed page bitmap file. 
*/ +static +ibool +log_online_is_bitmap_file( +/*======================*/ + const os_file_stat_t* file_info, /*!<in: file to + check */ + ulong* bitmap_file_seq_num, /*!<out: bitmap file + sequence number */ + lsn_t* bitmap_file_start_lsn) /*!<out: bitmap file + start LSN */ +{ + char stem[FN_REFLEN]; + + ut_ad (strlen(file_info->name) < OS_FILE_MAX_PATH); + + return ((file_info->type == OS_FILE_TYPE_FILE + || file_info->type == OS_FILE_TYPE_LINK) + && (sscanf(file_info->name, "%[a-z_]%lu_%llu.xdb", stem, + bitmap_file_seq_num, + (unsigned long long *)bitmap_file_start_lsn) == 3) + && (!strcmp(stem, bmp_file_name_stem))); +} + +/** Initialize the constant part of the log tracking subsystem */ +UNIV_INTERN +void +log_online_init(void) +{ + mutex_create(log_bmp_sys_mutex_key, &log_bmp_sys_mutex, + SYNC_LOG_ONLINE); +} + +/** Initialize the dynamic part of the log tracking subsystem */ +UNIV_INTERN +void +log_online_read_init(void) +{ + ibool success; + lsn_t tracking_start_lsn + = ut_max(log_sys->last_checkpoint_lsn, MIN_TRACKED_LSN); + os_file_dir_t bitmap_dir; + os_file_stat_t bitmap_dir_file_info; + lsn_t last_file_start_lsn = MIN_TRACKED_LSN; + size_t srv_data_home_len; + + /* Bitmap data start and end in a bitmap block must be 8-byte + aligned. */ + compile_time_assert(MODIFIED_PAGE_BLOCK_BITMAP % 8 == 0); + compile_time_assert(MODIFIED_PAGE_BLOCK_BITMAP_LEN % 8 == 0); + + ut_ad(srv_track_changed_pages); + + log_bmp_sys = static_cast<log_bitmap_struct *> + (ut_malloc(sizeof(*log_bmp_sys))); + log_bmp_sys->read_buf_ptr = static_cast<byte *> + (ut_malloc(FOLLOW_SCAN_SIZE + OS_FILE_LOG_BLOCK_SIZE)); + log_bmp_sys->read_buf = static_cast<byte *> + (ut_align(log_bmp_sys->read_buf_ptr, OS_FILE_LOG_BLOCK_SIZE)); + + /* Initialize bitmap file directory from srv_data_home and add a path + separator if needed. 
*/ + srv_data_home_len = strlen(srv_data_home); + ut_a (srv_data_home_len < FN_REFLEN); + strcpy(log_bmp_sys->bmp_file_home, srv_data_home); + if (srv_data_home_len + && log_bmp_sys->bmp_file_home[srv_data_home_len - 1] + != SRV_PATH_SEPARATOR) { + + ut_a (srv_data_home_len < FN_REFLEN - 1); + log_bmp_sys->bmp_file_home[srv_data_home_len] + = SRV_PATH_SEPARATOR; + log_bmp_sys->bmp_file_home[srv_data_home_len + 1] = '\0'; + } + + /* Enumerate existing bitmap files to either open the last one to get + the last tracked LSN either to find that there are none and start + tracking from scratch. */ + log_bmp_sys->out.name[0] = '\0'; + log_bmp_sys->out_seq_num = 0; + + bitmap_dir = os_file_opendir(log_bmp_sys->bmp_file_home, TRUE); + ut_a(bitmap_dir); + while (!os_file_readdir_next_file(log_bmp_sys->bmp_file_home, + bitmap_dir, &bitmap_dir_file_info)) { + + ulong file_seq_num; + lsn_t file_start_lsn; + + if (!log_online_is_bitmap_file(&bitmap_dir_file_info, + &file_seq_num, + &file_start_lsn)) { + continue; + } + + if (file_seq_num > log_bmp_sys->out_seq_num + && bitmap_dir_file_info.size > 0) { + log_bmp_sys->out_seq_num = file_seq_num; + last_file_start_lsn = file_start_lsn; + /* No dir component (log_bmp_sys->bmp_file_home) here, + because that's the cwd */ + strncpy(log_bmp_sys->out.name, + bitmap_dir_file_info.name, FN_REFLEN - 1); + log_bmp_sys->out.name[FN_REFLEN - 1] = '\0'; + } + } + + if (os_file_closedir(bitmap_dir)) { + os_file_get_last_error(TRUE); + ib_logf(IB_LOG_LEVEL_ERROR, "cannot close \'%s\'", + log_bmp_sys->bmp_file_home); + exit(1); + } + + if (!log_bmp_sys->out_seq_num) { + log_bmp_sys->out_seq_num = 1; + log_online_make_bitmap_name(0); + } + + log_bmp_sys->modified_pages = rbt_create(MODIFIED_PAGE_BLOCK_SIZE, + log_online_compare_bmp_keys); + log_bmp_sys->page_free_list = NULL; + + log_bmp_sys->out.file + = os_file_create_simple_no_error_handling + (innodb_file_bmp_key, log_bmp_sys->out.name, OS_FILE_OPEN, + OS_FILE_READ_WRITE_CACHED, &success); + + 
if (!success) { + + /* New file, tracking from scratch */ + if (!log_online_start_bitmap_file()) { + exit(1); + } + } + else { + + /* Read the last tracked LSN from the last file */ + lsn_t last_tracked_lsn; + lsn_t file_start_lsn; + + log_bmp_sys->out.size + = os_file_get_size(log_bmp_sys->out.file); + log_bmp_sys->out.offset = log_bmp_sys->out.size; + + if (log_bmp_sys->out.offset % MODIFIED_PAGE_BLOCK_SIZE != 0) { + + ib_logf(IB_LOG_LEVEL_WARN, + "truncated block detected in \'%s\' at offset " + UINT64PF, + log_bmp_sys->out.name, + log_bmp_sys->out.offset); + log_bmp_sys->out.offset -= + log_bmp_sys->out.offset + % MODIFIED_PAGE_BLOCK_SIZE; + } + + last_tracked_lsn = log_online_read_last_tracked_lsn(); + /* Do not rotate if we truncated the file to zero length - we + can just start writing there */ + const bool need_rotate = (last_tracked_lsn != 0); + if (!last_tracked_lsn) { + + last_tracked_lsn = last_file_start_lsn; + } + + /* Start a new file. Choose the LSN value in its name based on + if we can retrack any missing data. */ + if (log_online_can_track_missing(last_tracked_lsn, + tracking_start_lsn)) { + file_start_lsn = last_tracked_lsn; + } else { + file_start_lsn = tracking_start_lsn; + } + + if (need_rotate + && !log_online_rotate_bitmap_file(file_start_lsn)) { + + exit(1); + } + + if (last_tracked_lsn < tracking_start_lsn) { + + log_online_track_missing_on_startup + (last_tracked_lsn, tracking_start_lsn); + return; + } + + if (last_tracked_lsn > tracking_start_lsn) { + + ib_logf(IB_LOG_LEVEL_WARN, + "last tracked LSN is " LSN_PF ", but the last " + "checkpoint LSN is " LSN_PF ". 
The " + "tracking-based incremental backups will work " + "only from the latter LSN!", + last_tracked_lsn, tracking_start_lsn); + } + + } + + ib_logf(IB_LOG_LEVEL_INFO, "starting tracking changed pages from LSN " + LSN_PF, tracking_start_lsn); + log_bmp_sys->start_lsn = tracking_start_lsn; + log_set_tracked_lsn(tracking_start_lsn); +} + +/** Shut down the dynamic part of the log tracking subsystem */ +UNIV_INTERN +void +log_online_read_shutdown(void) +{ + mutex_enter(&log_bmp_sys_mutex); + + srv_track_changed_pages = FALSE; + + ib_rbt_node_t *free_list_node = log_bmp_sys->page_free_list; + + if (!os_file_is_invalid(log_bmp_sys->out.file)) { + os_file_close(log_bmp_sys->out.file); + os_file_mark_invalid(&log_bmp_sys->out.file); + } + + rbt_free(log_bmp_sys->modified_pages); + + while (free_list_node) { + ib_rbt_node_t *next = free_list_node->left; + ut_free(free_list_node); + free_list_node = next; + } + + ut_free(log_bmp_sys->read_buf_ptr); + ut_free(log_bmp_sys); + log_bmp_sys = NULL; + + srv_redo_log_thread_started = false; + + mutex_exit(&log_bmp_sys_mutex); +} + +/** Shut down the constant part of the log tracking subsystem */ +UNIV_INTERN +void +log_online_shutdown(void) +{ + mutex_free(&log_bmp_sys_mutex); +} + +/*********************************************************************//** +For the given minilog record type determine if the record has (space; page) +associated with it. +@return TRUE if the record has (space; page) in it */ +static +ibool +log_online_rec_has_page( +/*====================*/ + byte type) /*!<in: the minilog record type */ +{ + return type != MLOG_MULTI_REC_END && type != MLOG_DUMMY_RECORD; +} + +/*********************************************************************//** +Check if a page field for a given log record type actually contains a page +id. It does not for file operations and MLOG_LSN. 
+@return TRUE if page field contains actual page id, FALSE otherwise */ +static +ibool +log_online_rec_page_means_page( +/*===========================*/ + byte type) /*!<in: log record type */ +{ + return log_online_rec_has_page(type) +#ifdef UNIV_LOG_LSN_DEBUG + && type != MLOG_LSN +#endif + && type != MLOG_FILE_CREATE + && type != MLOG_FILE_RENAME + && type != MLOG_FILE_DELETE + && type != MLOG_FILE_CREATE2; +} + +/*********************************************************************//** +Parse the log data in the parse buffer for the (space, page) pairs and add +them to the modified page set as necessary. Removes the fully-parsed records +from the buffer. If an incomplete record is found, moves it to the end of the +buffer. */ +static +void +log_online_parse_redo_log(void) +/*===========================*/ +{ + ut_ad(mutex_own(&log_bmp_sys_mutex)); + + byte *ptr = log_bmp_sys->parse_buf; + byte *end = log_bmp_sys->parse_buf_end; + ulint len = 0; + + while (ptr != end + && log_bmp_sys->next_parse_lsn < log_bmp_sys->end_lsn) { + + byte type; + ulint space; + ulint page_no; + byte* body; + + /* recv_sys is not initialized, so on corrupt log we will + SIGSEGV. But the log of a live database should not be + corrupt. */ + len = recv_parse_log_rec(ptr, end, &type, &space, &page_no, + &body); + if (len > 0) { + + if (log_online_rec_page_means_page(type)) { + + ut_a(len >= 3); + log_online_set_page_bit(space, page_no); + } + + ptr += len; + ut_ad(ptr <= end); + log_bmp_sys->next_parse_lsn + = recv_calc_lsn_on_data_add + (log_bmp_sys->next_parse_lsn, len); + } + else { + + /* Incomplete log record. Shift it to the + beginning of the parse buffer and leave it to be + completed on the next read. 
*/ + ut_memmove(log_bmp_sys->parse_buf, ptr, end - ptr); + log_bmp_sys->parse_buf_end + = log_bmp_sys->parse_buf + (end - ptr); + ptr = end; + } + } + + if (len > 0) { + + log_bmp_sys->parse_buf_end = log_bmp_sys->parse_buf; + } +} + +/*********************************************************************//** +Check the log block checksum. +@return TRUE if the log block checksum is OK, FALSE otherwise. */ +static +ibool +log_online_is_valid_log_seg( +/*========================*/ + const byte* log_block) /*!< in: read log data */ +{ + ibool checksum_is_ok + = log_block_checksum_is_ok_or_old_format(log_block); + + if (!checksum_is_ok) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "log block checksum mismatch: expected " ULINTPF ", " + "calculated checksum " ULINTPF, + log_block_get_checksum(log_block), + log_block_calc_checksum(log_block)); + } + + return checksum_is_ok; +} + +/*********************************************************************//** +Copy new log data to the parse buffer while skipping log block header, +trailer and already parsed data. */ +static +void +log_online_add_to_parse_buf( +/*========================*/ + const byte* log_block, /*!< in: read log data */ + ulint data_len, /*!< in: length of read log data */ + ulint skip_len) /*!< in: how much of log data to + skip */ +{ + ut_ad(mutex_own(&log_bmp_sys_mutex)); + + ulint start_offset = skip_len ? skip_len : LOG_BLOCK_HDR_SIZE; + ulint end_offset + = (data_len == OS_FILE_LOG_BLOCK_SIZE) + ? data_len - LOG_BLOCK_TRL_SIZE + : data_len; + ulint actual_data_len = (end_offset >= start_offset) + ? 
end_offset - start_offset : 0; + + ut_memcpy(log_bmp_sys->parse_buf_end, log_block + start_offset, + actual_data_len); + + log_bmp_sys->parse_buf_end += actual_data_len; + + ut_a(log_bmp_sys->parse_buf_end - log_bmp_sys->parse_buf + <= RECV_PARSING_BUF_SIZE); +} + +/*********************************************************************//** +Parse the log block: first copies the read log data to the parse buffer while +skipping log block header, trailer and already parsed data. Then it actually +parses the log to add to the modified page bitmap. */ +static +void +log_online_parse_redo_log_block( +/*============================*/ + const byte* log_block, /*!< in: read log data */ + ulint skip_already_parsed_len) /*!< in: how many bytes of + log data should be skipped as + they were parsed before */ +{ + ut_ad(mutex_own(&log_bmp_sys_mutex)); + + ulint block_data_len = log_block_get_data_len(log_block); + + ut_ad(block_data_len % OS_FILE_LOG_BLOCK_SIZE == 0 + || block_data_len < OS_FILE_LOG_BLOCK_SIZE); + + log_online_add_to_parse_buf(log_block, block_data_len, + skip_already_parsed_len); + log_online_parse_redo_log(); +} + +/*********************************************************************//** +Read and parse one redo log chunk and updates the modified page bitmap. 
*/
+static
+void
+log_online_follow_log_seg(
+/*======================*/
+	log_group_t*	group,		/*!< in: the log group to use */
+	lsn_t		block_start_lsn, /*!< in: the LSN to read from */
+	lsn_t		block_end_lsn)	/*!< in: the LSN to read to */
+{
+	ut_ad(mutex_own(&log_bmp_sys_mutex));
+
+	/* Pointer to the current OS_FILE_LOG_BLOCK-sized chunk of the read log
+	data to parse */
+	byte*	log_block = log_bmp_sys->read_buf;
+	/* One past the last byte of read_buf that this call reads into */
+	byte*	log_block_end = log_bmp_sys->read_buf
+		+ (block_end_lsn - block_start_lsn);
+
+	mutex_enter(&log_sys->mutex);
+	log_group_read_log_seg(LOG_RECOVER, log_bmp_sys->read_buf,
+			       group, block_start_lsn, block_end_lsn, TRUE);
+	/* log_group_read_log_seg will release the log_sys->mutex for us */
+
+	/* Walk the read data block by block until either the data or the
+	LSN range to be tracked is exhausted */
+	while (log_block < log_block_end
+	       && log_bmp_sys->next_parse_lsn < log_bmp_sys->end_lsn) {
+
+		/* How many bytes of log data should we skip in the current log
+		block.  Skipping is necessary because we round down the next
+		parse LSN thus it is possible to read the already-processed log
+		data many times */
+		ulint skip_already_parsed_len = 0;
+
+		/* Stop at the first block that fails the checksum test */
+		if (!log_online_is_valid_log_seg(log_block)) {
+			break;
+		}
+
+		if ((block_start_lsn <= log_bmp_sys->next_parse_lsn)
+		    && (block_start_lsn + OS_FILE_LOG_BLOCK_SIZE
+			> log_bmp_sys->next_parse_lsn)) {
+
+			/* The next parse LSN is inside the current block, skip
+			data preceding it. */
+			skip_already_parsed_len
+				= (ulint)(log_bmp_sys->next_parse_lsn
+					  - block_start_lsn);
+		}
+		else {
+
+			/* If the next parse LSN is not inside the current
+			block, then the only option is that we have processed
+			ahead already. */
+			ut_a(block_start_lsn > log_bmp_sys->next_parse_lsn);
+		}
+
+		/* TODO: merge the copying to the parse buf code with
+		skip_already_len calculations */
+		log_online_parse_redo_log_block(log_block,
+						skip_already_parsed_len);
+
+		log_block += OS_FILE_LOG_BLOCK_SIZE;
+		block_start_lsn += OS_FILE_LOG_BLOCK_SIZE;
+	}
+
+	return;
+}
+
+/*********************************************************************//**
+Read and parse the redo log in a given group in FOLLOW_SCAN_SIZE-sized
+chunks and updates the modified page bitmap. */
+static
+void
+log_online_follow_log_group(
+/*========================*/
+	log_group_t*	group,		/*!< in: the log group to use */
+	lsn_t		contiguous_lsn)	/*!< in: the LSN of log block start
+					containing the log_parse_start_lsn */
+{
+	ut_ad(mutex_own(&log_bmp_sys_mutex));
+
+	lsn_t	block_start_lsn = contiguous_lsn;
+	lsn_t	block_end_lsn;
+
+	/* Parsing restarts at the tracking start LSN with an empty parse
+	buffer */
+	log_bmp_sys->next_parse_lsn = log_bmp_sys->start_lsn;
+	log_bmp_sys->parse_buf_end = log_bmp_sys->parse_buf;
+
+	do {
+		block_end_lsn = block_start_lsn + FOLLOW_SCAN_SIZE;
+
+		log_online_follow_log_seg(group, block_start_lsn,
+					  block_end_lsn);
+
+		/* Next parse LSN can become higher than the last read LSN
+		only in the case when the read LSN falls right on the block
+		boundary, in which case next parse lsn is bumped to the actual
+		data LSN on the next (not yet read) block.  This assert is
+		slightly conservative. */
+		ut_a(log_bmp_sys->next_parse_lsn
+		     <= block_end_lsn + LOG_BLOCK_HDR_SIZE
+		     + LOG_BLOCK_TRL_SIZE);
+
+		block_start_lsn = block_end_lsn;
+	} while (block_end_lsn < log_bmp_sys->end_lsn);
+
+	/* Assert that the last read log record is a full one */
+	ut_a(log_bmp_sys->parse_buf_end == log_bmp_sys->parse_buf);
+}
+
+/*********************************************************************//**
+Write, flush one bitmap block to disk and advance the output position if
+successful.
+ +@return TRUE if page written OK, FALSE if I/O error */ +static +ibool +log_online_write_bitmap_page( +/*=========================*/ + const byte *block) /*!< in: block to write */ +{ + ut_ad(mutex_own(&log_bmp_sys_mutex)); + + /* Simulate a write error */ + DBUG_EXECUTE_IF("bitmap_page_write_error", + { + ulint space_id + = mach_read_from_4(block + + MODIFIED_PAGE_SPACE_ID); + if (space_id > 0) { + ib_logf(IB_LOG_LEVEL_ERROR, + "simulating bitmap write " + "error in " + "log_online_write_bitmap_page " + "for space ID %lu", + space_id); + return FALSE; + } + }); + + /* A crash injection site that ensures last checkpoint LSN > last + tracked LSN, so that LSN tracking for this interval is tested. */ + DBUG_EXECUTE_IF("crash_before_bitmap_write", + { + ulint space_id + = mach_read_from_4(block + + MODIFIED_PAGE_SPACE_ID); + if (space_id > 0) + DBUG_SUICIDE(); + }); + + + ibool success = os_file_write(log_bmp_sys->out.name, + log_bmp_sys->out.file, block, + log_bmp_sys->out.offset, + MODIFIED_PAGE_BLOCK_SIZE); + if (UNIV_UNLIKELY(!success)) { + + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + ib_logf(IB_LOG_LEVEL_ERROR, "failed writing changed page " + "bitmap file \'%s\'", log_bmp_sys->out.name); + return FALSE; + } + + success = os_file_flush(log_bmp_sys->out.file); + if (UNIV_UNLIKELY(!success)) { + + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + ib_logf(IB_LOG_LEVEL_ERROR, "failed flushing changed page " + "bitmap file \'%s\'", log_bmp_sys->out.name); + return FALSE; + } + + os_file_advise(log_bmp_sys->out.file, log_bmp_sys->out.offset, + MODIFIED_PAGE_BLOCK_SIZE, OS_FILE_ADVISE_DONTNEED); + + log_bmp_sys->out.offset += MODIFIED_PAGE_BLOCK_SIZE; + return TRUE; +} + +/*********************************************************************//** +Append the current changed page bitmap to the bitmap file. Clears the +bitmap tree and recycles its nodes to the free list. 
+
+@return TRUE if bitmap written OK, FALSE if I/O error*/
+static
+ibool
+log_online_write_bitmap(void)
+/*=========================*/
+{
+	ut_ad(mutex_own(&log_bmp_sys_mutex));
+
+	/* Switch to a new output file once the current one has reached the
+	configured maximum size */
+	if (log_bmp_sys->out.offset >= srv_max_bitmap_file_size) {
+		if (!log_online_rotate_bitmap_file(log_bmp_sys->start_lsn)) {
+			return FALSE;
+		}
+	}
+
+	ib_rbt_node_t *bmp_tree_node
+		= (ib_rbt_node_t *)rbt_first(log_bmp_sys->modified_pages);
+	const ib_rbt_node_t * const last_bmp_tree_node
+		= rbt_last(log_bmp_sys->modified_pages);
+
+	ibool success = TRUE;
+
+	while (bmp_tree_node) {
+
+		byte *page = rbt_value(byte, bmp_tree_node);
+
+		/* In case of a bitmap page write error keep on looping over
+		the tree to reclaim its memory through the free list instead of
+		returning immediately. */
+		if (UNIV_LIKELY(success)) {
+			/* Flag the final block of this run */
+			if (bmp_tree_node == last_bmp_tree_node) {
+				mach_write_to_4(page
+						+ MODIFIED_PAGE_IS_LAST_BLOCK,
+						1);
+			}
+
+			/* Stamp the block with the LSN interval it covers;
+			the checksum is computed last, after the other
+			header fields have been filled in */
+			mach_write_to_8(page + MODIFIED_PAGE_START_LSN,
+					log_bmp_sys->start_lsn);
+			mach_write_to_8(page + MODIFIED_PAGE_END_LSN,
+					log_bmp_sys->end_lsn);
+			mach_write_to_4(page + MODIFIED_PAGE_BLOCK_CHECKSUM,
+					log_online_calc_checksum(page));
+
+			success = log_online_write_bitmap_page(page);
+		}
+
+		/* Push the node onto the free list, which is chained
+		through the left pointers */
+		bmp_tree_node->left = log_bmp_sys->page_free_list;
+		log_bmp_sys->page_free_list = bmp_tree_node;
+
+		bmp_tree_node = (ib_rbt_node_t*)
+			rbt_next(log_bmp_sys->modified_pages, bmp_tree_node);
+
+		/* Debug hook: if another node follows, swap this injection
+		point for "bitmap_page_write_error" */
+		DBUG_EXECUTE_IF("bitmap_page_2_write_error",
+				if (bmp_tree_node)
+				{
+					DBUG_SET("+d,bitmap_page_write_error");
+					DBUG_SET("-d,bitmap_page_2_write_error");
+				});
+	}
+
+	rbt_reset(log_bmp_sys->modified_pages);
+	return success;
+}
+
+/*********************************************************************//**
+Read and parse the redo log up to last checkpoint LSN to build the changed
+page bitmap which is then written to disk.
+ +@return TRUE if log tracking succeeded, FALSE if bitmap write I/O error */ +UNIV_INTERN +ibool +log_online_follow_redo_log(void) +/*============================*/ +{ + lsn_t contiguous_start_lsn; + log_group_t* group; + ibool result; + + ut_ad(!srv_read_only_mode); + + if (!srv_track_changed_pages) + return TRUE; + + DEBUG_SYNC_C("log_online_follow_redo_log"); + + mutex_enter(&log_bmp_sys_mutex); + + if (!srv_track_changed_pages) { + mutex_exit(&log_bmp_sys_mutex); + return TRUE; + } + + /* Grab the LSN of the last checkpoint, we will parse up to it */ + mutex_enter(&(log_sys->mutex)); + log_bmp_sys->end_lsn = log_sys->last_checkpoint_lsn; + mutex_exit(&(log_sys->mutex)); + + if (log_bmp_sys->end_lsn == log_bmp_sys->start_lsn) { + mutex_exit(&log_bmp_sys_mutex); + return TRUE; + } + + group = UT_LIST_GET_FIRST(log_sys->log_groups); + ut_a(group); + + contiguous_start_lsn = ut_uint64_align_down(log_bmp_sys->start_lsn, + OS_FILE_LOG_BLOCK_SIZE); + + while (group) { + log_online_follow_log_group(group, contiguous_start_lsn); + group = UT_LIST_GET_NEXT(log_groups, group); + } + + result = log_online_write_bitmap(); + log_bmp_sys->start_lsn = log_bmp_sys->end_lsn; + log_set_tracked_lsn(log_bmp_sys->start_lsn); + + mutex_exit(&log_bmp_sys_mutex); + return result; +} + +/*********************************************************************//** +Diagnose a bitmap file range setup failure and free the partially-initialized +bitmap file range. 
*/
+UNIV_COLD
+static
+void
+log_online_diagnose_inconsistent_dir(
+/*=================================*/
+	log_online_bitmap_file_range_t	*bitmap_files)	/*!<in/out: bitmap file
+							range; on return its
+							file array is freed
+							and the range is
+							empty */
+{
+	ib_logf(IB_LOG_LEVEL_WARN,
+		"InnoDB: Warning: inconsistent bitmap file "
+		"directory for a "
+		"INFORMATION_SCHEMA.INNODB_CHANGED_PAGES query");
+
+	/* The array comes from ut_malloc(), so it has to be released with
+	ut_free() and not with the C library free().  Clear the range
+	afterwards so that no dangling pointer is left behind for a later
+	release to free a second time. */
+	ut_free(bitmap_files->files);
+	bitmap_files->files = NULL;
+	bitmap_files->count = 0;
+}
+
+/*********************************************************************//**
+List the bitmap files in srv_data_home and setup their range that contains the
+specified LSN interval.  This range, if non-empty, will start with a file that
+has the greatest LSN equal to or less than the start LSN and will include all
+the files up to the one with the greatest LSN less than the end LSN.  Caller
+must free bitmap_files->files when done if bitmap_files set to non-NULL and
+this function returned TRUE.  Field bitmap_files->count might be set to a
+larger value than the actual count of the files, and space for the unused array
+slots will be allocated but cleared to zeroes.
+
+On failure bitmap_files->files is NULL and bitmap_files->count is 0, i.e.
+there is nothing left for the caller to free.
+
+@return TRUE if succeeded
+*/
+static
+ibool
+log_online_setup_bitmap_file_range(
+/*===============================*/
+	log_online_bitmap_file_range_t	*bitmap_files,	/*!<in/out: bitmap file
+							range */
+	lsn_t				range_start,	/*!<in: start LSN */
+	lsn_t				range_end)	/*!<in: end LSN */
+{
+	os_file_dir_t	bitmap_dir;
+	os_file_stat_t	bitmap_dir_file_info;
+	ulong		first_file_seq_num	= ULONG_MAX;
+	ulong		last_file_seq_num	= 0;
+	lsn_t		first_file_start_lsn	= LSN_MAX;
+
+	ut_ad(range_end >= range_start);
+
+	bitmap_files->count = 0;
+	bitmap_files->files = NULL;
+
+	/* 1st pass: size the info array */
+
+	bitmap_dir = os_file_opendir(srv_data_home, FALSE);
+	if (UNIV_UNLIKELY(!bitmap_dir)) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"failed to open bitmap directory \'%s\'",
+			srv_data_home);
+		return FALSE;
+	}
+
+	while (!os_file_readdir_next_file(srv_data_home, bitmap_dir,
+					  &bitmap_dir_file_info)) {
+
+		ulong	file_seq_num;
+		lsn_t	file_start_lsn;
+
+		if (!log_online_is_bitmap_file(&bitmap_dir_file_info,
+					       &file_seq_num,
+					       &file_start_lsn)
+		    || file_start_lsn >= range_end) {
+
+			continue;
+		}
+
+		if (file_seq_num > last_file_seq_num) {
+
+			last_file_seq_num = file_seq_num;
+		}
+
+		if (file_start_lsn >= range_start
+		    || file_start_lsn == first_file_start_lsn
+		    || first_file_start_lsn > range_start) {
+
+			/* A file that falls into the range */
+
+			if (file_start_lsn < first_file_start_lsn) {
+
+				first_file_start_lsn = file_start_lsn;
+			}
+			if (file_seq_num < first_file_seq_num) {
+
+				first_file_seq_num = file_seq_num;
+			}
+		} else if (file_start_lsn > first_file_start_lsn) {
+
+			/* A file that has LSN closer to the range start
+			but smaller than it, replacing another such file */
+			first_file_start_lsn = file_start_lsn;
+			first_file_seq_num = file_seq_num;
+		}
+	}
+
+	if (UNIV_UNLIKELY(os_file_closedir(bitmap_dir))) {
+
+		os_file_get_last_error(TRUE);
+		ib_logf(IB_LOG_LEVEL_ERROR, "cannot close \'%s\'",
+			srv_data_home);
+		return FALSE;
+	}
+
+	if (first_file_seq_num == ULONG_MAX && last_file_seq_num == 0) {
+
+		/* No bitmap file matched: empty range */
+		bitmap_files->count = 0;
+		return TRUE;
+	}
+
+	bitmap_files->count = last_file_seq_num - first_file_seq_num + 1;
+
+	DEBUG_SYNC_C("setup_bitmap_range_middle");
+
+	/* 2nd pass: get the file names in the file_seq_num order */
+
+	bitmap_dir = os_file_opendir(srv_data_home, FALSE);
+	if (UNIV_UNLIKELY(!bitmap_dir)) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"failed to open bitmap directory \'%s\'",
+			srv_data_home);
+		/* Nothing has been allocated yet; do not report a
+		non-empty range together with a NULL array */
+		bitmap_files->count = 0;
+		return FALSE;
+	}
+
+	bitmap_files->files
+		= static_cast<log_online_bitmap_file_range_struct::files_t *>
+		(ut_malloc(bitmap_files->count
+			   * sizeof(bitmap_files->files[0])));
+	memset(bitmap_files->files, 0,
+	       bitmap_files->count * sizeof(bitmap_files->files[0]));
+
+	while (!os_file_readdir_next_file(srv_data_home, bitmap_dir,
+					  &bitmap_dir_file_info)) {
+
+		ulong	file_seq_num;
+		lsn_t	file_start_lsn;
+		size_t	array_pos;
+
+		if (!log_online_is_bitmap_file(&bitmap_dir_file_info,
+					       &file_seq_num,
+					       &file_start_lsn)
+		    || file_start_lsn >= range_end
+		    || file_start_lsn < first_file_start_lsn) {
+
+			continue;
+		}
+
+		/* The directory may have changed since the 1st pass, so a
+		file can fall outside of the array sized back then */
+		array_pos = file_seq_num - first_file_seq_num;
+		if (UNIV_UNLIKELY(array_pos >= bitmap_files->count)) {
+
+			/* Frees bitmap_files->files */
+			log_online_diagnose_inconsistent_dir(bitmap_files);
+			bitmap_files->files = NULL;
+			bitmap_files->count = 0;
+			os_file_closedir(bitmap_dir);
+			return FALSE;
+		}
+
+
+		if (file_seq_num > bitmap_files->files[array_pos].seq_num) {
+
+			bitmap_files->files[array_pos].seq_num = file_seq_num;
+			strncpy(bitmap_files->files[array_pos].name,
+				bitmap_dir_file_info.name, FN_REFLEN);
+			bitmap_files->files[array_pos].name[FN_REFLEN - 1]
+				= '\0';
+			bitmap_files->files[array_pos].start_lsn
+				= file_start_lsn;
+		}
+	}
+
+	if (UNIV_UNLIKELY(os_file_closedir(bitmap_dir))) {
+
+		os_file_get_last_error(TRUE);
+		ib_logf(IB_LOG_LEVEL_ERROR, "cannot close \'%s\'",
+			srv_data_home);
+		/* The array was allocated with ut_malloc(), so it must be
+		released with ut_free(), not free() */
+		ut_free(bitmap_files->files);
+		bitmap_files->files = NULL;
+		bitmap_files->count = 0;
+		return FALSE;
+	}
+
+	if (!bitmap_files->files[0].seq_num
+	    || bitmap_files->files[0].seq_num != first_file_seq_num) {
+
+		/* Frees bitmap_files->files */
+		log_online_diagnose_inconsistent_dir(bitmap_files);
+		bitmap_files->files = NULL;
+		bitmap_files->count = 0;
+		return FALSE;
+	}
+
+	{
+		size_t i;
+		for (i = 1; i < bitmap_files->count; i++) {
+			/* A zeroed slot marks the end of the files found */
+			if (!bitmap_files->files[i].seq_num) {
+				break;
+			}
+			if ((bitmap_files->files[i].seq_num
+			     <= bitmap_files->files[i - 1].seq_num)
+			    || (bitmap_files->files[i].start_lsn
+				< bitmap_files->files[i - 1].start_lsn)) {
+
+				/* Frees bitmap_files->files */
+				log_online_diagnose_inconsistent_dir(
+					bitmap_files);
+				bitmap_files->files = NULL;
+				bitmap_files->count = 0;
+				return FALSE;
+			}
+		}
+	}
+
+	return TRUE;
+}
+
+/****************************************************************//**
+Open a bitmap file for reading.
+
+@return TRUE if opened successfully */
+static
+ibool
+log_online_open_bitmap_file_read_only(
+/*==================================*/
+	const char*			name,		/*!<in: bitmap file
+							name without directory,
+							which is assumed to be
+							srv_data_home */
+	log_online_bitmap_file_t*	bitmap_file)	/*!<out: opened bitmap
+							file */
+{
+	ibool	success	= FALSE;
+	size_t  srv_data_home_len;
+
+	ut_ad(name[0] != '\0');
+
+	/* Build the full path, adding a path separator only if
+	srv_data_home does not already end with one */
+	srv_data_home_len = strlen(srv_data_home);
+	if (srv_data_home_len
+			&& srv_data_home[srv_data_home_len-1]
+			!= SRV_PATH_SEPARATOR) {
+		ut_snprintf(bitmap_file->name, FN_REFLEN, "%s%c%s",
+				srv_data_home, SRV_PATH_SEPARATOR, name);
+	} else {
+		ut_snprintf(bitmap_file->name, FN_REFLEN, "%s%s",
+				srv_data_home, name);
+	}
+	bitmap_file->file
+		= os_file_create_simple_no_error_handling(innodb_file_bmp_key,
+							  bitmap_file->name,
+							  OS_FILE_OPEN,
+							  OS_FILE_READ_ONLY,
+							  &success);
+	if (UNIV_UNLIKELY(!success)) {
+
+		/* Here and below assume that bitmap file names do not
+		contain apostrophes, thus no need for ut_print_filename(). */
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"error opening the changed page bitmap \'%s\'",
+			bitmap_file->name);
+		return FALSE;
+	}
+
+	bitmap_file->size = os_file_get_size(bitmap_file->file);
+	bitmap_file->offset = 0;
+
+	os_file_advise(bitmap_file->file, 0, 0, OS_FILE_ADVISE_SEQUENTIAL);
+	os_file_advise(bitmap_file->file, 0, 0, OS_FILE_ADVISE_NOREUSE);
+
+	return TRUE;
+}
+
+/****************************************************************//**
+Diagnose one or both of the following situations if we read close to
+the end of bitmap file:
+1) Warn if the remainder of the file is less than one page.
+2) Error if we cannot read any more full pages but the last read page
+did not have the last-in-run flag set.
+
+@return FALSE for the error */
+static
+ibool
+log_online_diagnose_bitmap_eof(
+/*===========================*/
+	const log_online_bitmap_file_t*	bitmap_file,	/*!< in: bitmap file */
+	ibool				last_page_in_run)/*!< in: "last page in
+							run" flag value in the
+							last read page */
+{
+	/* Check if we are too close to EOF to read a full page */
+	if ((bitmap_file->size < MODIFIED_PAGE_BLOCK_SIZE)
+	    || (bitmap_file->offset
+		> bitmap_file->size - MODIFIED_PAGE_BLOCK_SIZE)) {
+
+		if (UNIV_UNLIKELY(bitmap_file->offset != bitmap_file->size)) {
+
+			/* If we are not at EOF and we have less than one page
+			to read, it's junk.  This error is not fatal in
+			itself. */
+
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"junk at the end of changed page bitmap file "
+				"\'%s\'.", bitmap_file->name);
+		}
+
+		if (UNIV_UNLIKELY(!last_page_in_run)) {
+
+			/* We are at EOF but the last read page did not finish
+			a run */
+			/* It's a "Warning" here because it's not a fatal error
+			for the whole server */
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"changed page bitmap file \'%s\', size "
+				UINT64PF " bytes, does not "
+				"contain a complete run at the next read "
+				"offset " UINT64PF,
+				bitmap_file->name, bitmap_file->size,
+				bitmap_file->offset);
+			return FALSE;
+		}
+	}
+	return TRUE;
+}
+
+/*********************************************************************//**
+Initialize the log bitmap iterator for a given range.  The records are
+processed at a bitmap block granularity, i.e. all the records in the same block
+share the same start and end LSN values, the exact LSN of each record is
+unavailable (nor is it defined for blocks that are touched more than once in
+the LSN interval contained in the block).  Thus min_lsn and max_lsn should be
+set at block boundaries or bigger, otherwise the records at the 1st and the
+last blocks will not be returned.  Also note that there might be returned
+records with LSN < min_lsn, as min_lsn is used to select the correct starting
+file but not block.
+
+@return TRUE if the iterator is initialized OK, FALSE otherwise.
*/
+UNIV_INTERN
+ibool
+log_online_bitmap_iterator_init(
+/*============================*/
+	log_bitmap_iterator_t	*i,	/*!<in/out:  iterator */
+	lsn_t			min_lsn,/*!< in: start LSN */
+	lsn_t			max_lsn)/*!< in: end LSN */
+{
+	ut_a(i);
+
+	/* Bring every resource-owning field into a defined "nothing owned"
+	state first, so that log_online_bitmap_iterator_release() is safe
+	whichever way this function is left. */
+	i->in_files.count = 0;
+	i->in_files.files = NULL;
+	os_file_mark_invalid(&i->in.file);
+	i->page = NULL;
+
+	i->max_lsn = max_lsn;
+
+	if (UNIV_UNLIKELY(min_lsn > max_lsn)) {
+
+		/* Empty range */
+		i->failed = FALSE;
+		return TRUE;
+	}
+
+	if (!log_online_setup_bitmap_file_range(&i->in_files, min_lsn,
+						max_lsn)) {
+
+		/* After a failed setup there is nothing for us to free;
+		make sure no stale pointer or count survives in the
+		iterator. */
+		i->in_files.count = 0;
+		i->in_files.files = NULL;
+		i->failed = TRUE;
+		return FALSE;
+	}
+
+	i->in_i = 0;
+
+	if (i->in_files.count == 0) {
+
+		/* Empty range */
+		i->failed = FALSE;
+		return TRUE;
+	}
+
+	/* Open the 1st bitmap file */
+	if (UNIV_UNLIKELY(!log_online_open_bitmap_file_read_only(
+				i->in_files.files[i->in_i].name,
+				&i->in))) {
+
+		/* The file array was allocated with ut_malloc(), so it
+		must be released with ut_free(), not free().  Leave the
+		iterator empty afterwards. */
+		ut_free(i->in_files.files);
+		i->in_files.files = NULL;
+		i->in_files.count = 0;
+		i->in_i = i->in_files.count;
+		os_file_mark_invalid(&i->in.file);
+		i->failed = TRUE;
+		return FALSE;
+	}
+
+	i->page = static_cast<byte *>(ut_malloc(MODIFIED_PAGE_BLOCK_SIZE));
+	/* Start "past the end" of a block so that the first call to
+	log_online_bitmap_iterator_next() reads a block from the file */
+	i->bit_offset = MODIFIED_PAGE_BLOCK_BITMAP_LEN;
+	i->start_lsn = i->end_lsn = 0;
+	i->space_id = 0;
+	i->first_page_id = 0;
+	i->last_page_in_run = TRUE;
+	i->changed = FALSE;
+	i->failed = FALSE;
+
+	return TRUE;
+}
+
+/*********************************************************************//**
+Releases log bitmap iterator.  Freed pointers are cleared, so releasing an
+already released iterator does not free anything twice. */
+UNIV_INTERN
+void
+log_online_bitmap_iterator_release(
+/*===============================*/
+	log_bitmap_iterator_t *i) /*!<in/out:  iterator */
+{
+	ut_a(i);
+
+	if (!os_file_is_invalid(i->in.file)) {
+
+		os_file_close(i->in.file);
+		os_file_mark_invalid(&i->in.file);
+	}
+	if (i->in_files.files) {
+
+		ut_free(i->in_files.files);
+		i->in_files.files = NULL;
+	}
+	i->in_files.count = 0;
+	if (i->page) {
+
+		ut_free(i->page);
+		i->page = NULL;
+	}
+	i->failed = TRUE;
+}
+
+/*********************************************************************//**
+Iterates through bits of saved bitmap blocks.
+Sequentially reads blocks from bitmap file(s) and interates through +their bits. Ignores blocks with wrong checksum. +@return TRUE if iteration is successful, FALSE if all bits are iterated. */ +UNIV_INTERN +ibool +log_online_bitmap_iterator_next( +/*============================*/ + log_bitmap_iterator_t *i) /*!<in/out: iterator */ +{ + ibool checksum_ok = FALSE; + ibool success; + + ut_a(i); + + if (UNIV_UNLIKELY(i->in_files.count == 0)) { + + return FALSE; + } + + if (UNIV_LIKELY(i->bit_offset < MODIFIED_PAGE_BLOCK_BITMAP_LEN)) + { + ++i->bit_offset; + i->changed = + IS_BIT_SET(i->page + MODIFIED_PAGE_BLOCK_BITMAP, + i->bit_offset); + return TRUE; + } + + if (i->end_lsn >= i->max_lsn && i->last_page_in_run) + return FALSE; + + while (!checksum_ok) + { + while (i->in.size < MODIFIED_PAGE_BLOCK_SIZE + || (i->in.offset + > i->in.size - MODIFIED_PAGE_BLOCK_SIZE)) { + + /* Advance file */ + i->in_i++; + success = os_file_close_no_error_handling( + i->in.file); + os_file_mark_invalid(&i->in.file); + if (UNIV_UNLIKELY(!success)) { + + os_file_get_last_error(TRUE); + i->failed = TRUE; + return FALSE; + } + + success = log_online_diagnose_bitmap_eof( + &i->in, i->last_page_in_run); + if (UNIV_UNLIKELY(!success)) { + + i->failed = TRUE; + return FALSE; + + } + + if (i->in_i == i->in_files.count) { + + return FALSE; + } + + if (UNIV_UNLIKELY(i->in_files.files[i->in_i].seq_num + == 0)) { + + i->failed = TRUE; + return FALSE; + } + + success = log_online_open_bitmap_file_read_only( + i->in_files.files[i->in_i].name, + &i->in); + if (UNIV_UNLIKELY(!success)) { + + i->failed = TRUE; + return FALSE; + } + } + + success = log_online_read_bitmap_page(&i->in, i->page, + &checksum_ok); + if (UNIV_UNLIKELY(!success)) { + + os_file_get_last_error(TRUE); + ib_logf(IB_LOG_LEVEL_WARN, + "failed reading changed page bitmap file " + "\'%s\'", i->in_files.files[i->in_i].name); + i->failed = TRUE; + return FALSE; + } + } + + i->start_lsn = mach_read_from_8(i->page + 
MODIFIED_PAGE_START_LSN); + i->end_lsn = mach_read_from_8(i->page + MODIFIED_PAGE_END_LSN); + i->space_id = mach_read_from_4(i->page + MODIFIED_PAGE_SPACE_ID); + i->first_page_id = mach_read_from_4(i->page + + MODIFIED_PAGE_1ST_PAGE_ID); + i->last_page_in_run = mach_read_from_4(i->page + + MODIFIED_PAGE_IS_LAST_BLOCK); + i->bit_offset = 0; + i->changed = IS_BIT_SET(i->page + MODIFIED_PAGE_BLOCK_BITMAP, + i->bit_offset); + + return TRUE; +} + +/************************************************************//** +Delete all the bitmap files for data less than the specified LSN. +If called with lsn == 0 (i.e. set by RESET request) or LSN_MAX, +restart the bitmap file sequence, otherwise continue it. + +@return FALSE to indicate success, TRUE for failure. */ +UNIV_INTERN +ibool +log_online_purge_changed_page_bitmaps( +/*==================================*/ + lsn_t lsn) /*!< in: LSN to purge files up to */ +{ + log_online_bitmap_file_range_t bitmap_files; + size_t i; + ibool result = FALSE; + + if (lsn == 0) { + lsn = LSN_MAX; + } + + bool log_bmp_sys_inited = false; + if (srv_redo_log_thread_started) { + /* User requests might happen with both enabled and disabled + tracking */ + log_bmp_sys_inited = true; + mutex_enter(&log_bmp_sys_mutex); + if (!srv_redo_log_thread_started) { + log_bmp_sys_inited = false; + mutex_exit(&log_bmp_sys_mutex); + } + } + + if (!log_online_setup_bitmap_file_range(&bitmap_files, 0, LSN_MAX)) { + if (log_bmp_sys_inited) { + mutex_exit(&log_bmp_sys_mutex); + } + return TRUE; + } + + if (srv_redo_log_thread_started && lsn > log_bmp_sys->end_lsn) { + /* If we have to delete the current output file, close it + first. */ + os_file_close(log_bmp_sys->out.file); + os_file_mark_invalid(&log_bmp_sys->out.file); + } + + for (i = 0; i < bitmap_files.count; i++) { + + /* We consider the end LSN of the current bitmap, derived from + the start LSN of the subsequent bitmap file, to determine + whether to remove the current bitmap. 
Note that bitmap_files + does not contain an entry for the bitmap past the given LSN so + we must check the boundary conditions as well. For example, + consider 1_0.xdb and 2_10.xdb and querying LSN 5. bitmap_files + will only contain 1_0.xdb and we must not delete it since it + represents LSNs 0-9. */ + if ((i + 1 == bitmap_files.count + || bitmap_files.files[i + 1].seq_num == 0 + || bitmap_files.files[i + 1].start_lsn > lsn) + && (lsn != LSN_MAX)) { + + break; + } + if (!os_file_delete_if_exists(innodb_file_bmp_key, + bitmap_files.files[i].name)) { + + os_file_get_last_error(TRUE); + result = TRUE; + break; + } + } + + if (log_bmp_sys_inited) { + if (lsn > log_bmp_sys->end_lsn) { + lsn_t new_file_lsn; + if (lsn == LSN_MAX) { + /* RESET restarts the sequence */ + log_bmp_sys->out_seq_num = 0; + new_file_lsn = 0; + } else { + new_file_lsn = log_bmp_sys->end_lsn; + } + if (!log_online_rotate_bitmap_file(new_file_lsn)) { + /* If file create failed, stop log tracking */ + srv_track_changed_pages = FALSE; + } + } + + mutex_exit(&log_bmp_sys_mutex); + } + + free(bitmap_files.files); + return result; +} diff --cc storage/xtradb/os/os0file.cc index 89013d9068f,00000000000..b4fafb127ec mode 100644,000000..100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@@ -1,6177 -1,0 +1,6177 @@@ +/*********************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2009, Percona Inc. - Copyright (c) 2013, 2017, MariaDB Corporation. ++Copyright (c) 2013, 2019, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. 
+ +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +***********************************************************************/ + +/**************************************************//** +@file os/os0file.cc +The interface to the operating system file i/o primitives + +Created 10/21/1995 Heikki Tuuri +*******************************************************/ + +#include "os0file.h" + +#ifdef UNIV_NONINL +#include "os0file.ic" +#endif +#include "ha_prototypes.h" +#include "ut0mem.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "fil0fil.h" +#include "buf0buf.h" +#include "btr0types.h" +#include "trx0trx.h" +#include "srv0mon.h" +#ifndef UNIV_HOTBACKUP +# include "os0sync.h" +# include "os0thread.h" +#else /* !UNIV_HOTBACKUP */ +# ifdef __WIN__ +/* Add includes for the _stat() call to compile on Windows */ +# include <sys/types.h> +# include <sys/stat.h> +# include <errno.h> +# endif /* __WIN__ */ +#endif /* !UNIV_HOTBACKUP */ + +#if defined(LINUX_NATIVE_AIO) +#include <libaio.h> +#endif + +#ifdef _WIN32 +#define IOCP_SHUTDOWN_KEY (ULONG_PTR)-1 +#endif + +#if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H) +# include <sys/ioctl.h> +# ifndef DFS_IOCTL_ATOMIC_WRITE_SET +# define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint) +# endif +#endif + +/** Insert buffer segment id */ +static const ulint IO_IBUF_SEGMENT = 0; + +/** Log segment id */ +static const ulint IO_LOG_SEGMENT = 
1; + +/* This specifies the file permissions InnoDB uses when it creates files in +Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to +my_umask */ + +#ifndef __WIN__ +/** Umask for creating files */ +UNIV_INTERN ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; +# define os_file_invalid (-1) +#else +/** Umask for creating files */ +UNIV_INTERN ulint os_innodb_umask = 0; +# define os_file_invalid INVALID_HANDLE_VALUE +#endif /* __WIN__ */ + +#ifndef UNIV_HOTBACKUP +/* We use these mutexes to protect lseek + file i/o operation, if the +OS does not provide an atomic pread or pwrite, or similar */ +#define OS_FILE_N_SEEK_MUTEXES 16 +UNIV_INTERN os_ib_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES]; + +/* In simulated aio, merge at most this many consecutive i/os */ +#define OS_AIO_MERGE_N_CONSECUTIVE 64 + +#ifdef WITH_INNODB_DISALLOW_WRITES +#define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event) +#else +#define WAIT_ALLOW_WRITES() do { } while (0) +#endif /* WITH_INNODB_DISALLOW_WRITES */ + +/********************************************************************** + +InnoDB AIO Implementation: +========================= + +We support native AIO for windows and linux. For rest of the platforms +we simulate AIO by special io-threads servicing the IO-requests. + +Simulated AIO: +============== + +In platforms where we 'simulate' AIO following is a rough explanation +of the high level design. +There are four io-threads (for ibuf, log, read, write). +All synchronous IO requests are serviced by the calling thread using +os_file_write/os_file_read. The Asynchronous requests are queued up +in an array (there are four such arrays) by the calling thread. +Later these requests are picked up by the io-thread and are serviced +synchronously. + +Windows native AIO: +================== + +If srv_use_native_aio is not set then windows follow the same +code as simulated AIO. If the flag is set then native AIO interface +is used. 
On Windows, one limitation is that if a file is opened
+for AIO, no synchronous IO can be done on it. Therefore we have an
+extra fifth array to queue up synchronous IO requests.
+There are innodb_file_io_threads helper threads. These threads work
+on the four arrays mentioned above in Simulated AIO. No thread is
+required for the sync array.
+If a synchronous IO request is made, it is first queued in the sync
+array. Then the calling thread itself waits on the request, thus
+making the call synchronous.
+If an AIO request is made, the calling thread not only queues it in the
+array but also submits the request. The helper thread then collects
+the completed IO request and calls the completion routine on it.
+
+Linux native AIO:
+=================
+
+If we have libaio installed on the system and innodb_use_native_aio
+is set to TRUE, we follow the code path of native AIO; otherwise we
+do simulated AIO.
+There are innodb_file_io_threads helper threads. These threads work
+on the four arrays mentioned above in Simulated AIO.
+If a synchronous IO request is made, it is handled by calling
+os_file_write/os_file_read.
+If an AIO request is made, the calling thread not only queues it in the
+array but also submits the request. The helper thread then collects
+the completed IO request and calls the completion routine on it.
+ +**********************************************************************/ + +/** Flag: enable debug printout for asynchronous i/o */ +UNIV_INTERN ibool os_aio_print_debug = FALSE; + +#ifdef UNIV_PFS_IO +/* Keys to register InnoDB I/O with performance schema */ +UNIV_INTERN mysql_pfs_key_t innodb_file_data_key; +UNIV_INTERN mysql_pfs_key_t innodb_file_log_key; +UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key; +UNIV_INTERN mysql_pfs_key_t innodb_file_bmp_key; +#endif /* UNIV_PFS_IO */ + +/** The asynchronous i/o array slot structure */ +struct os_aio_slot_t{ +#ifdef WIN_ASYNC_IO + OVERLAPPED control; /*!< Windows control block for the + aio request, MUST be first element in the structure*/ + void *arr; /*!< Array this slot belongs to*/ +#endif + + ibool is_read; /*!< TRUE if a read operation */ + ulint pos; /*!< index of the slot in the aio + array */ + ibool reserved; /*!< TRUE if this slot is reserved */ + time_t reservation_time;/*!< time when reserved */ + ulint len; /*!< length of the block to read or + write */ + byte* buf; /*!< buffer used in i/o */ + ulint type; /*!< OS_FILE_READ or OS_FILE_WRITE */ + os_offset_t offset; /*!< file offset in bytes */ + pfs_os_file_t file; /*!< file where to read or write */ + const char* name; /*!< file name or path */ + ibool io_already_done;/*!< used only in simulated aio: + TRUE if the physical i/o already + made and only the slot message + needs to be passed to the caller + of os_aio_simulated_handle */ + ulint space_id; + fil_node_t* message1; /*!< message which is given by the */ + void* message2; /*!< the requester of an aio operation + and which can be used to identify + which pending aio operation was + completed */ +#ifdef LINUX_NATIVE_AIO + struct iocb control; /* Linux control block for aio */ + int n_bytes; /* bytes written/read. 
*/ + int ret; /* AIO return code */ +#endif /* WIN_ASYNC_IO */ +}; + +/** The asynchronous i/o array structure */ +struct os_aio_array_t{ + os_ib_mutex_t mutex; /*!< the mutex protecting the aio array */ + os_event_t not_full; + /*!< The event which is set to the + signaled state when there is space in + the aio outside the ibuf segment; + os_event_set() and os_event_reset() + are protected by os_aio_array_t::mutex */ + os_event_t is_empty; + /*!< The event which is set to the + signaled state when there are no + pending i/os in this array; + os_event_set() and os_event_reset() + are protected by os_aio_array_t::mutex */ + ulint n_slots;/*!< Total number of slots in the aio + array. This must be divisible by + n_threads. */ + ulint n_segments; + /*!< Number of segments in the aio + array of pending aio requests. A + thread can wait separately for any one + of the segments. */ + ulint cur_seg;/*!< We reserve IO requests in round + robin fashion to different segments. + This points to the segment that is to + be used to service next IO request. */ + ulint n_reserved; + /*!< Number of reserved slots in the + aio array outside the ibuf segment */ + os_aio_slot_t* slots; /*!< Pointer to the slots in the array */ + +#if defined(LINUX_NATIVE_AIO) + io_context_t* aio_ctx; + /* completion queue for IO. There is + one such queue per segment. Each thread + will work on one ctx exclusively. */ + struct io_event* aio_events; + /* The array to collect completed IOs. + There is one such event for each + possible pending IO. The size of the + array is equal to n_slots. */ + struct iocb** pending; + /* Array to buffer the not-submitted aio + requests. The array length is n_slots. + It is divided into n_segments segments. + pending requests on each segment are buffered + separately.*/ + ulint* count; + /* Array of length n_segments. 
Each element + counts the number of not-submitted aio + request on that segment.*/ +#endif /* LINUX_NATIV_AIO */ +}; + +#if defined(LINUX_NATIVE_AIO) +/** timeout for each io_getevents() call = 500ms. */ +#define OS_AIO_REAP_TIMEOUT (500000000UL) + +/** time to sleep, in microseconds if io_setup() returns EAGAIN. */ +#define OS_AIO_IO_SETUP_RETRY_SLEEP (500000UL) + +/** number of attempts before giving up on io_setup(). */ +#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS 5 +#endif + +/** Array of events used in simulated aio. */ +static os_event_t* os_aio_segment_wait_events; + +/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These +are NULL when the module has not yet been initialized. @{ */ +static os_aio_array_t* os_aio_read_array = NULL; /*!< Reads */ +static os_aio_array_t* os_aio_write_array = NULL; /*!< Writes */ +static os_aio_array_t* os_aio_ibuf_array = NULL; /*!< Insert buffer */ +static os_aio_array_t* os_aio_log_array = NULL; /*!< Redo log */ +static os_aio_array_t* os_aio_sync_array = NULL; /*!< Synchronous I/O */ +/* @} */ + +/** Number of asynchronous I/O segments. Set by os_aio_init(). */ +static ulint os_aio_n_segments = ULINT_UNDEFINED; + +/** If the following is TRUE, read i/o handler threads try to +wait until a batch of new read requests have been posted */ +static ibool os_aio_recommend_sleep_for_read_threads = FALSE; +#endif /* !UNIV_HOTBACKUP */ + +UNIV_INTERN ulint os_n_file_reads = 0; +UNIV_INTERN ulint os_bytes_read_since_printout = 0; +UNIV_INTERN ulint os_n_file_writes = 0; +UNIV_INTERN ulint os_n_fsyncs = 0; +UNIV_INTERN ulint os_n_file_reads_old = 0; +UNIV_INTERN ulint os_n_file_writes_old = 0; +UNIV_INTERN ulint os_n_fsyncs_old = 0; +UNIV_INTERN time_t os_last_printout; + +UNIV_INTERN ibool os_has_said_disk_full = FALSE; + +#ifdef UNIV_DEBUG +# ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Validates the consistency the aio system some of the time. 
+@return TRUE if ok or the check was skipped */ +UNIV_INTERN +ibool +os_aio_validate_skip(void) +/*======================*/ +{ +/** Try os_aio_validate() every this many times */ +# define OS_AIO_VALIDATE_SKIP 13 + + /** The os_aio_validate() call skip counter. + Use a signed type because of the race condition below. */ + static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP; + + /* There is a race condition below, but it does not matter, + because this call is only for heuristic purposes. We want to + reduce the call frequency of the costly os_aio_validate() + check in debug builds. */ + if (--os_aio_validate_count > 0) { + return(TRUE); + } + + os_aio_validate_count = OS_AIO_VALIDATE_SKIP; + return(os_aio_validate()); +} +# endif /* !UNIV_HOTBACKUP */ +#endif /* UNIV_DEBUG */ + +#ifdef _WIN32 +/** IO completion port used by background io threads */ +static HANDLE completion_port; +/** IO completion port used by background io READ threads */ +static HANDLE read_completion_port; +/** Thread local storage index for the per-thread event used for synchronous IO */ +static DWORD tls_sync_io = TLS_OUT_OF_INDEXES; +#endif + +#ifdef __WIN__ +/***********************************************************************//** +Gets the operating system version. Currently works only on Windows. +@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA, +OS_WIN7. */ +UNIV_INTERN +ulint +os_get_os_version(void) +/*===================*/ +{ + OSVERSIONINFO os_info; + + os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); + + ut_a(GetVersionEx(&os_info)); + + if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) { + return(OS_WIN31); + } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) { + return(OS_WIN95); + } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) { + switch (os_info.dwMajorVersion) { + case 3: + case 4: + return(OS_WINNT); + case 5: + return (os_info.dwMinorVersion == 0) + ? 
OS_WIN2000 : OS_WINXP; + case 6: + return (os_info.dwMinorVersion == 0) + ? OS_WINVISTA : OS_WIN7; + default: + return(OS_WIN7); + } + } else { + ut_error; + return(0); + } +} +#endif /* __WIN__ */ + + +#ifdef _WIN32 +/* +Windows : Handling synchronous IO on files opened asynchronously. + +If file is opened for asynchronous IO (FILE_FLAG_OVERLAPPED) and also bound to +a completion port, then every IO on this file would normally be enqueued to the +completion port. Sometimes however we would like to do a synchronous IO. This is +possible if we initialitze have overlapped.hEvent with a valid event and set its +lowest order bit to 1 (see MSDN ReadFile and WriteFile description for more info) + +We'll create this special event once for each thread and store in thread local +storage. +*/ + + +/***********************************************************************//** +Initialize tls index.for event handle used for synchronized IO on files that +might be opened with FILE_FLAG_OVERLAPPED. +*/ +static void win_init_syncio_event() +{ + tls_sync_io = TlsAlloc(); + ut_a(tls_sync_io != TLS_OUT_OF_INDEXES); +} + +/***********************************************************************//** +Retrieve per-thread event for doing synchronous io on asyncronously opened files +*/ +static HANDLE win_get_syncio_event() +{ + HANDLE h; + if(tls_sync_io == TLS_OUT_OF_INDEXES){ + win_init_syncio_event(); + } + + h = (HANDLE)TlsGetValue(tls_sync_io); + if (h) + return h; + h = CreateEventA(NULL, FALSE, FALSE, NULL); + ut_a(h); + h = (HANDLE)((uintptr_t)h | 1); + TlsSetValue(tls_sync_io, h); + return h; +} + +/* + TLS destructor, inspired by Chromium code + http://src.chromium.org/svn/trunk/src/base/threading/thread_local_storage_wi... 
+*/ + +static void win_free_syncio_event() +{ + HANDLE h = win_get_syncio_event(); + if (h) { + CloseHandle(h); + } +} + +static void NTAPI win_tls_thread_exit(PVOID module, DWORD reason, PVOID reserved) { + if (DLL_THREAD_DETACH == reason || DLL_PROCESS_DETACH == reason) + win_free_syncio_event(); +} + +extern "C" { +#ifdef _WIN64 +#pragma comment(linker, "/INCLUDE:_tls_used") +#pragma comment(linker, "/INCLUDE:p_thread_callback_base") +#pragma const_seg(".CRT$XLB") +extern const PIMAGE_TLS_CALLBACK p_thread_callback_base; +const PIMAGE_TLS_CALLBACK p_thread_callback_base = win_tls_thread_exit; +#pragma data_seg() +#else +#pragma comment(linker, "/INCLUDE:__tls_used") +#pragma comment(linker, "/INCLUDE:_p_thread_callback_base") +#pragma data_seg(".CRT$XLB") +PIMAGE_TLS_CALLBACK p_thread_callback_base = win_tls_thread_exit; +#pragma data_seg() +#endif +} +#endif /*_WIN32 */ + +/***********************************************************************//** +For an EINVAL I/O error, prints a diagnostic message if innodb_flush_method +== ALL_O_DIRECT. 
+@return true if the diagnostic message was printed
+@return false if the diagnostic message does not apply */
+static
+bool
+os_diagnose_all_o_direct_einval(
+/*============================*/
+	ulint	err)	/*!< in: C error code */
+{
+	/* The hint only applies to EINVAL under ALL_O_DIRECT */
+	if (err != EINVAL
+	    || srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT) {
+		return(false);
+	}
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"The error might be caused by redo log I/O not "
+		"satisfying innodb_flush_method=ALL_O_DIRECT "
+		"requirements by the underlying file system.");
+
+	/* Point at the log block size only when it is not 512 */
+	if (srv_log_block_size != 512) {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"This might be caused by an incompatible "
+			"non-default innodb_log_block_size value %lu.",
+			srv_log_block_size);
+	}
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Please file a bug at https://bugs.percona.com and "
+		"include this error message, my.cnf settings, and "
+		"information about the file system where the redo log "
+		"resides.");
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"A possible workaround is to change "
+		"innodb_flush_method value to something else "
+		"than ALL_O_DIRECT.");
+
+	return(true);
+}
+
+/***********************************************************************//**
+Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + 100 is returned.
+@return error number, or OS error number + 100 */ +static +ulint +os_file_get_last_error_low( +/*=======================*/ + bool report_all_errors, /*!< in: TRUE if we want an error + message printed of all errors */ + bool on_error_silent) /*!< in: TRUE then don't print any + diagnostic to the log */ +{ +#ifdef __WIN__ + + ulint err = (ulint) GetLastError(); + if (err == ERROR_SUCCESS) { + return(0); + } + + if (report_all_errors + || (!on_error_silent + && err != ERROR_DISK_FULL + && err != ERROR_FILE_EXISTS)) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Operating system error number %lu" + " in a file operation.\n", (ulong) err); + + if (err == ERROR_PATH_NOT_FOUND) { + fprintf(stderr, + "InnoDB: The error means the system" + " cannot find the path specified.\n"); + + if (srv_is_being_started) { + fprintf(stderr, + "InnoDB: If you are installing InnoDB," + " remember that you must create\n" + "InnoDB: directories yourself, InnoDB" + " does not create them.\n"); + } + } else if (err == ERROR_ACCESS_DENIED) { + fprintf(stderr, + "InnoDB: The error means mysqld does not have" + " the access rights to\n" + "InnoDB: the directory. It may also be" + " you have created a subdirectory\n" + "InnoDB: of the same name as a data file.\n"); + } else if (err == ERROR_SHARING_VIOLATION + || err == ERROR_LOCK_VIOLATION) { + fprintf(stderr, + "InnoDB: The error means that another program" + " is using InnoDB's files.\n" + "InnoDB: This might be a backup or antivirus" + " software or another instance\n" + "InnoDB: of MySQL." 
+ " Please close it to get rid of this error.\n"); + } else if (err == ERROR_WORKING_SET_QUOTA + || err == ERROR_NO_SYSTEM_RESOURCES) { + fprintf(stderr, + "InnoDB: The error means that there are no" + " sufficient system resources or quota to" + " complete the operation.\n"); + } else if (err == ERROR_OPERATION_ABORTED) { + fprintf(stderr, + "InnoDB: The error means that the I/O" + " operation has been aborted\n" + "InnoDB: because of either a thread exit" + " or an application request.\n" + "InnoDB: Retry attempt is made.\n"); + } else { + fprintf(stderr, + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + REFMAN + "operating-system-error-codes.html\n"); + } + } + + fflush(stderr); + + if (err == ERROR_FILE_NOT_FOUND) { + return(OS_FILE_NOT_FOUND); + } else if (err == ERROR_DISK_FULL) { + return(OS_FILE_DISK_FULL); + } else if (err == ERROR_FILE_EXISTS) { + return(OS_FILE_ALREADY_EXISTS); + } else if (err == ERROR_SHARING_VIOLATION + || err == ERROR_LOCK_VIOLATION) { + return(OS_FILE_SHARING_VIOLATION); + } else if (err == ERROR_WORKING_SET_QUOTA + || err == ERROR_NO_SYSTEM_RESOURCES) { + return(OS_FILE_INSUFFICIENT_RESOURCE); + } else if (err == ERROR_OPERATION_ABORTED) { + return(OS_FILE_OPERATION_ABORTED); + } else if (err == ERROR_ACCESS_DENIED) { + return(OS_FILE_ACCESS_VIOLATION); + } else if (err == ERROR_BUFFER_OVERFLOW) { + return(OS_FILE_NAME_TOO_LONG); + } else { + return(OS_FILE_ERROR_MAX + err); + } +#else + int err = errno; + if (err == 0) { + return(0); + } + + if (report_all_errors + || (err != ENOSPC && err != EEXIST && !on_error_silent)) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Operating system error number %d" + " in a file operation.\n", err); + + if (err == ENOENT) { + fprintf(stderr, + "InnoDB: The error means the system" + " cannot find the path specified.\n"); + + if (srv_is_being_started) { + fprintf(stderr, + "InnoDB: If you are installing InnoDB," + " remember that you must 
create\n" + "InnoDB: directories yourself, InnoDB" + " does not create them.\n"); + } + } else if (err == EACCES) { + fprintf(stderr, + "InnoDB: The error means mysqld does not have" + " the access rights to\n" + "InnoDB: the directory.\n"); + } else if (!os_diagnose_all_o_direct_einval(err)) { + if (strerror(err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d" + " means '%s'.\n", + err, strerror(err)); + } + + + fprintf(stderr, + "InnoDB: Some operating system" + " error numbers are described at\n" + "InnoDB: " + REFMAN + "operating-system-error-codes.html\n"); + } + } + + fflush(stderr); + + switch (err) { + case ENOSPC: + return(OS_FILE_DISK_FULL); + case ENOENT: + return(OS_FILE_NOT_FOUND); + case EEXIST: + return(OS_FILE_ALREADY_EXISTS); + case ENAMETOOLONG: + return(OS_FILE_NAME_TOO_LONG); + case EXDEV: + case ENOTDIR: + case EISDIR: + return(OS_FILE_PATH_ERROR); + case EAGAIN: + if (srv_use_native_aio) { + return(OS_FILE_AIO_RESOURCES_RESERVED); + } + break; + case EINTR: + if (srv_use_native_aio) { + return(OS_FILE_AIO_INTERRUPTED); + } + break; + case EACCES: + return(OS_FILE_ACCESS_VIOLATION); + } + return(OS_FILE_ERROR_MAX + err); +#endif +} + +/***********************************************************************//** +Retrieves the last error number if an error occurs in a file io function. +The number should be retrieved before any other OS calls (because they may +overwrite the error number). If the number is not known to this program, +the OS error number + 100 is returned. +@return error number, or OS error number + 100 */ +UNIV_INTERN +ulint +os_file_get_last_error( +/*===================*/ + bool report_all_errors) /*!< in: TRUE if we want an error + message printed of all errors */ +{ + return(os_file_get_last_error_low(report_all_errors, false)); +} + +/****************************************************************//** +Does error handling when a file operation fails. 
+Conditionally exits (calling exit(3)) based on should_exit value and the +error type, if should_exit is TRUE then on_error_silent is ignored. +@return TRUE if we should retry the operation */ +static +ibool +os_file_handle_error_cond_exit( +/*===========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool should_exit, /*!< in: call exit(3) if unknown error + and this parameter is TRUE */ + ibool on_error_silent)/*!< in: if TRUE then don't print + any message to the log iff it is + an unknown non-fatal error */ +{ + ulint err; + + err = os_file_get_last_error_low(false, on_error_silent); + + switch (err) { + case OS_FILE_DISK_FULL: + /* We only print a warning about disk full once */ + + if (os_has_said_disk_full) { + + return(FALSE); + } + + /* Disk full error is reported irrespective of the + on_error_silent setting. */ + + if (name) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Encountered a problem with" + " file %s\n", name); + } + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Disk is full. Try to clean the disk" + " to free space.\n"); + + os_has_said_disk_full = TRUE; + + fflush(stderr); + + ut_error; + return(FALSE); + + case OS_FILE_AIO_RESOURCES_RESERVED: + case OS_FILE_AIO_INTERRUPTED: + + return(TRUE); + + case OS_FILE_PATH_ERROR: + case OS_FILE_ALREADY_EXISTS: + case OS_FILE_ACCESS_VIOLATION: + + return(FALSE); + + case OS_FILE_SHARING_VIOLATION: + + os_thread_sleep(10000000); /* 10 sec */ + return(TRUE); + + case OS_FILE_OPERATION_ABORTED: + case OS_FILE_INSUFFICIENT_RESOURCE: + + os_thread_sleep(100000); /* 100 ms */ + return(TRUE); + + default: + + /* If it is an operation that can crash on error then it + is better to ignore on_error_silent and print an error message + to the log. */ + + if (should_exit || !on_error_silent) { + ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS " + "error " ULINTPF ".%s", name ? 
name : "(unknown)", + operation, err, should_exit + ? " Cannot continue operation" : ""); + } + + if (should_exit) { + exit(1); + } + } + + return(FALSE); +} + +/****************************************************************//** +Does error handling when a file operation fails. +@return TRUE if we should retry the operation */ +static +ibool +os_file_handle_error( +/*=================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation) /*!< in: operation */ +{ + /* exit in case of unknown error */ + return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE)); +} + +/****************************************************************//** +Does error handling when a file operation fails. +@return TRUE if we should retry the operation */ +ibool +os_file_handle_error_no_exit( +/*=========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool on_error_silent)/*!< in: if TRUE then don't print + any message to the log. */ +{ + /* don't exit in case of unknown error */ + return(os_file_handle_error_cond_exit( + name, operation, FALSE, on_error_silent)); +} + +#undef USE_FILE_LOCK +#define USE_FILE_LOCK +#if defined(UNIV_HOTBACKUP) || defined(__WIN__) +/* InnoDB Hot Backup does not lock the data files. + * On Windows, mandatory locking is used. + */ +# undef USE_FILE_LOCK +#endif +#ifdef USE_FILE_LOCK +/****************************************************************//** +Obtain an exclusive lock on a file. 
+@return 0 on success */ +static +int +os_file_lock( +/*=========*/ + int fd, /*!< in: file descriptor */ + const char* name) /*!< in: file name */ +{ + struct flock lk; + + ut_ad(!srv_read_only_mode); + + lk.l_type = F_WRLCK; + lk.l_whence = SEEK_SET; + lk.l_start = lk.l_len = 0; + + if (fcntl(fd, F_SETLK, &lk) == -1) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to lock %s, error: %d", name, errno); + + if (errno == EAGAIN || errno == EACCES) { + ib_logf(IB_LOG_LEVEL_INFO, + "Check that you do not already have " + "another mysqld process using the " + "same InnoDB data or log files."); + } + + return(-1); + } + + return(0); +} +#endif /* USE_FILE_LOCK */ + +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Creates the seek mutexes used in positioned reads and writes. */ +static +void +os_io_init_simple(void) +/*===================*/ +{ + for (ulint i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) { + os_file_seek_mutexes[i] = os_mutex_create(); + } +#ifdef _WIN32 + win_init_syncio_event(); +#endif +} + +/** Create a temporary file. This function is like tmpfile(3), but +the temporary file is created in the given parameter path. If the path +is null then it will create the file in the mysql server configuration +parameter (--tmpdir). 
+@param[in] path location for creating temporary file +@return temporary file handle, or NULL on error */ +UNIV_INTERN +FILE* +os_file_create_tmpfile( + const char* path) +{ + FILE* file = NULL; + int fd; + WAIT_ALLOW_WRITES(); + fd = innobase_mysql_tmpfile(path); + + ut_ad(!srv_read_only_mode); + + if (fd >= 0) { + file = fdopen(fd, "w+b"); + } + + if (!file) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: unable to create temporary file;" + " errno: %d\n", errno); + if (fd >= 0) { + close(fd); + } + } + + return(file); +} +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************************//** +The os_file_opendir() function opens a directory stream corresponding to the +directory named by the dirname argument. The directory stream is positioned +at the first entry. In both Unix and Windows we automatically skip the '.' +and '..' items at the start of the directory listing. +@return directory stream, NULL if error */ +UNIV_INTERN +os_file_dir_t +os_file_opendir( +/*============*/ + const char* dirname, /*!< in: directory name; it must not + contain a trailing '\' or '/' */ + ibool error_is_fatal) /*!< in: TRUE if we should treat an + error as a fatal error; if we try to + open symlinks then we do not wish a + fatal error if it happens not to be + a directory */ +{ + os_file_dir_t dir; +#ifdef __WIN__ + LPWIN32_FIND_DATA lpFindFileData; + char path[OS_FILE_MAX_PATH + 3]; + + ut_a(strlen(dirname) < OS_FILE_MAX_PATH); + + strcpy(path, dirname); + strcpy(path + strlen(path), "\\*"); + + /* Note that in Windows opening the 'directory stream' also retrieves + the first entry in the directory. Since it is '.', that is no problem, + as we will skip over the '.' and '..' entries anyway. 
*/ + + lpFindFileData = static_cast<LPWIN32_FIND_DATA>( + ut_malloc(sizeof(WIN32_FIND_DATA))); + + dir = FindFirstFile((LPCTSTR) path, lpFindFileData); + + ut_free(lpFindFileData); + + if (dir == INVALID_HANDLE_VALUE) { + + if (error_is_fatal) { + os_file_handle_error(dirname, "opendir"); + } + + return(NULL); + } + + return(dir); +#else + dir = opendir(dirname); + + if (dir == NULL && error_is_fatal) { + os_file_handle_error(dirname, "opendir"); + } + + return(dir); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Closes a directory stream. +@return 0 if success, -1 if failure */ +UNIV_INTERN +int +os_file_closedir( +/*=============*/ + os_file_dir_t dir) /*!< in: directory stream */ +{ +#ifdef __WIN__ + BOOL ret; + + ret = FindClose(dir); + + if (!ret) { + os_file_handle_error_no_exit(NULL, "closedir", FALSE); + + return(-1); + } + + return(0); +#else + int ret; + + ret = closedir(dir); + + if (ret) { + os_file_handle_error_no_exit(NULL, "closedir", FALSE); + } + + return(ret); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +This function returns information of the next file in the directory. We jump +over the '.' and '..' entries in the directory. 
+@return 0 if ok, -1 if error, 1 if at the end of the directory */ +UNIV_INTERN +int +os_file_readdir_next_file( +/*======================*/ + const char* dirname,/*!< in: directory name or path */ + os_file_dir_t dir, /*!< in: directory stream */ + os_file_stat_t* info) /*!< in/out: buffer where the info is returned */ +{ +#ifdef __WIN__ + LPWIN32_FIND_DATA lpFindFileData; + BOOL ret; + + lpFindFileData = static_cast<LPWIN32_FIND_DATA>( + ut_malloc(sizeof(WIN32_FIND_DATA))); +next_file: + ret = FindNextFile(dir, lpFindFileData); + + if (ret) { + ut_a(strlen((char*) lpFindFileData->cFileName) + < OS_FILE_MAX_PATH); + + if (strcmp((char*) lpFindFileData->cFileName, ".") == 0 + || strcmp((char*) lpFindFileData->cFileName, "..") == 0) { + + goto next_file; + } + + strcpy(info->name, (char*) lpFindFileData->cFileName); + + info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow) + + (((ib_int64_t)(lpFindFileData->nFileSizeHigh)) + << 32); + + if (lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_REPARSE_POINT) { + /* TODO: test Windows symlinks */ + /* TODO: MySQL has apparently its own symlink + implementation in Windows, dbname.sym can + redirect a database directory: + REFMAN "windows-symbolic-links.html" */ + info->type = OS_FILE_TYPE_LINK; + } else if (lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_DIRECTORY) { + info->type = OS_FILE_TYPE_DIR; + } else { + /* It is probably safest to assume that all other + file types are normal. Better to check them rather + than blindly skip them. 
*/ + + info->type = OS_FILE_TYPE_FILE; + } + } + + ut_free(lpFindFileData); + + if (ret) { + return(0); + } else if (GetLastError() == ERROR_NO_MORE_FILES) { + + return(1); + } else { + os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE); + return(-1); + } +#else + struct dirent* ent; + char* full_path; + int ret; + struct stat statinfo; + +next_file: + + ent = readdir(dir); + + if (ent == NULL) { + + return(1); + } + ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH); + + if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) { + + goto next_file; + } + + strcpy(info->name, ent->d_name); + + full_path = static_cast<char*>( + ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10)); + + sprintf(full_path, "%s/%s", dirname, ent->d_name); + + ret = stat(full_path, &statinfo); + + if (ret) { + + if (errno == ENOENT) { + /* readdir() returned a file that does not exist, + it must have been deleted in the meantime. Do what + would have happened if the file was deleted before + readdir() - ignore and go to the next entry. + If this is the last entry then info->name will still + contain the name of the deleted file when this + function returns, but this is not an issue since the + caller shouldn't be looking at info when end of + directory is returned. */ + + ut_free(full_path); + + goto next_file; + } + + os_file_handle_error_no_exit(full_path, "stat", FALSE); + + ut_free(full_path); + + return(-1); + } + + info->size = (ib_int64_t) statinfo.st_size; + + if (S_ISDIR(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_DIR; + } else if (S_ISLNK(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_LINK; + } else if (S_ISREG(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_FILE; + } else { + info->type = OS_FILE_TYPE_UNKNOWN; + } + + ut_free(full_path); + + return(0); +#endif +} + +/*****************************************************************//** +This function attempts to create a directory named pathname. The new +directory gets default permissions. 
On Unix the permissions are +(0770 & ~umask). If the directory exists already, nothing is done and +the call succeeds, unless the fail_if_exists arguments is true. +If another error occurs, such as a permission error, this does not crash, +but reports the error and returns FALSE. +@return TRUE if call succeeds, FALSE on error */ +UNIV_INTERN +ibool +os_file_create_directory( +/*=====================*/ + const char* pathname, /*!< in: directory name as + null-terminated string */ + ibool fail_if_exists) /*!< in: if TRUE, pre-existing directory + is treated as an error. */ +{ +#ifdef __WIN__ + BOOL rcode; + + rcode = CreateDirectory((LPCTSTR) pathname, NULL); + if (!(rcode != 0 + || (GetLastError() == ERROR_ALREADY_EXISTS + && !fail_if_exists))) { + + os_file_handle_error_no_exit( + pathname, "CreateDirectory", FALSE); + + return(FALSE); + } + + return(TRUE); +#else + int rcode; + WAIT_ALLOW_WRITES(); + + rcode = mkdir(pathname, 0770); + + if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { + /* failure */ + os_file_handle_error_no_exit(pathname, "mkdir", FALSE); + + return(FALSE); + } + + return (TRUE); +#endif /* __WIN__ */ +} + +/****************************************************************//** +NOTE! Use the corresponding macro os_file_create_simple(), not directly +this function! +A simple function to open or create a file. 
@return own: handle to the file, not defined if error, error number
can be retrieved with os_file_get_last_error */
UNIV_INTERN
os_file_t
os_file_create_simple_func(
/*=======================*/
	const char*	name,	/*!< in: name of the file or path as a
				null-terminated string */
	ulint		create_mode,/*!< in: create mode */
	ulint		access_type,/*!< in: OS_FILE_READ_ONLY or
				OS_FILE_READ_WRITE */
	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
{
	os_file_t	file;
	ibool		retry;

	/* Early returns below leave *success as FALSE, except that the
	CREATE_PATH branch overwrites it with the subdirectory result. */
	*success = FALSE;
#ifdef __WIN__
	DWORD		access;
	DWORD		create_flag;
	DWORD		attributes = 0;

	/* The error-handling modifier bits are not accepted here. */
	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));

	/* Map create_mode to a CreateFile() disposition. In read-only
	mode every mode other than OS_FILE_OPEN also opens an existing
	file. */
	if (create_mode == OS_FILE_OPEN) {

		create_flag = OPEN_EXISTING;

	} else if (srv_read_only_mode) {

		create_flag = OPEN_EXISTING;

	} else if (create_mode == OS_FILE_CREATE) {

		create_flag = CREATE_NEW;

	} else if (create_mode == OS_FILE_CREATE_PATH) {

		ut_a(!srv_read_only_mode);

		/* Create subdirs along the path if needed */
		*success = os_file_create_subdirs_if_needed(name);

		if (!*success) {

			ib_logf(IB_LOG_LEVEL_ERROR,
				"Unable to create subdirectories '%s'",
				name);

			return((os_file_t) -1);
		}

		/* From here on behave exactly like OS_FILE_CREATE. */
		create_flag = CREATE_NEW;
		create_mode = OS_FILE_CREATE;

	} else {
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Unknown file create mode (%lu) for file '%s'",
			create_mode, name);

		return((os_file_t) -1);
	}

	/* Map access_type to the desired access; read-only mode
	downgrades a read-write request to read access. */
	if (access_type == OS_FILE_READ_ONLY) {
		access = GENERIC_READ;
	} else if (srv_read_only_mode) {

		ib_logf(IB_LOG_LEVEL_INFO,
			"read only mode set. Unable to "
			"open file '%s' in RW mode, trying RO mode", name);

		access = GENERIC_READ;

	} else if (access_type == OS_FILE_READ_WRITE) {
		access = GENERIC_READ | GENERIC_WRITE;
	} else {
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Unknown file access type (%lu) for file '%s'",
			access_type, name);

		return((os_file_t) -1);
	}

	/* Repeat the open for as long as the error handler asks for a
	retry. */
	do {
		/* Use default security attributes and no template file. */

		file = CreateFile(
			(LPCTSTR) name, access, FILE_SHARE_READ, NULL,
			create_flag, attributes, NULL);

		if (file == INVALID_HANDLE_VALUE) {

			*success = FALSE;

			retry = os_file_handle_error(
				name, create_mode == OS_FILE_OPEN ?
				"open" : "create");

		} else {
			*success = TRUE;
			retry = false;
		}

	} while (retry);

#else /* __WIN__ */
	int		create_flag;
	/* Only modes other than plain opens wait for writes to be
	allowed. */
	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
		WAIT_ALLOW_WRITES();

	/* The error-handling modifier bits are not accepted here. */
	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));

	/* Map create_mode and access_type to open(2) flags. In read-only
	mode the file is always opened O_RDONLY. */
	if (create_mode == OS_FILE_OPEN) {

		if (access_type == OS_FILE_READ_ONLY) {
			create_flag = O_RDONLY;
		} else if (srv_read_only_mode) {
			create_flag = O_RDONLY;
		} else {
			create_flag = O_RDWR;
		}

	} else if (srv_read_only_mode) {

		create_flag = O_RDONLY;

	} else if (create_mode == OS_FILE_CREATE) {

		create_flag = O_RDWR | O_CREAT | O_EXCL;

	} else if (create_mode == OS_FILE_CREATE_PATH) {

		/* Create subdirs along the path if needed */

		*success = os_file_create_subdirs_if_needed(name);

		if (!*success) {

			ib_logf(IB_LOG_LEVEL_ERROR,
				"Unable to create subdirectories '%s'",
				name);

			return((os_file_t) -1);
		}

		/* From here on behave exactly like OS_FILE_CREATE. */
		create_flag = O_RDWR | O_CREAT | O_EXCL;
		create_mode = OS_FILE_CREATE;
	} else {

		ib_logf(IB_LOG_LEVEL_ERROR,
			"Unknown file create mode (%lu) for file '%s'",
			create_mode, name);

		return((os_file_t) -1);
	}

	/* Repeat the open for as long as the error handler asks for a
	retry. O_CLOEXEC is always added to the open flags. */
	do {
		file = ::open(name, create_flag | O_CLOEXEC, os_innodb_umask);

		if (file == -1) {
			*success = FALSE;

			retry = os_file_handle_error(
				name,
				create_mode == OS_FILE_OPEN
				?  "open" : "create");
		} else {
			*success = TRUE;
			retry = false;
		}

	} while (retry);

#ifdef USE_FILE_LOCK
	/* Files opened read-write are locked (outside read-only mode);
	if the lock cannot be taken the open is reported as failed and
	the descriptor is closed. */
	if (!srv_read_only_mode
	    && *success
	    && access_type == OS_FILE_READ_WRITE
	    && os_file_lock(file, name)) {

		*success = FALSE;
		close(file);
		file = -1;
	}
#endif /* USE_FILE_LOCK */

#endif /* __WIN__ */

	return(file);
}

/** Disable OS I/O caching on the file if the file type and server
configuration requires it.
@param file handle to the file
@param name name of the file, for diagnostics
@param mode_str operation on the file, for diagnostics
@param type OS_LOG_FILE or OS_DATA_FILE
@param access_type if OS_FILE_READ_WRITE_CACHED, then caching will be disabled
unconditionally, ignored otherwise */
static
void
os_file_set_nocache_if_needed(os_file_t file, const char* name,
			      const char *mode_str, ulint type,
			      ulint access_type)
{
	/* Nothing is changed in read-only mode or when the caller asked
	for OS_FILE_READ_WRITE_CACHED. */
	if (srv_read_only_mode || access_type == OS_FILE_READ_WRITE_CACHED)
		return;

	/* ALL_O_DIRECT applies to every file type; O_DIRECT and
	O_DIRECT_NO_FSYNC apply to everything except log files. */
	if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT
	    || (type != OS_LOG_FILE
		&& (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
		    || (srv_unix_file_flush_method
			== SRV_UNIX_O_DIRECT_NO_FSYNC))))
		/* Do fsync() on log files when setting O_DIRECT fails.
		See log_io_complete() */
		/* If disabling the cache fails under ALL_O_DIRECT, the
		global flush method is downgraded to O_DIRECT. */
		if (!os_file_set_nocache(file, name, mode_str)
		    && srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT)
			srv_unix_file_flush_method = SRV_UNIX_O_DIRECT;
}

/****************************************************************//**
NOTE! Use the corresponding macro
os_file_create_simple_no_error_handling(), not directly this function!
A simple function to open or create a file.
@return own: handle to the file, not defined if error, error number
can be retrieved with os_file_get_last_error */
UNIV_INTERN
pfs_os_file_t
os_file_create_simple_no_error_handling_func(
/*=========================================*/
	const char*	name,	/*!< in: name of the file or path as a
				null-terminated string */
	ulint		create_mode,/*!< in: create mode */
	ulint		access_type,/*!< in: OS_FILE_READ_ONLY,
				OS_FILE_READ_WRITE,
				OS_FILE_READ_ALLOW_DELETE (used by a backup
				program reading the file), or
				OS_FILE_READ_WRITE_CACHED (disable O_DIRECT
				if it would be enabled otherwise) */
	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
{
	pfs_os_file_t	file;

	/* Early returns below leave *success as FALSE. */
	*success = FALSE;
#ifdef __WIN__
	DWORD		access;
	DWORD		create_flag;
	DWORD		attributes	= 0;
	DWORD		share_mode	= FILE_SHARE_READ;
	ut_a(name);

	/* The error-handling modifier bits are not accepted here. */
	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));

	/* Map create_mode to a CreateFile() disposition; in read-only
	mode only existing files are opened. */
	if (create_mode == OS_FILE_OPEN) {
		create_flag = OPEN_EXISTING;
	} else if (srv_read_only_mode) {
		create_flag = OPEN_EXISTING;
	} else if (create_mode == OS_FILE_CREATE) {
		create_flag = CREATE_NEW;
	} else {

		ib_logf(IB_LOG_LEVEL_ERROR,
			"Unknown file create mode (%lu) for file '%s'",
			create_mode, name);
		file.m_file = (os_file_t)-1;
		return(file);
	}

	/* Map access_type to the desired access and share mode. */
	if (access_type == OS_FILE_READ_ONLY) {
		access = GENERIC_READ;
	} else if (srv_read_only_mode) {
		access = GENERIC_READ;
	} else if (access_type == OS_FILE_READ_WRITE
		   || access_type == OS_FILE_READ_WRITE_CACHED) {
		access = GENERIC_READ | GENERIC_WRITE;
	} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {

		ut_a(!srv_read_only_mode);

		access = GENERIC_READ;

		/*!< A backup program has to give mysqld the maximum
		freedom to do what it likes with the file */

		share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
	} else {
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Unknown file access type (%lu) for file '%s'",
			access_type, name);
		file.m_file = (os_file_t)-1;
		return(file);
	}

	/* Single attempt: no error handler is called and nothing is
	retried. */
	file.m_file = CreateFile((LPCTSTR) name,
			  access,
			  share_mode,
			  NULL,			// Security attributes
			  create_flag,
			  attributes,
			  NULL);		// No template file

	*success = (file.m_file != INVALID_HANDLE_VALUE);
#else /* __WIN__ */
	int		create_flag;
	const char*	mode_str	= NULL;
	ut_a(name);
	/* Only modes other than plain opens wait for writes to be
	allowed. */
	if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW)
		WAIT_ALLOW_WRITES();

	/* The error-handling modifier bits are not accepted here. */
	ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
	ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));

	/* Map create_mode and access_type to open(2) flags; mode_str is
	only used for diagnostics when disabling the OS cache. */
	if (create_mode == OS_FILE_OPEN) {

		mode_str = "OPEN";

		if (access_type == OS_FILE_READ_ONLY) {

			create_flag = O_RDONLY;

		} else if (srv_read_only_mode) {

			create_flag = O_RDONLY;

		} else {

			ut_a(access_type == OS_FILE_READ_WRITE
			     || access_type == OS_FILE_READ_ALLOW_DELETE
			     || access_type == OS_FILE_READ_WRITE_CACHED);

			create_flag = O_RDWR;
		}

	} else if (srv_read_only_mode) {

		mode_str = "OPEN";

		create_flag = O_RDONLY;

	} else if (create_mode == OS_FILE_CREATE) {

		mode_str = "CREATE";

		create_flag = O_RDWR | O_CREAT | O_EXCL;

	} else {
		ib_logf(IB_LOG_LEVEL_ERROR,
			"Unknown file create mode (%lu) for file '%s'",
			create_mode, name);
		file.m_file = -1;
		return(file);
	}

	/* Single attempt: no error handler is called and nothing is
	retried. O_CLOEXEC is always added to the open flags. */
	file.m_file = ::open(name, create_flag | O_CLOEXEC, os_innodb_umask);

	*success = file.m_file == -1 ? FALSE : TRUE;

	/* This function is always called for data files, we should disable
	OS caching (O_DIRECT) here as we do in os_file_create_func(), so
	we open the same file in the same mode, see man page of open(2). */
	if (*success) {
		os_file_set_nocache_if_needed(file.m_file, name, mode_str,
					      OS_DATA_FILE, access_type);
	}

#ifdef USE_FILE_LOCK
	/* Files opened for writing are locked (outside read-only mode);
	if the lock cannot be taken the open is reported as failed and
	the descriptor is closed. */
	if (!srv_read_only_mode
	    && *success
	    && (access_type == OS_FILE_READ_WRITE
		|| access_type == OS_FILE_READ_WRITE_CACHED)
	    && os_file_lock(file.m_file, name)) {

		*success = FALSE;
		close(file.m_file);
		file.m_file = -1;

	}
#endif /* USE_FILE_LOCK */

#endif /* __WIN__ */

	return(file);
}

/****************************************************************//**
Tries to disable OS caching on an opened file descriptor.
@return TRUE if operation is success and FALSE otherwise */
UNIV_INTERN
bool
os_file_set_nocache(
/*================*/
	os_file_t	fd		/*!< in: file descriptor to alter */
					MY_ATTRIBUTE((unused)),
	const char*	file_name	/*!< in: used in the diagnostic
					message */
					MY_ATTRIBUTE((unused)),
	const char*	operation_name MY_ATTRIBUTE((unused)))
					/*!< in: "open" or "create"; used
					in the diagnostic message */
{
	/* When neither of the two mechanisms below is compiled in, the
	function does nothing and returns true. */
	/* some versions of Solaris may not have DIRECTIO_ON */
#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
	if (directio(fd, DIRECTIO_ON) == -1) {
		/* Saved so that logging reports the directio() error. */
		int	errno_save = errno;

		ib_logf(IB_LOG_LEVEL_ERROR,
			"Failed to set DIRECTIO_ON on file %s: %s: %s, "
			"continuing anyway.",
			file_name, operation_name, strerror(errno_save));
		return false;
	}
#elif defined(O_DIRECT)
	if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
		/* Saved so that logging reports the fcntl() error. */
		int		errno_save = errno;
		/* The EINVAL warning is emitted once per process. */
		static bool	warning_message_printed = false;
		if (errno_save == EINVAL) {
			if (!warning_message_printed) {
				warning_message_printed = true;
# ifdef UNIV_LINUX
				ib_logf(IB_LOG_LEVEL_WARN,
					"Failed to set O_DIRECT on file "
					"%s: %s: %s, continuing anyway. "
					"O_DIRECT is known to result "
					"in 'Invalid argument' on Linux on "
					"tmpfs, see MySQL Bug#26662.",
					file_name, operation_name,
					strerror(errno_save));
# else /* UNIV_LINUX */
				/* Off Linux, the first EINVAL uses the
				short message below. */
				goto short_warning;
# endif /* UNIV_LINUX */
			}
		} else {
# ifndef UNIV_LINUX
short_warning:
# endif
			ib_logf(IB_LOG_LEVEL_WARN,
				"Failed to set O_DIRECT on file %s: %s: %s, "
				"continuing anyway.",
				file_name, operation_name, strerror(errno_save));
		}
		return false;
	}
#endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
	return true;
}


/****************************************************************//**
Tries to enable the atomic write feature, if available, for the specified file
handle.
@return TRUE if success */
static MY_ATTRIBUTE((warn_unused_result))
ibool
os_file_set_atomic_writes(
/*======================*/
	const char*	name	/*!< in: name of the file */
				MY_ATTRIBUTE((unused)),
	os_file_t	file	/*!< in: handle to the file */
				MY_ATTRIBUTE((unused)))

{
#ifdef DFS_IOCTL_ATOMIC_WRITE_SET
	/* Value handed to the ioctl by address. */
	int	atomic_option = 1;

	if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option)) {

		os_file_handle_error_no_exit(name, "ioctl", FALSE);
		return(FALSE);
	}

	return(TRUE);
#else
	/* Built without DFS_IOCTL_ATOMIC_WRITE_SET: always fails. */
	ib_logf(IB_LOG_LEVEL_ERROR,
		"trying to enable atomic writes on non-supported platform! "
		"Please restart with innodb_use_atomic_writes disabled.");
	return(FALSE);
#endif
}

/****************************************************************//**
NOTE! Use the corresponding macro os_file_create(), not directly
this function!
Opens an existing file or creates a new.
+@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +pfs_os_file_t +os_file_create_func( +/*================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous, + non-buffered i/o is desired, + OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really use + async i/o or unbuffered i/o: look in the + function source code for the exact rules */ + ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ + ibool* success)/*!< out: TRUE if succeed, FALSE if error */ +{ + pfs_os_file_t file; + ibool retry; + ibool on_error_no_exit; + ibool on_error_silent; +#ifdef __WIN__ + DBUG_EXECUTE_IF( + "ib_create_table_fail_disk_full", + *success = FALSE; + SetLastError(ERROR_DISK_FULL); + file.m_file = (os_file_t)-1; + return(file); + ); +#else /* __WIN__ */ + DBUG_EXECUTE_IF( + "ib_create_table_fail_disk_full", + *success = FALSE; + errno = ENOSPC; + file.m_file = -1; + return(file); + ); +#endif /* __WIN__ */ + +#ifdef __WIN__ + DWORD create_flag; + DWORD share_mode = FILE_SHARE_READ; + + on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT + ? TRUE : FALSE; + + on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT + ? TRUE : FALSE; + + create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT; + create_mode &= ~OS_FILE_ON_ERROR_SILENT; + + if (create_mode == OS_FILE_OPEN_RAW) { + + ut_a(!srv_read_only_mode); + + create_flag = OPEN_EXISTING; + + /* On Windows Physical devices require admin privileges and + have to have the write-share mode set. See the remarks + section for the CreateFile() function documentation in MSDN. 
*/ + + share_mode |= FILE_SHARE_WRITE; + + } else if (create_mode == OS_FILE_OPEN + || create_mode == OS_FILE_OPEN_RETRY) { + + create_flag = OPEN_EXISTING; + + } else if (srv_read_only_mode) { + + create_flag = OPEN_EXISTING; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = CREATE_NEW; + + } else if (create_mode == OS_FILE_OVERWRITE) { + + create_flag = CREATE_ALWAYS; + + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + file.m_file = (os_file_t)-1; + return(file); + } + + DWORD attributes = 0; + +#ifdef UNIV_HOTBACKUP + attributes |= FILE_FLAG_NO_BUFFERING; +#else + if (purpose == OS_FILE_AIO) { + +#ifdef WIN_ASYNC_IO + /* If specified, use asynchronous (overlapped) io and no + buffering of writes in the OS */ + + if (srv_use_native_aio) { + attributes |= FILE_FLAG_OVERLAPPED; + } +#endif /* WIN_ASYNC_IO */ + + } else if (purpose == OS_FILE_NORMAL) { + /* Use default setting. */ + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown purpose flag (%lu) while opening file '%s'", + purpose, name); + file.m_file = (os_file_t)-1; + return(file); + } + +#ifdef UNIV_NON_BUFFERED_IO + // TODO: Create a bug, this looks wrong. The flush log + // parameter is dynamic. 
+ if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) { + + /* Do not use unbuffered i/o for the log files because + value 2 denotes that we do not flush the log at every + commit, but only once per second */ + + } else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) { + + attributes |= FILE_FLAG_NO_BUFFERING; + } +#endif /* UNIV_NON_BUFFERED_IO */ + +#endif /* UNIV_HOTBACKUP */ + DWORD access = GENERIC_READ; + + if (!srv_read_only_mode) { + access |= GENERIC_WRITE; + } + + if (type == OS_LOG_FILE) { + if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { + /* Map O_DSYNC to WRITE_THROUGH */ + attributes |= FILE_FLAG_WRITE_THROUGH; + } else if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) { + /* Open log file without buffering */ + attributes |= FILE_FLAG_NO_BUFFERING; + } + } + + do { + /* Use default security attributes and no template file. */ + file.m_file = CreateFile( + (LPCTSTR) name, access, share_mode, NULL, + create_flag, attributes, NULL); + + if (file.m_file == INVALID_HANDLE_VALUE) { + const char* operation; + + operation = (create_mode == OS_FILE_CREATE + && !srv_read_only_mode) + ? 
"create" : "open"; + + *success = FALSE; + + if (on_error_no_exit) { + retry = os_file_handle_error_no_exit( + name, operation, on_error_silent); + } else { + retry = os_file_handle_error(name, operation); + } + } else { + *success = TRUE; + retry = FALSE; + if (srv_use_native_aio && ((attributes & FILE_FLAG_OVERLAPPED) != 0)) { + ut_a(CreateIoCompletionPort(file.m_file, completion_port, 0, 0)); + } + } + + } while (retry); + + if (srv_use_atomic_writes && type == OS_DATA_FILE && + !os_file_set_atomic_writes(name, file.m_file)) { + CloseHandle(file.m_file); + *success = FALSE; + file.m_file = INVALID_HANDLE_VALUE; + } + +#else /* __WIN__ */ + int create_flag; + const char* mode_str = NULL; + if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) + WAIT_ALLOW_WRITES(); + + on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT + ? TRUE : FALSE; + on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT + ? TRUE : FALSE; + + create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT; + create_mode &= ~OS_FILE_ON_ERROR_SILENT; + + if (create_mode == OS_FILE_OPEN + || create_mode == OS_FILE_OPEN_RAW + || create_mode == OS_FILE_OPEN_RETRY) { + + mode_str = "OPEN"; + + create_flag = srv_read_only_mode ? 
O_RDONLY : O_RDWR; + + } else if (srv_read_only_mode) { + + mode_str = "OPEN"; + + create_flag = O_RDONLY; + + } else if (create_mode == OS_FILE_CREATE) { + + mode_str = "CREATE"; + create_flag = O_RDWR | O_CREAT | O_EXCL; + + } else if (create_mode == OS_FILE_OVERWRITE) { + + mode_str = "OVERWRITE"; + create_flag = O_RDWR | O_CREAT | O_TRUNC; + + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + file.m_file = -1; + return(file); + } + + ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE); + ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL); + +#ifdef O_SYNC + /* We let O_SYNC only affect log files; note that we map O_DSYNC to + O_SYNC because the datasync options seemed to corrupt files in 2001 + in both Linux and Solaris */ + + if (!srv_read_only_mode + && type == OS_LOG_FILE + && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { + + create_flag |= O_SYNC; + } +#endif /* O_SYNC */ + + do { - file.m_file = ::open(name, create_flag, os_innodb_umask); ++ file.m_file = ::open(name, create_flag | O_CLOEXEC, os_innodb_umask); + + if (file.m_file == -1) { + const char* operation; + + operation = (create_mode == OS_FILE_CREATE + && !srv_read_only_mode) + ? 
"create" : "open"; + + *success = FALSE; + + if (on_error_no_exit) { + retry = os_file_handle_error_no_exit( + name, operation, on_error_silent); + } else { + retry = os_file_handle_error(name, operation); + } + } else { + *success = TRUE; + retry = false; + } + + } while (retry); + + if (*success) { + + os_file_set_nocache_if_needed(file.m_file, name, mode_str, + type, 0); + } + +#ifdef USE_FILE_LOCK + if (!srv_read_only_mode + && *success + && create_mode != OS_FILE_OPEN_RAW + && os_file_lock(file.m_file, name)) { + + if (create_mode == OS_FILE_OPEN_RETRY) { + + ut_a(!srv_read_only_mode); + + ib_logf(IB_LOG_LEVEL_INFO, + "Retrying to lock the first data file"); + + for (int i = 0; i < 100; i++) { + os_thread_sleep(1000000); + + if (!os_file_lock(file.m_file, name)) { + *success = TRUE; + return(file); + } + } + + ib_logf(IB_LOG_LEVEL_INFO, + "Unable to open the first data file"); + } + + *success = FALSE; + close(file.m_file); + file.m_file = -1; + } +#endif /* USE_FILE_LOCK */ + + if (srv_use_atomic_writes && type == OS_DATA_FILE + && file.m_file != -1 + && !os_file_set_atomic_writes(name, file.m_file)) { + + *success = FALSE; + close(file.m_file); + file.m_file = -1; + } + +#endif /* __WIN__ */ + + return(file); +} + +/***********************************************************************//** +Deletes a file if it exists. The file has to be closed before calling this. 
+@return TRUE if success */ +UNIV_INTERN +bool +os_file_delete_if_exists_func( +/*==========================*/ + const char* name) /*!< in: file path as a null-terminated + string */ +{ +#ifdef __WIN__ + bool ret; + ulint count = 0; +loop: + /* In Windows, deleting an .ibd file may fail if mysqlbackup is copying + it */ + + ret = DeleteFile((LPCTSTR) name); + + if (ret) { + return(true); + } + + DWORD lasterr = GetLastError(); + if (lasterr == ERROR_FILE_NOT_FOUND + || lasterr == ERROR_PATH_NOT_FOUND) { + /* the file does not exist, this not an error */ + + return(true); + } + + count++; + + if (count > 100 && 0 == (count % 10)) { + os_file_get_last_error(true); /* print error information */ + + ib_logf(IB_LOG_LEVEL_WARN, "Delete of file %s failed.", name); + } + + os_thread_sleep(500000); /* sleep for 0.5 second */ + + if (count > 2000) { + + return(false); + } + + goto loop; +#else + int ret; + WAIT_ALLOW_WRITES(); + + ret = unlink(name); + + if (ret != 0 && errno != ENOENT) { + os_file_handle_error_no_exit(name, "delete", FALSE); + + return(false); + } + + return(true); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Deletes a file. The file has to be closed before calling this. 
+@return TRUE if success */ +UNIV_INTERN +bool +os_file_delete_func( +/*================*/ + const char* name) /*!< in: file path as a null-terminated + string */ +{ +#ifdef __WIN__ + BOOL ret; + ulint count = 0; +loop: + /* In Windows, deleting an .ibd file may fail if mysqlbackup is copying + it */ + + ret = DeleteFile((LPCTSTR) name); + + if (ret) { + return(true); + } + + if (GetLastError() == ERROR_FILE_NOT_FOUND) { + /* If the file does not exist, we classify this as a 'mild' + error and return */ + + return(false); + } + + count++; + + if (count > 100 && 0 == (count % 10)) { + os_file_get_last_error(true); /* print error information */ + + fprintf(stderr, + "InnoDB: Warning: cannot delete file %s\n" + "InnoDB: Are you running mysqlbackup" + " to back up the file?\n", name); + } + + os_thread_sleep(1000000); /* sleep for a second */ + + if (count > 2000) { + + return(false); + } + + goto loop; +#else + int ret; + WAIT_ALLOW_WRITES(); + + ret = unlink(name); + + if (ret != 0) { + os_file_handle_error_no_exit(name, "delete", FALSE); + + return(false); + } + + return(true); +#endif +} + +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_rename(), not directly this function! +Renames a file (can also move it to another directory). It is safest that the +file is closed before calling this function. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_rename_func( +/*================*/ + const char* oldpath,/*!< in: old file path as a null-terminated + string */ + const char* newpath)/*!< in: new file path */ +{ +#ifdef UNIV_DEBUG + os_file_type_t type; + ibool exists; + + /* New path must not exist. */ + ut_ad(os_file_status(newpath, &exists, &type)); + ut_ad(!exists); + + /* Old path must exist. 
*/ + ut_ad(os_file_status(oldpath, &exists, &type)); + ut_ad(exists); +#endif /* UNIV_DEBUG */ + +#ifdef __WIN__ + BOOL ret; + + ret = MoveFileEx((LPCTSTR)oldpath, (LPCTSTR)newpath, MOVEFILE_REPLACE_EXISTING); + + if (ret) { + return(TRUE); + } + + os_file_handle_error_no_exit(oldpath, "rename", FALSE); + + return(FALSE); +#else + int ret; + WAIT_ALLOW_WRITES(); + + ret = rename(oldpath, newpath); + + if (ret != 0) { + os_file_handle_error_no_exit(oldpath, "rename", FALSE); + + return(FALSE); + } + + return(TRUE); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_close(), not directly this function! +Closes a file handle. In case of error, error number can be retrieved with +os_file_get_last_error. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_close_func( +/*===============*/ + os_file_t file) /*!< in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ret = CloseHandle(file); + + if (ret) { + return(TRUE); + } + + os_file_handle_error(NULL, "close"); + + return(FALSE); +#else + int ret; + + ret = close(file); + + if (ret == -1) { + os_file_handle_error(NULL, "close"); + + return(FALSE); + } + + return(TRUE); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Closes a file handle. +@return TRUE if success */ +UNIV_INTERN +bool +os_file_close_no_error_handling_func( +/*============================*/ + os_file_t file) /*!< in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ret = CloseHandle(file); + + if (ret) { + return(true); + } + + return(false); +#else + int ret; + + ret = close(file); + + if (ret == -1) { + + return(false); + } + + return(true); +#endif /* __WIN__ */ +} + +#ifdef HAVE_POSIX_FALLOCATE +/***********************************************************************//** +Ensures that disk space is allocated for the file. 
+@return TRUE if success */ +UNIV_INTERN +bool +os_file_allocate_func( + os_file_t file, /*!< in, own: handle to a file */ + os_offset_t offset, /*!< in: file region offset */ + os_offset_t len) /*!< in: file region length */ +{ + return(posix_fallocate(file, offset, len) == 0); +} +#endif + +/***********************************************************************//** +Checks if the file is marked as invalid. +@return TRUE if invalid */ +UNIV_INTERN +bool +os_file_is_invalid( + pfs_os_file_t file) /*!< in, own: handle to a file */ +{ + return(file.m_file == os_file_invalid); +} + +/***********************************************************************//** +Marks the file as invalid. */ +UNIV_INTERN +void +os_file_mark_invalid( + pfs_os_file_t* file) /*!< out: pointer to a handle to a file */ +{ + file->m_file = os_file_invalid; +} + +/***********************************************************************//** +Announces an intention to access file data in a specific pattern in the +future. 
+@return TRUE if success */ +UNIV_INTERN +bool +os_file_advise( + pfs_os_file_t file, /*!< in, own: handle to a file */ + os_offset_t offset, /*!< in: file region offset */ + os_offset_t len, /*!< in: file region length */ + ulint advice)/*!< in: advice for access pattern */ +{ +#ifdef __WIN__ + return(true); +#else +#ifdef UNIV_LINUX + int native_advice = 0; + if ((advice & OS_FILE_ADVISE_NORMAL) != 0) + native_advice |= POSIX_FADV_NORMAL; + if ((advice & OS_FILE_ADVISE_RANDOM) != 0) + native_advice |= POSIX_FADV_RANDOM; + if ((advice & OS_FILE_ADVISE_SEQUENTIAL) != 0) + native_advice |= POSIX_FADV_SEQUENTIAL; + if ((advice & OS_FILE_ADVISE_WILLNEED) != 0) + native_advice |= POSIX_FADV_WILLNEED; + if ((advice & OS_FILE_ADVISE_DONTNEED) != 0) + native_advice |= POSIX_FADV_DONTNEED; + if ((advice & OS_FILE_ADVISE_NOREUSE) != 0) + native_advice |= POSIX_FADV_NOREUSE; + + return(posix_fadvise(file.m_file, offset, len, native_advice) == 0); +#else + return(true); +#endif +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Gets a file size. +@return file size, or (os_offset_t) -1 on failure */ +UNIV_INTERN +os_offset_t +os_file_get_size( +/*=============*/ + pfs_os_file_t file) /*!< in: handle to a file */ +{ +#ifdef __WIN__ + os_offset_t offset; + DWORD high; + DWORD low; + + low = GetFileSize(file.m_file, &high); + + if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) { + return((os_offset_t) -1); + } + + offset = (os_offset_t) low | ((os_offset_t) high << 32); + + return(offset); +#else + return((os_offset_t) lseek(file.m_file, 0, SEEK_END)); + +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Write the specified number of zeros to a newly created file. 
+@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_size( +/*=============*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + pfs_os_file_t file, /*!< in: handle to a file */ + os_offset_t size) /*!< in: file size */ +{ + ibool ret; + byte* buf; + byte* buf2; + ulint buf_size; + +#ifdef HAVE_POSIX_FALLOCATE + if (srv_use_posix_fallocate) { + int err; + do { + err = posix_fallocate(file.m_file, 0, size); + } while (err == EINTR + && srv_shutdown_state == SRV_SHUTDOWN_NONE); + + if (err) { + ib_logf(IB_LOG_LEVEL_ERROR, + "preallocating " INT64PF " bytes for" + "file %s failed with error %d", + size, name, err); + } + return(!err); + } +#endif + +#ifdef _WIN32 + /* Write 1 page of zeroes at the desired end. */ + buf_size = UNIV_PAGE_SIZE; + os_offset_t current_size = size - buf_size; +#else + /* Write up to 1 megabyte at a time. */ + buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE)) + * UNIV_PAGE_SIZE; + os_offset_t current_size = 0; +#endif + buf2 = static_cast<byte*>(calloc(1, buf_size + UNIV_PAGE_SIZE)); + + if (!buf2) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot allocate " ULINTPF " bytes to extend file\n", + buf_size + UNIV_PAGE_SIZE); + return(FALSE); + } + + /* Align the buffer for possible raw i/o */ + buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE)); + + do { + ulint n_bytes; + + if (size - current_size < (os_offset_t) buf_size) { + n_bytes = (ulint) (size - current_size); + } else { + n_bytes = buf_size; + } + + ret = os_file_write(name, file, buf, current_size, n_bytes); + if (!ret) { + break; + } + + current_size += n_bytes; + } while (current_size < size); + + free(buf2); + + return(ret && os_file_flush(file)); +} + +/***********************************************************************//** +Truncates a file at its current position. 
+@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_eof( +/*============*/ + FILE* file) /*!< in: file to be truncated */ +{ +#ifdef __WIN__ + HANDLE h = (HANDLE) _get_osfhandle(fileno(file)); + return(SetEndOfFile(h)); +#else /* __WIN__ */ + WAIT_ALLOW_WRITES(); + return(!ftruncate(fileno(file), ftell(file))); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Truncates a file at the specified position. +@return TRUE if success */ +UNIV_INTERN +bool +os_file_set_eof_at_func( + os_file_t file, /*!< in: handle to a file */ + ib_uint64_t new_len)/*!< in: new file length */ +{ +#ifdef __WIN__ + LARGE_INTEGER li, li2; + li.QuadPart = new_len; + return(SetFilePointerEx(file, li, &li2,FILE_BEGIN) + && SetEndOfFile(file)); +#else + WAIT_ALLOW_WRITES(); + /* TODO: works only with -D_FILE_OFFSET_BITS=64 ? */ + return(!ftruncate(file, new_len)); +#endif +} + + +#ifndef __WIN__ +/***********************************************************************//** +Wrapper to fsync(2) that retries the call on some errors. +Returns the value 0 if successful; otherwise the value -1 is returned and +the global variable errno is set to indicate the error. 
+@return 0 if success, -1 otherwise */ + +static +int +os_file_fsync( +/*==========*/ + os_file_t file) /*!< in: handle to a file */ +{ + int ret; + int failures; + ibool retry; + + failures = 0; + + do { + ret = fsync(file); + + os_n_fsyncs++; + + if (ret == -1 && errno == ENOLCK) { + + if (failures % 100 == 0) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: fsync(): " + "No locks available; retrying\n"); + } + + os_thread_sleep(200000 /* 0.2 sec */); + + failures++; + + retry = TRUE; + } else if (ret == -1 && errno == EINTR) { + /* Handle signal interruptions correctly */ + retry = TRUE; + } else { + + retry = FALSE; + } + } while (retry); + + return(ret); +} +#endif /* !__WIN__ */ + +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_flush(), not directly this function! +Flushes the write buffers of a given file to the disk. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_flush_func( +/*===============*/ + os_file_t file) /*!< in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + os_n_fsyncs++; + + ret = FlushFileBuffers(file); + + if (ret) { + return(TRUE); + } + + /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is + actually a raw device, we choose to ignore that error if we are using + raw disks */ + + if (srv_start_raw_disk_in_use && GetLastError() + == ERROR_INVALID_FUNCTION) { + return(TRUE); + } + + os_file_handle_error(NULL, "flush"); + + /* It is a fatal error if a file flush does not succeed, because then + the database can get corrupt on disk */ + ut_error; + + return(FALSE); +#else + int ret; + WAIT_ALLOW_WRITES(); + +#if defined(HAVE_DARWIN_THREADS) +# ifndef F_FULLFSYNC + /* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */ +# define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */ +# elif F_FULLFSYNC != 51 +# error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3" +# endif + /* Apple has 
disabled fsync() for internal disk drives in OS X. That + caused corruption for a user when he tested a power outage. Let us in + OS X use a nonstandard flush method recommended by an Apple + engineer. */ + + if (!srv_have_fullfsync) { + /* If we are not on an operating system that supports this, + then fall back to a plain fsync. */ + + ret = os_file_fsync(file); + } else { + ret = fcntl(file, F_FULLFSYNC, NULL); + + if (ret) { + /* If we are not on a file system that supports this, + then fall back to a plain fsync. */ + ret = os_file_fsync(file); + } + } +#else + ret = os_file_fsync(file); +#endif + + if (ret == 0) { + return(TRUE); + } + + /* Since Linux returns EINVAL if the 'file' is actually a raw device, + we choose to ignore that error if we are using raw disks */ + + if (srv_start_raw_disk_in_use && errno == EINVAL) { + + return(TRUE); + } + + ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed"); + + os_file_handle_error(NULL, "flush"); + + /* It is a fatal error if a file flush does not succeed, because then + the database can get corrupt on disk */ + ut_error; + + return(FALSE); +#endif +} + +#ifndef __WIN__ +/*******************************************************************//** +Does a synchronous read operation in Posix. 
+@return number of bytes read, -1 if error */ +static MY_ATTRIBUTE((nonnull(2), warn_unused_result)) +ssize_t +os_file_pread( +/*==========*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + ulint n, /*!< in: number of bytes to read */ + os_offset_t offset, /*!< in: file offset from where to read */ + trx_t* trx) +{ + off_t offs; + ulint sec; + ulint ms; + ib_uint64_t start_time; + ib_uint64_t finish_time; + + ut_ad(n); + + /* If off_t is > 4 bytes in size, then we assume we can pass a + 64-bit address */ + offs = (off_t) offset; + + if (sizeof(off_t) <= 4) { + if (offset != (os_offset_t) offs) { + ib_logf(IB_LOG_LEVEL_ERROR, + "File read at offset > 4 GB"); + } + } + + os_n_file_reads++; + + if (UNIV_UNLIKELY(trx && trx->take_stats)) + { + trx->io_reads++; + trx->io_read += n; + ut_usectime(&sec, &ms); + start_time = (ib_uint64_t)sec * 1000000 + ms; + } else { + start_time = 0; + } + + const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS); +#ifdef HAVE_PREAD + MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor); + + ssize_t n_bytes; + + /* Handle partial reads and signal interruptions correctly */ + for (n_bytes = 0; n_bytes < (ssize_t) n; ) { + ssize_t n_read = pread(file, buf, (ssize_t)n - n_bytes, offs); + if (n_read > 0) { + n_bytes += n_read; + offs += n_read; + buf = (char *)buf + n_read; + } else if (n_read == -1 && errno == EINTR) { + continue; + } else { + break; + } + } + + MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor); + + if (UNIV_UNLIKELY(start_time != 0)) + { + ut_usectime(&sec, &ms); + finish_time = (ib_uint64_t)sec * 1000000 + ms; + trx->io_reads_wait_timer += (ulint)(finish_time - start_time); + } + + return(n_bytes); +#else + { + off_t ret_offset; + ssize_t ret; + ssize_t n_read; +#ifndef UNIV_HOTBACKUP + ulint i; +#endif /* !UNIV_HOTBACKUP */ + + MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor); +#ifndef UNIV_HOTBACKUP + /* Protect the seek / read operation with a 
mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); +#endif /* !UNIV_HOTBACKUP */ + + ret_offset = lseek(file, offs, SEEK_SET); + + if (ret_offset < 0) { + ret = -1; + } else { + /* Handle signal interruptions correctly */ + for (ret = 0; ret < (ssize_t) n; ) { + n_read = read(file, buf, (ssize_t)n); + if (n_read > 0) { + ret += n_read; + } else if (n_read == -1 && errno == EINTR) { + continue; + } else { + break; + } + } + } + +#ifndef UNIV_HOTBACKUP + os_mutex_exit(os_file_seek_mutexes[i]); +#endif /* !UNIV_HOTBACKUP */ + + MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor); + + if (UNIV_UNLIKELY(start_time != 0) + { + ut_usectime(&sec, &ms); + finish_time = (ib_uint64_t)sec * 1000000 + ms; + trx->io_reads_wait_timer += (ulint)(finish_time - start_time); + } + + return(ret); + } +#endif +} + +/*******************************************************************//** +Does a synchronous write operation in Posix. +@return number of bytes written, -1 if error */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +ssize_t +os_file_pwrite( +/*===========*/ + os_file_t file, /*!< in: handle to a file */ + const void* buf, /*!< in: buffer from where to write */ + ulint n, /*!< in: number of bytes to write */ + os_offset_t offset) /*!< in: file offset where to write */ +{ + ssize_t ret; + ssize_t n_written; + off_t offs; + + ut_ad(n); + ut_ad(!srv_read_only_mode); + + /* If off_t is > 4 bytes in size, then we assume we can pass a + 64-bit address */ + offs = (off_t) offset; + + if (sizeof(off_t) <= 4) { + if (offset != (os_offset_t) offs) { + ib_logf(IB_LOG_LEVEL_ERROR, + "File write at offset > 4 GB."); + } + } + + os_n_file_writes++; + + const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES); +#ifdef HAVE_PWRITE + MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor); + + /* Handle partial writes and signal interruptions correctly */ + for (ret = 0; ret < (ssize_t) n; ) { + n_written = pwrite(file, buf, 
(ssize_t)n - ret, offs); + DBUG_EXECUTE_IF("xb_simulate_all_o_direct_write_failure", + n_written = -1; + errno = EINVAL;); + if (n_written >= 0) { + ret += n_written; + offs += n_written; + buf = (char *)buf + n_written; + } else if (n_written == -1 && errno == EINTR) { + continue; + } else { + break; + } + } + + MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor); + + return(ret); +#else + { + off_t ret_offset; +# ifndef UNIV_HOTBACKUP + ulint i; +# endif /* !UNIV_HOTBACKUP */ + + MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor); + +# ifndef UNIV_HOTBACKUP + /* Protect the seek / write operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); +# endif /* UNIV_HOTBACKUP */ + + ret_offset = lseek(file, offs, SEEK_SET); + + if (ret_offset < 0) { + ret = -1; + + goto func_exit; + } + + /* Handle signal interruptions correctly */ + for (ret = 0; ret < (ssize_t) n; ) { + n_written = write(file, buf, (ssize_t)n); + if (n_written > 0) { + ret += n_written; + } else if (n_written == -1 && errno == EINTR) { + continue; + } else { + break; + } + } + +func_exit: +# ifndef UNIV_HOTBACKUP + os_mutex_exit(os_file_seek_mutexes[i]); +# endif /* !UNIV_HOTBACKUP */ + + MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor); + return(ret); + } +#endif /* HAVE_PWRITE */ +} +#endif + +/*******************************************************************//** +NOTE! Use the corresponding macro os_file_read(), not directly this +function! +Requests a synchronous positioned read operation. 
+@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +os_file_read_func( +/*==============*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n, /*!< in: number of bytes to read */ + trx_t* trx) +{ +#ifdef __WIN__ + BOOL ret; + DWORD len; + ibool retry; + OVERLAPPED overlapped; + + + /* On 64-bit Windows, ulint is 64 bits. But offset and n should be + no more than 32 bits. */ + ut_a((n & 0xFFFFFFFFUL) == n); + + os_n_file_reads++; + os_bytes_read_since_printout += n; + const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS); + +try_again: + ut_ad(buf); + ut_ad(n > 0); + + MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor); + + memset (&overlapped, 0, sizeof (overlapped)); + overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF); + overlapped.OffsetHigh = (DWORD)(offset >> 32); + overlapped.hEvent = win_get_syncio_event(); + ret = ReadFile(file, buf, n, NULL, &overlapped); + if (ret) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE); + } + else if(GetLastError() == ERROR_IO_PENDING) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE); + } + MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor); + + if (ret && len == n) { + return(TRUE); + } +#else /* __WIN__ */ + ibool retry; + ssize_t ret; + + os_bytes_read_since_printout += n; + +try_again: + ret = os_file_pread(file, buf, n, offset, trx); + + DBUG_EXECUTE_IF("xb_simulate_all_o_direct_read_failure", + ret = -1; + errno = EINVAL;); + + if ((ulint) ret == n) { + return(TRUE); + } else if (ret == -1) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Error in system call pread(). The operating" + " system error number is %lu.",(ulint) errno); + } else { + /* Partial read occurred */ + ib_logf(IB_LOG_LEVEL_ERROR, + "Tried to read " ULINTPF " bytes at offset " + UINT64PF ". 
Was only able to read %ld.", + n, offset, (lint) ret); + } +#endif /* __WIN__ */ + retry = os_file_handle_error(NULL, "read"); + + if (retry) { + goto try_again; + } + + fprintf(stderr, + "InnoDB: Fatal error: cannot read from file." + " OS error number %lu.\n", +#ifdef __WIN__ + (ulong) GetLastError() +#else + (ulong) errno +#endif /* __WIN__ */ + ); + fflush(stderr); + + ut_error; + + return(FALSE); +} + +/*******************************************************************//** +NOTE! Use the corresponding macro os_file_read_no_error_handling(), +not directly this function! +Requests a synchronous positioned read operation. This function does not do +any error handling. In case of error it returns FALSE. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +os_file_read_no_error_handling_func( +/*================================*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n) /*!< in: number of bytes to read */ +{ +#ifdef __WIN__ + BOOL ret; + DWORD len; + ibool retry; + OVERLAPPED overlapped; + overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF); + overlapped.OffsetHigh = (DWORD)(offset >> 32); + + + /* On 64-bit Windows, ulint is 64 bits. But offset and n should be + no more than 32 bits. 
*/ + ut_a((n & 0xFFFFFFFFUL) == n); + + os_n_file_reads++; + os_bytes_read_since_printout += n; + const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_READS); + +try_again: + ut_ad(buf); + ut_ad(n > 0); + + MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_READS, monitor); + + memset (&overlapped, 0, sizeof (overlapped)); + overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF); + overlapped.OffsetHigh = (DWORD)(offset >> 32); + overlapped.hEvent = win_get_syncio_event(); + ret = ReadFile(file, buf, n, NULL, &overlapped); + if (ret) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE); + } + else if(GetLastError() == ERROR_IO_PENDING) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE); + } + MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_READS, monitor); + + if (ret && len == n) { + return(TRUE); + } +#else /* __WIN__ */ + ibool retry; + ssize_t ret; + + os_bytes_read_since_printout += n; + +try_again: + ret = os_file_pread(file, buf, n, offset, NULL); + + if ((ulint) ret == n) { + return(TRUE); + } else if (ret == -1) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Error in system call pread(). The operating" + " system error number is %lu.",(ulint) errno); + } else { + /* Partial read occurred */ + ib_logf(IB_LOG_LEVEL_ERROR, + "Tried to read " ULINTPF " bytes at offset " + UINT64PF ". Was only able to read %ld.", + n, offset, (lint) ret); + } +#endif /* __WIN__ */ + retry = os_file_handle_error_no_exit(NULL, "read", FALSE); + + if (retry) { + goto try_again; + } + + return(FALSE); +} + +/*******************************************************************//** +Rewind file to its start, read at most size - 1 bytes from it to str, and +NUL-terminate str. All errors are silently ignored. This function is +mostly meant to be used with temporary files. 
*/ +UNIV_INTERN +void +os_file_read_string( +/*================*/ + FILE* file, /*!< in: file to read from */ + char* str, /*!< in: buffer where to read */ + ulint size) /*!< in: size of buffer */ +{ + size_t flen; + + if (size == 0) { + return; + } + + rewind(file); + flen = fread(str, 1, size - 1, file); + str[flen] = '\0'; +} + +/*******************************************************************//** +NOTE! Use the corresponding macro os_file_write(), not directly +this function! +Requests a synchronous write operation. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +os_file_write_func( +/*===============*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + const void* buf, /*!< in: buffer from which to write */ + os_offset_t offset, /*!< in: file offset where to write */ + ulint n) /*!< in: number of bytes to write */ +{ + ut_ad(!srv_read_only_mode); + +#ifdef __WIN__ + BOOL ret; + DWORD len; + ulint n_retries = 0; + ulint err; + OVERLAPPED overlapped; + DWORD saved_error = 0; + + /* On 64-bit Windows, ulint is 64 bits. But offset and n should be + no more than 32 bits. 
*/ + ut_a((n & 0xFFFFFFFFUL) == n); + + os_n_file_writes++; + + ut_ad(buf); + ut_ad(n > 0); + const bool monitor = MONITOR_IS_ON(MONITOR_OS_PENDING_WRITES); +retry: + + MONITOR_ATOMIC_INC_LOW(MONITOR_OS_PENDING_WRITES, monitor); + + memset (&overlapped, 0, sizeof (overlapped)); + overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF); + overlapped.OffsetHigh = (DWORD)(offset >> 32); + + overlapped.hEvent = win_get_syncio_event(); + ret = WriteFile(file, buf, n, NULL, &overlapped); + if (ret) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE); + } + else if ( GetLastError() == ERROR_IO_PENDING) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE); + } + + MONITOR_ATOMIC_DEC_LOW(MONITOR_OS_PENDING_WRITES, monitor); + + if (ret && len == n) { + + return(TRUE); + } + + /* If some background file system backup tool is running, then, at + least in Windows 2000, we may get here a specific error. Let us + retry the operation 100 times, with 1 second waits. */ + + if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) { + + os_thread_sleep(1000000); + + n_retries++; + + goto retry; + } + + if (!os_has_said_disk_full) { + char *winmsg = NULL; + + saved_error = GetLastError(); + err = (ulint) saved_error; + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: Write to file %s failed" + " at offset %llu.\n" + "InnoDB: %lu bytes should have been written," + " only %lu were written.\n" + "InnoDB: Operating system error number %lu.\n" + "InnoDB: Check that your OS and file system" + " support files of this size.\n" + "InnoDB: Check also that the disk is not full" + " or a disk quota exceeded.\n", + name, offset, + (ulong) n, (ulong) len, (ulong) err); + + /* Ask Windows to prepare a standard message for a + GetLastError() */ + + FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, saved_error, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPSTR)&winmsg, 0, 
NULL); + + if (winmsg) { + fprintf(stderr, + "InnoDB: FormatMessage: Error number %lu means '%s'.\n", + (ulong) saved_error, winmsg); + LocalFree(winmsg); + } + + if (strerror((int) err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %lu means '%s'.\n", + (ulong) err, strerror((int) err)); + } + + fprintf(stderr, + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + REFMAN "operating-system-error-codes.html\n"); + + os_has_said_disk_full = TRUE; + } + + return(FALSE); +#else + ssize_t ret; + WAIT_ALLOW_WRITES(); + + ret = os_file_pwrite(file, buf, n, offset); + + if ((ulint) ret == n) { + + return(TRUE); + } + + if (!os_has_said_disk_full) { + + ut_print_timestamp(stderr); + + if(ret == -1) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Failure of system call pwrite(). Operating" + " system error number is %lu.", + (ulint) errno); + } else { + fprintf(stderr, + " InnoDB: Error: Write to file %s failed" + " at offset " UINT64PF ".\n" + "InnoDB: %lu bytes should have been written," + " only %ld were written.\n" + "InnoDB: Operating system error number %lu.\n" + "InnoDB: Check that your OS and file system" + " support files of this size.\n" + "InnoDB: Check also that the disk is not full" + " or a disk quota exceeded.\n", + name, offset, n, (lint) ret, + (ulint) errno); + } + + if (strerror(errno) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d means '%s'.\n", + errno, strerror(errno)); + } + + fprintf(stderr, + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + REFMAN "operating-system-error-codes.html\n"); + + os_diagnose_all_o_direct_einval(errno); + + os_has_said_disk_full = TRUE; + } + + return(FALSE); +#endif +} + +/*******************************************************************//** +Check the existence and type of the given file. 
+@return TRUE if call succeeded */ +UNIV_INTERN +ibool +os_file_status( +/*===========*/ + const char* path, /*!< in: pathname of the file */ + ibool* exists, /*!< out: TRUE if file exists */ + os_file_type_t* type) /*!< out: type of the file (if it exists) */ +{ +#ifdef __WIN__ + int ret; + struct _stat64 statinfo; + + ret = _stat64(path, &statinfo); + if (ret && (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG)) { + /* file does not exist */ + *exists = FALSE; + return(TRUE); + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat", FALSE); + + return(FALSE); + } + + if (_S_IFDIR & statinfo.st_mode) { + *type = OS_FILE_TYPE_DIR; + } else if (_S_IFREG & statinfo.st_mode) { + *type = OS_FILE_TYPE_FILE; + } else { + *type = OS_FILE_TYPE_UNKNOWN; + } + + *exists = TRUE; + + return(TRUE); +#else + int ret; + struct stat statinfo; + + ret = stat(path, &statinfo); + if (ret && (errno == ENOENT || errno == ENOTDIR || errno == ENAMETOOLONG)) { + /* file does not exist */ + *exists = FALSE; + return(TRUE); + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat", FALSE); + + return(FALSE); + } + + if (S_ISDIR(statinfo.st_mode)) { + *type = OS_FILE_TYPE_DIR; + } else if (S_ISLNK(statinfo.st_mode)) { + *type = OS_FILE_TYPE_LINK; + } else if (S_ISREG(statinfo.st_mode)) { + *type = OS_FILE_TYPE_FILE; + } else { + *type = OS_FILE_TYPE_UNKNOWN; + } + + *exists = TRUE; + + return(TRUE); +#endif +} + +/*******************************************************************//** +This function returns information about the specified file +@return DB_SUCCESS if all OK */ +UNIV_INTERN +dberr_t +os_file_get_status( +/*===============*/ + const char* path, /*!< in: pathname of the file */ + os_file_stat_t* stat_info, /*!< information of a file in a + directory */ + bool check_rw_perm) /*!< in: for testing whether the + file can be opened in RW mode */ +{ + int ret; + +#ifdef 
__WIN__ + struct _stat64 statinfo; + + ret = _stat64(path, &statinfo); + + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + + return(DB_NOT_FOUND); + + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat", FALSE); + + return(DB_FAIL); + + } else if (_S_IFDIR & statinfo.st_mode) { + stat_info->type = OS_FILE_TYPE_DIR; + } else if (_S_IFREG & statinfo.st_mode) { + + DWORD access = GENERIC_READ; + + if (!srv_read_only_mode) { + access |= GENERIC_WRITE; + } + + stat_info->type = OS_FILE_TYPE_FILE; + + /* Check if we can open it in read-only mode. */ + + if (check_rw_perm) { + HANDLE fh; + + fh = CreateFile( + (LPCTSTR) path, // File to open + access, + 0, // No sharing + NULL, // Default security + OPEN_EXISTING, // Existing file only + FILE_ATTRIBUTE_NORMAL, // Normal file + NULL); // No attr. template + + if (fh == INVALID_HANDLE_VALUE) { + stat_info->rw_perm = false; + } else { + stat_info->rw_perm = true; + CloseHandle(fh); + } + } + } else { + stat_info->type = OS_FILE_TYPE_UNKNOWN; + } +#else + struct stat statinfo; + + ret = stat(path, &statinfo); + + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + + return(DB_NOT_FOUND); + + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat", FALSE); + + return(DB_FAIL); + + } + + switch (statinfo.st_mode & S_IFMT) { + case S_IFDIR: + stat_info->type = OS_FILE_TYPE_DIR; + break; + case S_IFLNK: + stat_info->type = OS_FILE_TYPE_LINK; + break; + case S_IFBLK: + /* Handle block device as regular file. */ + case S_IFCHR: + /* Handle character device as regular file. */ + case S_IFREG: + stat_info->type = OS_FILE_TYPE_FILE; + break; + default: + stat_info->type = OS_FILE_TYPE_UNKNOWN; + } + + + if (check_rw_perm && stat_info->type == OS_FILE_TYPE_FILE) { + + int fh; + int access; + + access = !srv_read_only_mode ? 
O_RDWR : O_RDONLY; + - fh = ::open(path, access, os_innodb_umask); ++ fh = ::open(path, access | O_CLOEXEC, os_innodb_umask); + + if (fh == -1) { + stat_info->rw_perm = false; + } else { + stat_info->rw_perm = true; + close(fh); + } + } + +#endif /* _WIN_ */ + + stat_info->ctime = statinfo.st_ctime; + stat_info->atime = statinfo.st_atime; + stat_info->mtime = statinfo.st_mtime; + stat_info->size = statinfo.st_size; + + return(DB_SUCCESS); +} + +/* path name separator character */ +#ifdef __WIN__ +# define OS_FILE_PATH_SEPARATOR '\\' +#else +# define OS_FILE_PATH_SEPARATOR '/' +#endif + +/****************************************************************//** +This function returns a new path name after replacing the basename +in an old path with a new basename. The old_path is a full path +name including the extension. The tablename is in the normal +form "databasename/tablename". The new base name is found after +the forward slash. Both input strings are null terminated. + +This function allocates memory to be returned. It is the callers +responsibility to free the return value after it is no longer needed. + +@return own: new full pathname */ +UNIV_INTERN +char* +os_file_make_new_pathname( +/*======================*/ + const char* old_path, /*!< in: pathname */ + const char* tablename) /*!< in: contains new base name */ +{ + ulint dir_len; + char* last_slash; + char* base_name; + char* new_path; + ulint new_path_len; + + /* Split the tablename into its database and table name components. + They are separated by a '/'. */ + last_slash = strrchr((char*) tablename, '/'); + base_name = last_slash ? last_slash + 1 : (char*) tablename; + + /* Find the offset of the last slash. We will strip off the + old basename.ibd which starts after that slash. */ + last_slash = strrchr((char*) old_path, OS_FILE_PATH_SEPARATOR); + dir_len = last_slash ? last_slash - old_path : strlen(old_path); + + /* allocate a new path and move the old directory path to it. 
*/ + new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd"; + new_path = static_cast<char*>(mem_alloc(new_path_len)); + memcpy(new_path, old_path, dir_len); + + ut_snprintf(new_path + dir_len, + new_path_len - dir_len, + "%c%s.ibd", + OS_FILE_PATH_SEPARATOR, + base_name); + + return(new_path); +} + +/****************************************************************//** +This function returns a remote path name by combining a data directory +path provided in a DATA DIRECTORY clause with the tablename which is +in the form 'database/tablename'. It strips the file basename (which +is the tablename) found after the last directory in the path provided. +The full filepath created will include the database name as a directory +under the path provided. The filename is the tablename with the '.ibd' +extension. All input and output strings are null-terminated. + +This function allocates memory to be returned. It is the callers +responsibility to free the return value after it is no longer needed. + +@return own: A full pathname; data_dir_path/databasename/tablename.ibd */ +UNIV_INTERN +char* +os_file_make_remote_pathname( +/*=========================*/ + const char* data_dir_path, /*!< in: pathname */ + const char* tablename, /*!< in: tablename */ + const char* extention) /*!< in: file extention; ibd,cfg */ +{ + ulint data_dir_len; + char* last_slash; + char* new_path; + ulint new_path_len; + + ut_ad(extention && strlen(extention) == 3); + + /* Find the offset of the last slash. We will strip off the + old basename or tablename which starts after that slash. */ + last_slash = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR); + data_dir_len = last_slash ? last_slash - data_dir_path : strlen(data_dir_path); + + /* allocate a new path and move the old directory path to it. */ + new_path_len = data_dir_len + strlen(tablename) + + sizeof "/." 
+ strlen(extention); + new_path = static_cast<char*>(mem_alloc(new_path_len)); + memcpy(new_path, data_dir_path, data_dir_len); + ut_snprintf(new_path + data_dir_len, + new_path_len - data_dir_len, + "%c%s.%s", + OS_FILE_PATH_SEPARATOR, + tablename, + extention); + + srv_normalize_path_for_win(new_path); + + return(new_path); +} + +/****************************************************************//** +This function reduces a null-terminated full remote path name into +the path that is sent by MySQL for DATA DIRECTORY clause. It replaces +the 'databasename/tablename.ibd' found at the end of the path with just +'tablename'. + +Since the result is always smaller than the path sent in, no new memory +is allocated. The caller should allocate memory for the path sent in. +This function manipulates that path in place. + +If the path format is not as expected, just return. The result is used +to inform a SHOW CREATE TABLE command. */ +UNIV_INTERN +void +os_file_make_data_dir_path( +/*========================*/ + char* data_dir_path) /*!< in/out: full path/data_dir_path */ +{ + char* ptr; + char* tablename; + ulint tablename_len; + + /* Replace the period before the extension with a null byte. */ + ptr = strrchr((char*) data_dir_path, '.'); + if (!ptr) { + return; + } + ptr[0] = '\0'; + + /* The tablename starts after the last slash. */ + ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR); + if (!ptr) { + return; + } + ptr[0] = '\0'; + tablename = ptr + 1; + + /* The databasename starts after the next to last slash. */ + ptr = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR); + if (!ptr) { + return; + } + tablename_len = ut_strlen(tablename); + + ut_memmove(++ptr, tablename, tablename_len); + + ptr[tablename_len] = '\0'; +} + +/****************************************************************//** +The function os_file_dirname returns a directory component of a +null-terminated pathname string. 
In the usual case, dirname returns +the string up to, but not including, the final '/', and basename +is the component following the final '/'. Trailing '/' characters +are not counted as part of the pathname. + +If path does not contain a slash, dirname returns the string ".". + +Concatenating the string returned by dirname, a "/", and the basename +yields a complete pathname. + +The return value is a copy of the directory component of the pathname. +The copy is allocated from heap. It is the caller responsibility +to free it after it is no longer needed. + +The following list of examples (taken from SUSv2) shows the strings +returned by dirname and basename for different paths: + + path dirname basename + "/usr/lib" "/usr" "lib" + "/usr/" "/" "usr" + "usr" "." "usr" + "/" "/" "/" + "." "." "." + ".." "." ".." + +@return own: directory component of the pathname */ +UNIV_INTERN +char* +os_file_dirname( +/*============*/ + const char* path) /*!< in: pathname */ +{ + /* Find the offset of the last slash */ + const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR); + if (!last_slash) { + /* No slash in the path, return "." */ + + return(mem_strdup(".")); + } + + /* Ok, there is a slash */ + + if (last_slash == path) { + /* last slash is the first char of the path */ + + return(mem_strdup("/")); + } + + /* Non-trivial directory component */ + + return(mem_strdupl(path, last_slash - path)); +} + +/****************************************************************//** +Creates all missing subdirectories along the given path. +@return TRUE if call succeeded FALSE otherwise */ +UNIV_INTERN +ibool +os_file_create_subdirs_if_needed( +/*=============================*/ + const char* path) /*!< in: path name */ +{ + if (srv_read_only_mode) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "read only mode set. 
Can't create subdirectories '%s'", + path); + + return(FALSE); + + } + + char* subdir = os_file_dirname(path); + + if (strlen(subdir) == 1 + && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) { + /* subdir is root or cwd, nothing to do */ + mem_free(subdir); + + return(TRUE); + } + + /* Test if subdir exists */ + os_file_type_t type; + ibool subdir_exists; + ibool success = os_file_status(subdir, &subdir_exists, &type); + + if (success && !subdir_exists) { + + /* subdir does not exist, create it */ + success = os_file_create_subdirs_if_needed(subdir); + + if (!success) { + mem_free(subdir); + + return(FALSE); + } + + success = os_file_create_directory(subdir, FALSE); + } + + mem_free(subdir); + + return(success); +} + +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Returns a pointer to the nth slot in the aio array. +@return pointer to slot */ +static +os_aio_slot_t* +os_aio_array_get_nth_slot( +/*======================*/ + os_aio_array_t* array, /*!< in: aio array */ + ulint index) /*!< in: index of the slot */ +{ + ut_a(index < array->n_slots); + + return(&array->slots[index]); +} + +#if defined(LINUX_NATIVE_AIO) +/******************************************************************//** +Creates an io_context for native linux AIO. +@return TRUE on success. */ +static +ibool +os_aio_linux_create_io_ctx( +/*=======================*/ + ulint max_events, /*!< in: number of events. */ + io_context_t* io_ctx) /*!< out: io_ctx to initialize. */ +{ + int ret; + ulint retries = 0; + +retry: + memset(io_ctx, 0x0, sizeof(*io_ctx)); + + /* Initialize the io_ctx. Tell it how many pending + IO requests this context will handle. */ + + ret = io_setup(max_events, io_ctx); + if (ret == 0) { +#if defined(UNIV_AIO_DEBUG) + fprintf(stderr, + "InnoDB: Linux native AIO:" + " initialized io_ctx for segment\n"); +#endif + /* Success. Return now. */ + return(TRUE); + } + + /* If we hit EAGAIN we'll make a few attempts before failing. 
*/ + + switch (ret) { + case -EAGAIN: + if (retries == 0) { + /* First time around. */ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: io_setup() failed" + " with EAGAIN. Will make %d attempts" + " before giving up.\n", + OS_AIO_IO_SETUP_RETRY_ATTEMPTS); + } + + if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) { + ++retries; + fprintf(stderr, + "InnoDB: Warning: io_setup() attempt" + " %lu failed.\n", + retries); + os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP); + goto retry; + } + + /* Have tried enough. Better call it a day. */ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: io_setup() failed" + " with EAGAIN after %d attempts.\n", + OS_AIO_IO_SETUP_RETRY_ATTEMPTS); + break; + + case -ENOSYS: + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: Linux Native AIO interface" + " is not supported on this platform. Please" + " check your OS documentation and install" + " appropriate binary of InnoDB.\n"); + + break; + + default: + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: Linux Native AIO setup" + " returned following error[%d]\n", -ret); + break; + } + + fprintf(stderr, + "InnoDB: You can disable Linux Native AIO by" + " setting innodb_use_native_aio = 0 in my.cnf\n"); + return(FALSE); +} + +/******************************************************************//** +Checks if the system supports native linux aio. On some kernel +versions where native aio is supported it won't work on tmpfs. In such +cases we can't use native aio as it is not possible to mix simulated +and native aio. +@return: TRUE if supported, FALSE otherwise. */ +static +ibool +os_aio_native_aio_supported(void) +/*=============================*/ +{ + int fd; + io_context_t io_ctx; + char name[1000]; + + if (!os_aio_linux_create_io_ctx(1, &io_ctx)) { + /* The platform does not support native aio. */ + return(FALSE); + } else if (!srv_read_only_mode) { + /* Now check if tmpdir supports native aio ops. 
*/ + fd = innobase_mysql_tmpfile(NULL); + + if (fd < 0) { + ib_logf(IB_LOG_LEVEL_WARN, + "Unable to create temp file to check " + "native AIO support."); + + return(FALSE); + } + } else { + + srv_normalize_path_for_win(srv_log_group_home_dir); + + ulint dirnamelen = strlen(srv_log_group_home_dir); + ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile"); + memcpy(name, srv_log_group_home_dir, dirnamelen); + + /* Add a path separator if needed. */ + if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) { + name[dirnamelen++] = SRV_PATH_SEPARATOR; + } + + strcpy(name + dirnamelen, "ib_logfile0"); + - fd = ::open(name, O_RDONLY); ++ fd = ::open(name, O_RDONLY | O_CLOEXEC); + + if (fd == -1) { + + ib_logf(IB_LOG_LEVEL_WARN, + "Unable to open \"%s\" to check " + "native AIO read support.", name); + + return(FALSE); + } + } + + struct io_event io_event; + + memset(&io_event, 0x0, sizeof(io_event)); + + byte* buf = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2)); + byte* ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE)); + + struct iocb iocb; + + /* Suppress valgrind warning. */ + memset(buf, 0x00, UNIV_PAGE_SIZE * 2); + memset(&iocb, 0x0, sizeof(iocb)); + + struct iocb* p_iocb = &iocb; + + if (!srv_read_only_mode) { + io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0); + } else { + ut_a(UNIV_PAGE_SIZE >= 512); + io_prep_pread(p_iocb, fd, ptr, 512, 0); + } + + int err = io_submit(io_ctx, 1, &p_iocb); + + if (err >= 1) { + /* Now collect the submitted IO request. */ + err = io_getevents(io_ctx, 1, 1, &io_event, NULL); + } + + ut_free(buf); + close(fd); + + switch (err) { + case 1: + return(TRUE); + + case -EINVAL: + case -ENOSYS: + ib_logf(IB_LOG_LEVEL_ERROR, + "Linux Native AIO not supported. You can either " + "move %s to a file system that supports native " + "AIO or you can set innodb_use_native_aio to " + "FALSE to avoid this message.", + srv_read_only_mode ? name : "tmpdir"); + + /* fall through. 
*/ + default: + ib_logf(IB_LOG_LEVEL_ERROR, + "Linux Native AIO check on %s returned error[%d]", + srv_read_only_mode ? name : "tmpdir", -err); + } + + return(FALSE); +} +#endif /* LINUX_NATIVE_AIO */ + +/******************************************************************//** +Creates an aio wait array. Note that we return NULL in case of failure. +We don't care about freeing memory here because we assume that a +failure will result in server refusing to start up. +@return own: aio array, NULL on failure */ +static +os_aio_array_t* +os_aio_array_create( +/*================*/ + ulint n, /*!< in: maximum number of pending aio + operations allowed; n must be + divisible by n_segments */ + ulint n_segments) /*!< in: number of segments in the aio array */ +{ + os_aio_array_t* array; +#ifdef LINUX_NATIVE_AIO + struct io_event* io_event = NULL; +#endif + ut_a(n > 0); + ut_a(n_segments > 0); + + array = static_cast<os_aio_array_t*>(ut_malloc(sizeof(*array))); + memset(array, 0x0, sizeof(*array)); + + array->mutex = os_mutex_create(); + array->not_full = os_event_create(); + array->is_empty = os_event_create(); + + os_event_set(array->is_empty); + + array->n_slots = n; + array->n_segments = n_segments; + + array->slots = static_cast<os_aio_slot_t*>( + ut_malloc(n * sizeof(*array->slots))); + + memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots))); + +#if defined(LINUX_NATIVE_AIO) + array->aio_ctx = NULL; + array->aio_events = NULL; + + /* If we are not using native aio interface then skip this + part of initialization. */ + if (!srv_use_native_aio) { + goto skip_native_aio; + } + + /* Initialize the io_context array. One io_context + per segment in the array. */ + + array->aio_ctx = static_cast<io_context**>( + ut_malloc(n_segments * sizeof(*array->aio_ctx))); + + for (ulint i = 0; i < n_segments; ++i) { + if (!os_aio_linux_create_io_ctx(n/n_segments, + &array->aio_ctx[i])) { + /* If something bad happened during aio setup + we disable linux native aio. 
+ The disadvantage will be a small memory leak + at shutdown but that's ok compared to a crash + or a not working server. + This frequently happens when running the test suite + with many threads on a system with low fs.aio-max-nr! + */ + + fprintf(stderr, + " InnoDB: Warning: Linux Native AIO disabled " + "because os_aio_linux_create_io_ctx() " + "failed. To get rid of this warning you can " + "try increasing system " + "fs.aio-max-nr to 1048576 or larger or " + "setting innodb_use_native_aio = 0 in my.cnf\n"); + srv_use_native_aio = FALSE; + goto skip_native_aio; + } + } + + /* Initialize the event array. One event per slot. */ + io_event = static_cast<struct io_event*>( + ut_malloc(n * sizeof(*io_event))); + + memset(io_event, 0x0, sizeof(*io_event) * n); + array->aio_events = io_event; + + array->pending = static_cast<struct iocb**>( + ut_malloc(n * sizeof(struct iocb*))); + memset(array->pending, 0x0, sizeof(struct iocb*) * n); + array->count = static_cast<ulint*>( + ut_malloc(n_segments * sizeof(ulint))); + memset(array->count, 0x0, sizeof(ulint) * n_segments); + +skip_native_aio: +#endif /* LINUX_NATIVE_AIO */ + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, i); + slot->pos = i; + slot->reserved = FALSE; +#ifdef LINUX_NATIVE_AIO + memset(&slot->control, 0x0, sizeof(slot->control)); + slot->n_bytes = 0; + slot->ret = 0; +#endif /* WIN_ASYNC_IO */ + } + + return(array); +} + +/************************************************************************//** +Frees an aio wait array. 
*/ +static +void +os_aio_array_free( +/*==============*/ + os_aio_array_t*& array) /*!< in, own: array to free */ +{ + os_mutex_free(array->mutex); + os_event_free(array->not_full); + os_event_free(array->is_empty); + +#if defined(LINUX_NATIVE_AIO) + if (srv_use_native_aio) { + ut_free(array->aio_events); + ut_free(array->aio_ctx); + +#ifdef UNIV_DEBUG + for (size_t idx = 0; idx < array->n_slots; ++idx) + ut_ad(array->pending[idx] == NULL); + for (size_t idx = 0; idx < array->n_segments; ++idx) + ut_ad(array->count[idx] == 0); +#endif + + ut_free(array->pending); + ut_free(array->count); + } +#endif /* LINUX_NATIVE_AIO */ + + ut_free(array->slots); + ut_free(array); + + array = 0; +} + +/*********************************************************************** +Initializes the asynchronous io system. Creates one array each for ibuf +and log i/o. Also creates one array each for read and write where each +array is divided logically into n_read_segs and n_write_segs +respectively. The caller must create an i/o handler thread for each +segment in these arrays. This function also creates the sync array. +No i/o handler thread needs to be created for that */ +UNIV_INTERN +ibool +os_aio_init( +/*========*/ + ulint n_per_seg, /*<! in: maximum number of pending aio + operations allowed per segment */ + ulint n_read_segs, /*<! in: number of reader threads */ + ulint n_write_segs, /*<! in: number of writer threads */ + ulint n_slots_sync) /*<! 
in: number of slots in the sync aio + array */ +{ + os_io_init_simple(); + +#if defined(LINUX_NATIVE_AIO) + /* Check if native aio is supported on this system and tmpfs */ + if (srv_use_native_aio && !os_aio_native_aio_supported()) { + + ib_logf(IB_LOG_LEVEL_WARN, "Linux Native AIO disabled."); + + srv_use_native_aio = FALSE; + } +#endif /* LINUX_NATIVE_AIO */ + + srv_reset_io_thread_op_info(); + + os_aio_read_array = os_aio_array_create( + n_read_segs * n_per_seg, n_read_segs); + + if (os_aio_read_array == NULL) { + return(FALSE); + } + + ulint start = (srv_read_only_mode) ? 0 : 2; + ulint n_segs = n_read_segs + start; + + /* 0 is the ibuf segment and 1 is the insert buffer segment. */ + for (ulint i = start; i < n_segs; ++i) { + ut_a(i < SRV_MAX_N_IO_THREADS); + srv_io_thread_function[i] = "read thread"; + } + + ulint n_segments = n_read_segs; + + if (!srv_read_only_mode) { + + os_aio_log_array = os_aio_array_create(n_per_seg, 1); + + if (os_aio_log_array == NULL) { + return(FALSE); + } + + ++n_segments; + + srv_io_thread_function[1] = "log thread"; + + os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1); + + if (os_aio_ibuf_array == NULL) { + return(FALSE); + } + + ++n_segments; + + srv_io_thread_function[0] = "insert buffer thread"; + + os_aio_write_array = os_aio_array_create( + n_write_segs * n_per_seg, n_write_segs); + + if (os_aio_write_array == NULL) { + return(FALSE); + } + + n_segments += n_write_segs; + + for (ulint i = start + n_read_segs; i < n_segments; ++i) { + ut_a(i < SRV_MAX_N_IO_THREADS); + srv_io_thread_function[i] = "write thread"; + } + + ut_ad(n_segments >= 4); + } else { + ut_ad(n_segments > 0); + } + + os_aio_sync_array = os_aio_array_create(n_slots_sync, 1); + + if (os_aio_sync_array == NULL) { + return(FALSE); + } + + os_aio_n_segments = n_segments; + + os_aio_validate(); + + os_last_printout = ut_time(); + +#ifdef _WIN32 + ut_a(completion_port == 0 && read_completion_port == 0); + completion_port = 
CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0); + read_completion_port = srv_read_only_mode? completion_port : CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0); + ut_a(completion_port && read_completion_port); +#endif + + if (srv_use_native_aio) { + return(TRUE); + } + + os_aio_segment_wait_events = static_cast<os_event_t*>( + ut_malloc(n_segments * sizeof *os_aio_segment_wait_events)); + + for (ulint i = 0; i < n_segments; ++i) { + os_aio_segment_wait_events[i] = os_event_create(); + } + + return(TRUE); +} + +/*********************************************************************** +Frees the asynchronous io system. */ +UNIV_INTERN +void +os_aio_free(void) +/*=============*/ +{ + if (os_aio_ibuf_array != 0) { + os_aio_array_free(os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + os_aio_array_free(os_aio_log_array); + } + + if (os_aio_write_array != 0) { + os_aio_array_free(os_aio_write_array); + } + + if (os_aio_sync_array != 0) { + os_aio_array_free(os_aio_sync_array); + } + + os_aio_array_free(os_aio_read_array); + + if (!srv_use_native_aio) { + for (ulint i = 0; i < os_aio_n_segments; i++) { + os_event_free(os_aio_segment_wait_events[i]); + } + } + + for (ulint i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) { + os_mutex_free(os_file_seek_mutexes[i]); + } + + ut_free(os_aio_segment_wait_events); + os_aio_segment_wait_events = 0; + os_aio_n_segments = 0; +#ifdef _WIN32 + completion_port = 0; + read_completion_port = 0; +#endif +} + +#ifdef WIN_ASYNC_IO +/************************************************************************//** +Wakes up all async i/o threads in the array in Windows async i/o at +shutdown. 
*/ +static +void +os_aio_array_wake_win_aio_at_shutdown( +/*==================================*/ + os_aio_array_t* array) /*!< in: aio array */ +{ + if(completion_port) + { + PostQueuedCompletionStatus(completion_port, 0, IOCP_SHUTDOWN_KEY, NULL); + PostQueuedCompletionStatus(read_completion_port, 0, IOCP_SHUTDOWN_KEY, NULL); + } +} +#endif + +/************************************************************************//** +Wakes up all async i/o threads so that they know to exit themselves in +shutdown. */ +UNIV_INTERN +void +os_aio_wake_all_threads_at_shutdown(void) +/*=====================================*/ +{ +#ifdef WIN_ASYNC_IO + /* This code wakes up all ai/o threads in Windows native aio */ + os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array); + if (os_aio_write_array != 0) { + os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array); + } + + if (os_aio_ibuf_array != 0) { + os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array); + } +#elif defined(LINUX_NATIVE_AIO) + /* When using native AIO interface the io helper threads + wait on io_getevents with a timeout value of 500ms. At + each wake up these threads check the server status. + No need to do anything to wake them up. */ +#endif /* !WIN_ASYNC_AIO */ + + if (srv_use_native_aio) { + return; + } + + /* This loop wakes up all simulated ai/o threads */ + + for (ulint i = 0; i < os_aio_n_segments; i++) { + + os_event_set(os_aio_segment_wait_events[i]); + } +} + +/************************************************************************//** +Waits until there are no pending writes in os_aio_write_array. There can +be other, synchronous, pending writes. 
*/ +UNIV_INTERN +void +os_aio_wait_until_no_pending_writes(void) +/*=====================================*/ +{ + ut_ad(!srv_read_only_mode); + os_event_wait(os_aio_write_array->is_empty); +} + +/**********************************************************************//** +Calculates segment number for a slot. +@return segment number (which is the number used by, for example, +i/o-handler threads) */ +static +ulint +os_aio_get_segment_no_from_slot( +/*============================*/ + os_aio_array_t* array, /*!< in: aio wait array */ + os_aio_slot_t* slot) /*!< in: slot in this array */ +{ + ulint segment; + ulint seg_len; + + if (array == os_aio_ibuf_array) { + ut_ad(!srv_read_only_mode); + + segment = IO_IBUF_SEGMENT; + + } else if (array == os_aio_log_array) { + ut_ad(!srv_read_only_mode); + + segment = IO_LOG_SEGMENT; + + } else if (array == os_aio_read_array) { + seg_len = os_aio_read_array->n_slots + / os_aio_read_array->n_segments; + + segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len; + } else { + ut_ad(!srv_read_only_mode); + ut_a(array == os_aio_write_array); + + seg_len = os_aio_write_array->n_slots + / os_aio_write_array->n_segments; + + segment = os_aio_read_array->n_segments + 2 + + slot->pos / seg_len; + } + + return(segment); +} + +/**********************************************************************//** +Calculates local segment number and aio array from global segment number. 
+@return local segment number within the aio array */ +static +ulint +os_aio_get_array_and_local_segment( +/*===============================*/ + os_aio_array_t** array, /*!< out: aio wait array */ + ulint global_segment)/*!< in: global segment number */ +{ + ulint segment; + + ut_a(global_segment < os_aio_n_segments); + + if (srv_read_only_mode) { + *array = os_aio_read_array; + + return(global_segment); + } else if (global_segment == IO_IBUF_SEGMENT) { + *array = os_aio_ibuf_array; + segment = 0; + + } else if (global_segment == IO_LOG_SEGMENT) { + *array = os_aio_log_array; + segment = 0; + + } else if (global_segment < os_aio_read_array->n_segments + 2) { + *array = os_aio_read_array; + + segment = global_segment - 2; + } else { + *array = os_aio_write_array; + + segment = global_segment - (os_aio_read_array->n_segments + 2); + } + + return(segment); +} + +/*******************************************************************//** +Requests for a slot in the aio array. If no slot is available, waits until +not_full-event becomes signaled. 
+@return pointer to slot */ +static +os_aio_slot_t* +os_aio_array_reserve_slot( +/*======================*/ + ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */ + os_aio_array_t* array, /*!< in: aio array */ + fil_node_t* message1,/*!< in: message to be passed along with + the aio operation */ + void* message2,/*!< in: message to be passed along with + the aio operation */ + pfs_os_file_t file, /*!< in: file handle */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + void* buf, /*!< in: buffer where to read or from which + to write */ + os_offset_t offset, /*!< in: file offset */ + ulint len, /*!< in: length of the block to read or write */ + ulint space_id) +{ + os_aio_slot_t* slot = NULL; +#ifdef WIN_ASYNC_IO + OVERLAPPED* control; + +#elif defined(LINUX_NATIVE_AIO) + + struct iocb* iocb; + off_t aio_offset; + +#endif /* WIN_ASYNC_IO */ + ulint i; + ulint counter; + ulint slots_per_seg; + ulint local_seg; + +#ifdef WIN_ASYNC_IO + ut_a((len & 0xFFFFFFFFUL) == len); +#endif /* WIN_ASYNC_IO */ + + /* No need of a mutex. Only reading constant fields */ + slots_per_seg = array->n_slots / array->n_segments; + + /* We attempt to keep adjacent blocks in the same local + segment. This can help in merging IO requests when we are + doing simulated AIO */ + local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6)) + % array->n_segments; + +loop: + os_mutex_enter(array->mutex); + + if (array->n_reserved == array->n_slots) { + os_mutex_exit(array->mutex); + + if (!srv_use_native_aio) { + /* If the handler threads are suspended, wake them + so that we get more slots */ + + os_aio_simulated_wake_handler_threads(); + } + + os_event_wait(array->not_full); + + goto loop; + } + + /* We start our search for an available slot from our preferred + local segment and do a full scan of the array. We are + guaranteed to find a slot in full scan. 
*/ + for (i = local_seg * slots_per_seg, counter = 0; + counter < array->n_slots; + i++, counter++) { + + i %= array->n_slots; + + slot = os_aio_array_get_nth_slot(array, i); + + if (slot->reserved == FALSE) { + goto found; + } + } + + /* We MUST always be able to get hold of a reserved slot. */ + ut_error; + +found: + ut_a(slot->reserved == FALSE); + array->n_reserved++; + + if (array->n_reserved == 1) { + os_event_reset(array->is_empty); + } + + if (array->n_reserved == array->n_slots) { + os_event_reset(array->not_full); + } + + slot->reserved = TRUE; + slot->reservation_time = ut_time(); + slot->message1 = message1; + slot->message2 = message2; + slot->file = file; + slot->name = name; + slot->len = len; + slot->type = type; + slot->buf = static_cast<byte*>(buf); + slot->offset = offset; + slot->io_already_done = FALSE; + slot->space_id = space_id; + +#ifdef WIN_ASYNC_IO + control = &slot->control; + control->Offset = (DWORD) offset & 0xFFFFFFFF; + control->OffsetHigh = (DWORD) (offset >> 32); + control->hEvent = 0; + slot->arr = array; + +#elif defined(LINUX_NATIVE_AIO) + + /* If we are not using native AIO skip this part. */ + if (!srv_use_native_aio) { + goto skip_native_aio; + } + + /* Check if we are dealing with 64 bit arch. + If not then make sure that offset fits in 32 bits. */ + aio_offset = (off_t) offset; + + ut_a(sizeof(aio_offset) >= sizeof(offset) + || ((os_offset_t) aio_offset) == offset); + + iocb = &slot->control; + + if (type == OS_FILE_READ) { + io_prep_pread(iocb, file.m_file, buf, len, aio_offset); + } else { + ut_a(type == OS_FILE_WRITE); + io_prep_pwrite(iocb, file.m_file, buf, len, aio_offset); + } + + iocb->data = (void*) slot; + slot->n_bytes = 0; + slot->ret = 0; + +skip_native_aio: +#endif /* LINUX_NATIVE_AIO */ + os_mutex_exit(array->mutex); + + return(slot); +} + +/*******************************************************************//** +Frees a slot in the aio array. 
*/ +static +void +os_aio_array_free_slot( +/*===================*/ + os_aio_array_t* array, /*!< in: aio array */ + os_aio_slot_t* slot) /*!< in: pointer to slot */ +{ + os_mutex_enter(array->mutex); + + ut_ad(slot->reserved); + + slot->reserved = FALSE; + + array->n_reserved--; + + if (array->n_reserved == array->n_slots - 1) { + os_event_set(array->not_full); + } + + if (array->n_reserved == 0) { + os_event_set(array->is_empty); + } + +#ifdef LINUX_NATIVE_AIO + + if (srv_use_native_aio) { + memset(&slot->control, 0x0, sizeof(slot->control)); + slot->n_bytes = 0; + slot->ret = 0; + /*fprintf(stderr, "Freed up Linux native slot.\n");*/ + } else { + /* These fields should not be used if we are not + using native AIO. */ + ut_ad(slot->n_bytes == 0); + ut_ad(slot->ret == 0); + } + +#endif + os_mutex_exit(array->mutex); +} + +/**********************************************************************//** +Wakes up a simulated aio i/o-handler thread if it has something to do. */ +static +void +os_aio_simulated_wake_handler_thread( +/*=================================*/ + ulint global_segment) /*!< in: the number of the segment in the aio + arrays */ +{ + os_aio_array_t* array; + ulint segment; + + ut_ad(!srv_use_native_aio); + + segment = os_aio_get_array_and_local_segment(&array, global_segment); + + ulint n = array->n_slots / array->n_segments; + + segment *= n; + + /* Look through n slots after the segment * n'th slot */ + + os_mutex_enter(array->mutex); + + for (ulint i = 0; i < n; ++i) { + const os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, segment + i); + + if (slot->reserved) { + + /* Found an i/o request */ + + os_mutex_exit(array->mutex); + + os_event_t event; + + event = os_aio_segment_wait_events[global_segment]; + + os_event_set(event); + + return; + } + } + + os_mutex_exit(array->mutex); +} + +/**********************************************************************//** +Wakes up simulated aio i/o-handler threads if they have something to do. 
*/ +UNIV_INTERN +void +os_aio_simulated_wake_handler_threads(void) +/*=======================================*/ +{ + if (srv_use_native_aio) { + /* We do not use simulated aio: do nothing */ + + return; + } + + os_aio_recommend_sleep_for_read_threads = FALSE; + + for (ulint i = 0; i < os_aio_n_segments; i++) { + os_aio_simulated_wake_handler_thread(i); + } +} + +#ifdef _WIN32 +/**********************************************************************//** +This function can be called if one wants to post a batch of reads and +prefers an i/o-handler thread to handle them all at once later. You must +call os_aio_simulated_wake_handler_threads later to ensure the threads +are not left sleeping! */ +UNIV_INTERN +void +os_aio_simulated_put_read_threads_to_sleep() +{ + +/* The idea of putting background IO threads to sleep is only for +Windows when using simulated AIO. Windows XP seems to schedule +background threads too eagerly to allow for coalescing during +readahead requests. */ + + os_aio_array_t* array; + + if (srv_use_native_aio) { + /* We do not use simulated aio: do nothing */ + + return; + } + + os_aio_recommend_sleep_for_read_threads = TRUE; + + for (ulint i = 0; i < os_aio_n_segments; i++) { + os_aio_get_array_and_local_segment(&array, i); + + if (array == os_aio_read_array) { + + os_event_reset(os_aio_segment_wait_events[i]); + } + } +} +#endif /* _WIN32 */ + +/** Submit buffered AIO requests on the given segment to the kernel +(low level function). +@param acquire_mutex specifies whether to lock array mutex +*/ +static +void +os_aio_dispatch_read_array_submit_low(bool acquire_mutex MY_ATTRIBUTE((unused))) +{ + if (!srv_use_native_aio) { + return; + } +#if defined(LINUX_NATIVE_AIO) + os_aio_array_t* array = os_aio_read_array; + ulint total_submitted = 0; + if (acquire_mutex) + os_mutex_enter(array->mutex); + /* Submit aio requests buffered on all segments. 
*/ + for (ulint i = 0; i < array->n_segments; i++) { + const int count = array->count[i]; + int offset = 0; + while (offset != count) { + struct iocb** const iocb_array = array->pending + + i * array->n_slots / array->n_segments + + offset; + const int partial_count = count - offset; + /* io_submit() returns number of successfully queued + requests or (-errno). + It returns 0 only if the number of iocb blocks passed + is also 0. */ + const int submitted = io_submit(array->aio_ctx[i], + partial_count, iocb_array); + + /* This assertion prevents infinite loop in both + debug and release modes. */ + ut_a(submitted != 0); + + if (submitted < 0) { + /* Terminating with fatal error */ + const char* errmsg = + strerror(-submitted); + ib_logf(IB_LOG_LEVEL_FATAL, + "Trying to sumbit %d aio requests, " + "io_submit() set errno to %d: %s", + partial_count, -submitted, + errmsg ? errmsg : "<unknown>"); + } + ut_ad(submitted <= partial_count); + if (submitted < partial_count) + { + ib_logf(IB_LOG_LEVEL_WARN, + "Trying to sumbit %d aio requests, " + "io_submit() submitted only %d", + partial_count, submitted); + } + offset += submitted; + } + total_submitted += count; + } + /* Reset the aio request buffer. */ + memset(array->pending, 0x0, sizeof(struct iocb*) * array->n_slots); + memset(array->count, 0x0, sizeof(ulint) * array->n_segments); + + if (acquire_mutex) + os_mutex_exit(array->mutex); + + srv_stats.n_aio_submitted.add(total_submitted); +#endif +} + +/** Submit buffered AIO requests on the given segment to the kernel. */ +UNIV_INTERN +void +os_aio_dispatch_read_array_submit() +{ + os_aio_dispatch_read_array_submit_low(true); +} + +#if defined(LINUX_NATIVE_AIO) +/*******************************************************************//** +Dispatch an AIO request to the kernel. +@return TRUE on success. */ +static +ibool +os_aio_linux_dispatch( +/*==================*/ + os_aio_array_t* array, /*!< in: io request array. 
*/ + os_aio_slot_t* slot, /*!< in: an already reserved slot. */ + bool should_buffer) /*!< in: should buffer the request + rather than submit. */ +{ + int ret; + struct iocb* iocb; + + ut_ad(slot != NULL); + ut_ad(array); + + ut_a(slot->reserved); + + /* Find out what we are going to work with. + The iocb struct is directly in the slot. + The io_context is one per segment. */ + + ulint slots_per_segment = array->n_slots / array->n_segments; + iocb = &slot->control; + ulint io_ctx_index = slot->pos / slots_per_segment; + if (should_buffer) { + ut_ad(array == os_aio_read_array); + + os_mutex_enter(array->mutex); + /* There are array->n_slots elements in array->pending, + which is divided into array->n_segments area of equal size. + The iocb of each segment are buffered in its corresponding area + in the pending array consecutively as they come. + array->count[i] records the number of buffered aio requests + in the ith segment.*/ + ulint& count = array->count[io_ctx_index]; + ut_ad(count != slots_per_segment); + ulint n = io_ctx_index * slots_per_segment + count; + array->pending[n] = iocb; + ++count; + if (count == slots_per_segment) { + os_aio_dispatch_read_array_submit_low(false); + } + os_mutex_exit(array->mutex); + return(TRUE); + } + /* Submit the given request. */ + ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb); + +#if defined(UNIV_AIO_DEBUG) + fprintf(stderr, + "io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n", + (slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot, + array->aio_ctx[io_ctx_index], (ulong) io_ctx_index); +#endif + + /* io_submit returns number of successfully + queued requests or -errno. */ + if (UNIV_UNLIKELY(ret != 1)) { + errno = -ret; + return(FALSE); + } + + return(TRUE); +} +#endif /* LINUX_NATIVE_AIO */ + + +/*******************************************************************//** +NOTE! Use the corresponding macro os_aio(), not directly this function! +Requests an asynchronous i/o operation. 
+@return TRUE if request was queued successfully, FALSE if fail */
 +UNIV_INTERN
 +ibool
 +os_aio_func(
 +/*========*/
 +	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
 +	ulint		mode,	/*!< in: OS_AIO_NORMAL, ..., possibly ORed
 +				to OS_AIO_SIMULATED_WAKE_LATER: the
 +				last flag advises this function not to wake
 +				i/o-handler threads, but the caller will
 +				do the waking explicitly later, in this
 +				way the caller can post several requests in
 +				a batch; NOTE that the batch must not be
 +				so big that it exhausts the slots in aio
 +				arrays! NOTE that a simulated batch
 +				may introduce hidden chances of deadlocks,
 +				because i/os are not actually handled until
 +				all have been posted: use with great
 +				caution! */
 +	const char*	name,	/*!< in: name of the file or path as a
 +				null-terminated string */
 +	pfs_os_file_t	file,	/*!< in: handle to a file */
 +	void*		buf,	/*!< in: buffer where to read or from which
 +				to write */
 +	os_offset_t	offset,	/*!< in: file offset where to read or write */
 +	ulint		n,	/*!< in: number of bytes to read or write */
 +	fil_node_t*	message1,/*!< in: message for the aio handler
 +				(can be used to identify a completed
 +				aio operation); ignored if mode is
 +				OS_AIO_SYNC */
 +	void*		message2,/*!< in: message for the aio handler
 +				(can be used to identify a completed
 +				aio operation); ignored if mode is
 +				OS_AIO_SYNC */
 +	ulint		space_id,/*!< in: passed on unchanged to
 +				os_aio_array_reserve_slot() */
 +	trx_t*		trx,	/*!< in: may be NULL; for reads its
 +				io_reads and io_read counters are
 +				increased, and in OS_AIO_SYNC mode it is
 +				passed to os_file_read_func() */
 +	bool		should_buffer)
 +				/*!< in: Whether to buffer an aio request.
 +				AIO read ahead uses this. If you plan to
 +				use this parameter, make sure you remember
 +				to call os_aio_dispatch_read_array_submit()
 +				when you're ready to commit all your requests.*/
 +{
 +	os_aio_array_t*	array;
 +	os_aio_slot_t*	slot;
 +#ifdef WIN_ASYNC_IO
 +	DWORD		len		= (DWORD) n;
 +	BOOL		ret;
 +#endif
 +	ulint		wake_later;
 +	ut_ad(buf);
 +	ut_ad(n > 0);
 +	ut_ad(n % OS_MIN_LOG_BLOCK_SIZE == 0);
 +	ut_ad(offset % OS_MIN_LOG_BLOCK_SIZE == 0);
 +	ut_ad(os_aio_validate_skip());
 +#ifdef WIN_ASYNC_IO
 +	ut_ad((n & 0xFFFFFFFFUL) == n);
 +#endif
 +
 +	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
 +	mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
 +
 +	DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
 +		mode = OS_AIO_SYNC; os_has_said_disk_full = FALSE;);
 +
 +	if (mode == OS_AIO_SYNC) {
 +		ibool ret;
 +		/* This is actually an ordinary synchronous read or write:
 +		no need to use an i/o-handler thread */
 +
 +		if (type == OS_FILE_READ) {
 +			ret = os_file_read_func(file.m_file, buf, offset, n, trx);
 +		} else {
 +			ut_ad(!srv_read_only_mode);
 +			ut_a(type == OS_FILE_WRITE);
 +
 +			ret = os_file_write(name, file, buf, offset, n);
 +
 +			DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
 +				os_has_said_disk_full = FALSE; ret = 0; errno = 28;);
 +
 +			if (!ret) {
 +				os_file_handle_error_cond_exit(name, "os_file_write_func", TRUE, FALSE);
 +			}
 +		}
 +
 +		if (!ret) {
 +			fprintf(stderr, "FAIL"); /* NOTE(review): bare "FAIL"
 +			with no newline or context; looks like leftover
 +			debug output - confirm it is intended */
 +		}
 +
 +		return ret;
 +	}
 +
 +try_again:
 +	switch (mode) {
 +	case OS_AIO_NORMAL:
 +		if (type == OS_FILE_READ) {
 +			array = os_aio_read_array;
 +		} else {
 +			ut_ad(!srv_read_only_mode);
 +			array = os_aio_write_array;
 +		}
 +		break;
 +	case OS_AIO_IBUF:
 +		ut_ad(type == OS_FILE_READ);
 +		/* Reduce probability of deadlock bugs in connection with ibuf:
 +		do not let the ibuf i/o handler sleep */
 +
 +		wake_later = FALSE;
 +
 +		if (srv_read_only_mode) {
 +			array = os_aio_read_array;
 +		} else {
 +			array = os_aio_ibuf_array;
 +		}
 +		break;
 +	case OS_AIO_LOG:
 +		if (srv_read_only_mode) {
 +			array = os_aio_read_array;
 +		} else {
 +			array = os_aio_log_array;
 +		}
 +		break;
 +	case OS_AIO_SYNC:
 +		array = os_aio_sync_array;
 +#if defined(LINUX_NATIVE_AIO)
 +		/* In Linux native AIO we don't use sync IO array. */
 +		ut_a(!srv_use_native_aio);
 +#endif /* LINUX_NATIVE_AIO */
 +		break;
 +	default:
 +		ut_error;
 +		array = NULL; /* Eliminate compiler warning */
 +	}
 +
 +	if (trx && type == OS_FILE_READ)
 +	{
 +		trx->io_reads++; /* sits after the try_again label, so it
 +		is counted again on every retry of the same request */
 +		trx->io_read += n;
 +	}
 +	slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
 +					 name, buf, offset, n, space_id);
 +	if (type == OS_FILE_READ) {
 +		if (srv_use_native_aio) {
 +			os_n_file_reads++;
 +			os_bytes_read_since_printout += n;
 +#ifdef WIN_ASYNC_IO
 +			ret = ReadFile(file.m_file, buf, (DWORD) n, &len,
 +				       &(slot->control));
 +			if(!ret && GetLastError() != ERROR_IO_PENDING)
 +				goto err_exit;
 +
 +#elif defined(LINUX_NATIVE_AIO)
 +			if (!os_aio_linux_dispatch(array, slot,
 +						   should_buffer)) {
 +				goto err_exit;
 +			}
 +#endif /* WIN_ASYNC_IO */
 +		} else {
 +			if (!wake_later) {
 +				os_aio_simulated_wake_handler_thread(
 +					os_aio_get_segment_no_from_slot(
 +						array, slot));
 +			}
 +		}
 +	} else if (type == OS_FILE_WRITE) {
 +		ut_ad(!srv_read_only_mode);
 +		if (srv_use_native_aio) {
 +			os_n_file_writes++;
 +#ifdef WIN_ASYNC_IO
 +			ret = WriteFile(file.m_file, buf, (DWORD) n, &len,
 +					&(slot->control));
 +
 +			if(!ret && GetLastError() != ERROR_IO_PENDING)
 +				goto err_exit;
 +#elif defined(LINUX_NATIVE_AIO)
 +			if (!os_aio_linux_dispatch(array, slot, false)) {
 +				goto err_exit;
 +			}
 +#endif /* WIN_ASYNC_IO */
 +		} else {
 +			if (!wake_later) {
 +				os_aio_simulated_wake_handler_thread(
 +					os_aio_get_segment_no_from_slot(
 +						array, slot));
 +			}
 +		}
 +	} else {
 +		ut_error;
 +	}
 +
 +	/* aio was queued successfully! */
 +	return(TRUE);
 +
 +#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
 +err_exit:
 +#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */
 +	os_aio_array_free_slot(array, slot);
 +
 +	if (os_file_handle_error(
 +		name,type == OS_FILE_READ ? "aio read" : "aio write")) {
 +
 +		goto try_again;
 +	}
 +
 +	return(FALSE);
 +}
 +
 +#ifdef WIN_ASYNC_IO
 +#define READ_SEGMENT(x) (x < srv_n_read_io_threads)
 +#define WRITE_SEGMENT(x) !READ_SEGMENT(x)
 +
 +/**********************************************************************//**
 +This function is only used in Windows asynchronous i/o.
 +Waits for an aio operation to complete. This function is used to wait for
 +the completed requests. The aio array of pending requests is divided
 +into segments. The thread specifies which segment or slot it wants to wait
 +for. NOTE: this function will also take care of freeing the aio slot,
 +therefore no other thread is allowed to do the freeing!
 +If the completion is not a full-length success and
 +os_file_handle_error() asks for a retry, the request is repeated
 +synchronously with os_file_write() or os_file_read().
 +@return TRUE if the aio operation succeeded */
 +UNIV_INTERN
 +ibool
 +os_aio_windows_handle(
 +/*==================*/
 +	ulint	segment,	/*!< in: the number of the segment in the aio
 +				arrays to wait for; segment 0 is the ibuf
 +				i/o thread, segment 1 the log i/o thread,
 +				then follow the non-ibuf read threads, and as
 +				the last are the non-ibuf write threads; if
 +				this is ULINT_UNDEFINED, then it means that
 +				sync aio is used, and this parameter is
 +				ignored */
 +	ulint	pos,		/*!< this parameter is used only in sync aio:
 +				wait for the aio slot at this position */
 +	fil_node_t**message1,	/*!< out: the messages passed with the aio
 +				request; note that also in the case where
 +				the aio operation failed, these output
 +				parameters are valid and can be used to
 +				restart the operation, for example */
 +	void**	message2,	/*!< out: slot->message2 of the request */
 +	ulint*	type,		/*!< out: OS_FILE_WRITE or ..._READ */
 +	ulint*	space_id)	/*!< out: slot->space_id of the request */
 +{
 +	ulint		orig_seg	= segment; /* not referenced again
 +						in this function */
 +	os_aio_slot_t*	slot;
 +	ibool		ret_val;
 +	BOOL		ret;
 +	DWORD		len;
 +	BOOL		retry		= FALSE;
 +	ULONG_PTR	key;
 +	HANDLE		port = READ_SEGMENT(segment)? read_completion_port : completion_port;
 +
 +	for(;;) {
 +		ret = GetQueuedCompletionStatus(port, &len, &key,
 +			(OVERLAPPED **)&slot, INFINITE);
 +
 +		/* If shutdown key was received, repost the shutdown message and exit */
 +		if (ret && (key == IOCP_SHUTDOWN_KEY)) {
 +			PostQueuedCompletionStatus(port, 0, key, NULL);
 +			os_thread_exit(NULL);
 +		}
 +
 +		if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
 +			os_thread_exit(NULL);
 +		}
 +
 +		if(WRITE_SEGMENT(segment)&& slot->type == OS_FILE_READ) {
 +			/*
 +			Redirect read completions  to the dedicated completion port
 +			and thread. We need to split read and write threads. If we do not
 +			do that, and just allow all io threads process all IO, it is possible
 +			to get stuck in a deadlock in buffer pool code,
 +
 +			Currently, the problem is solved this way - "write io" threads
 +			always get all completion notifications, from both async reads and
 +			writes. Write completion is handled in the same thread that gets it.
 +			Read completion is forwarded via PostQueuedCompletionStatus()
 +			to the second completion port dedicated solely to reads. One of the
 +			"read io" threads waiting on this port will finally handle the IO.
 +
 +			Forwarding IO completion this way costs a context switch, and this
 +			seems tolerable since asynchronous reads are by far less frequent.
 +			*/
 +			ut_a(PostQueuedCompletionStatus(read_completion_port, len, key,
 +				&slot->control));
 +		}
 +		else {
 +			break;
 +		}
 +	}
 +	*message1 = slot->message1;
 +	*message2 = slot->message2;
 +
 +	*type = slot->type;
 +	*space_id = slot->space_id;
 +
 +	if (ret && len == slot->len) {
 +
 +		ret_val = TRUE;
 +	} else if (os_file_handle_error(slot->name, "Windows aio")) {
 +
 +		retry = TRUE;
 +	} else {
 +
 +		ret_val = FALSE;
 +	}
 +
 +	if (retry) {
 +		LARGE_INTEGER li;
 +		li.LowPart = slot->control.Offset;
 +		li.HighPart = slot->control.OffsetHigh;
 +
 +		ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
 +
 +		switch (slot->type) {
 +		case OS_FILE_WRITE:
 +			ret_val = os_file_write(slot->name, slot->file, slot->buf,
 +				li.QuadPart, slot->len);
 +			break;
 +		case OS_FILE_READ:
 +			ret_val = os_file_read(slot->file, slot->buf,
 +				li.QuadPart, slot->len);
 +			break;
 +		default:
 +			ut_error;
 +		}
 +
 +	}
 +
 +	os_aio_array_free_slot((os_aio_array_t *)slot->arr, slot);
 +
 +	return(ret_val);
 +}
 +#endif
 +
 +#if defined(LINUX_NATIVE_AIO)
 +/******************************************************************//**
 +This function is only used in Linux native asynchronous i/o. This is
 +called from within the io-thread. If there are no completed IO requests
 +in the slot array, the thread calls this function to collect more
 +requests from the kernel.
 +The io-thread waits on io_getevents(), which is a blocking call, with
 +a timeout value. Unless the system is very heavy loaded, keeping the
 +io-thread very busy, the io-thread will spend most of its time waiting
 +in this function.
 +The io-thread also exits in this function. It checks server status at
 +each wakeup and that is why we use timed wait in io_getevents().
 +The function returns once at least one event has been recorded in its
 +slot, or when srv_shutdown_state is SRV_SHUTDOWN_EXIT_THREADS; a
 +return value from io_getevents() other than a positive count, 0,
 +-EAGAIN or -EINTR ends in ut_error. */
 +static
 +void
 +os_aio_linux_collect(
 +/*=================*/
 +	os_aio_array_t* array,	/*!< in/out: slot array. */
 +	ulint		segment,	/*!< in: local segment no. */
 +	ulint		seg_size)	/*!< in: segment size. */
 +{
 +	int			i;
 +	int			ret;
 +	ulint			start_pos;
 +	ulint			end_pos;
 +	struct timespec		timeout;
 +	struct io_event*	events;
 +	struct io_context*	io_ctx;
 +
 +	/* sanity checks. */
 +	ut_ad(array != NULL);
 +	ut_ad(seg_size > 0);
 +	ut_ad(segment < array->n_segments);
 +
 +	/* Which part of event array we are going to work on. */
 +	events = &array->aio_events[segment * seg_size];
 +
 +	/* Which io_context we are going to use. */
 +	io_ctx = array->aio_ctx[segment];
 +
 +	/* Starting point of the segment we will be working on. */
 +	start_pos = segment * seg_size;
 +
 +	/* End point. */
 +	end_pos = start_pos + seg_size;
 +
 +retry:
 +
 +	/* Initialize the events. The timeout value is arbitrary.
 +	We probably need to experiment with it a little. */
 +	memset(events, 0, sizeof(*events) * seg_size);
 +	timeout.tv_sec = 0;
 +	timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
 +
 +	ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
 +
 +	if (ret > 0) {
 +		for (i = 0; i < ret; i++) {
 +			os_aio_slot_t*	slot;
 +			struct iocb*	control;
 +
 +			control = (struct iocb*) events[i].obj;
 +			ut_a(control != NULL);
 +
 +			slot = (os_aio_slot_t*) control->data;
 +
 +			/* Some sanity checks. */
 +			ut_a(slot != NULL);
 +			ut_a(slot->reserved);
 +
 +#if defined(UNIV_AIO_DEBUG)
 +			fprintf(stderr,
 +				"io_getevents[%c]: slot[%p] ctx[%p]"
 +				" seg[%lu]\n",
 +				(slot->type == OS_FILE_WRITE) ? 'w' : 'r',
 +				slot, io_ctx, segment);
 +#endif
 +
 +			/* We are not scribbling previous segment. */
 +			ut_a(slot->pos >= start_pos);
 +
 +			/* We have not overstepped to next segment. */
 +			ut_a(slot->pos < end_pos);
 +
 +			/* Mark this request as completed. The error handling
 +			will be done in the calling function. */
 +			os_mutex_enter(array->mutex);
 +			slot->n_bytes = events[i].res;
 +			slot->ret = events[i].res2;
 +			slot->io_already_done = TRUE;
 +			os_mutex_exit(array->mutex);
 +		}
 +		return;
 +	}
 +
 +	if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
 +		return;
 +	}
 +
 +	/* This error handling is for any error in collecting the
 +	IO requests. The errors, if any, for any particular IO
 +	request are simply passed on to the calling routine. */
 +
 +	switch (ret) {
 +	case -EAGAIN:
 +		/* Not enough resources! Try again. */
 +	case -EINTR:
 +		/* Interrupted! I have tested the behaviour in case of an
 +		interrupt. If we have some completed IOs available then
 +		the return code will be the number of IOs. We get EINTR only
 +		if there are no completed IOs and we have been interrupted. */
 +	case 0:
 +		/* No pending request! Go back and check again. */
 +		goto retry;
 +	}
 +
 +	/* All other errors should cause a trap for now. */
 +	ut_print_timestamp(stderr);
 +	fprintf(stderr,
 +		" InnoDB: unexpected ret_code[%d] from io_getevents()!\n",
 +		ret);
 +	ut_error;
 +}
 +
 +/**********************************************************************//**
 +This function is only used in Linux native asynchronous i/o.
 +Waits for an aio operation to complete. This function is used to wait for
 +the completed requests. The aio array of pending requests is divided
 +into segments. The thread specifies which segment or slot it wants to wait
 +for. NOTE: this function will also take care of freeing the aio slot,
 +therefore no other thread is allowed to do the freeing!
 +A request that completed with fewer bytes than asked for is resubmitted
 +for the remaining range and waited for again before anything is returned.
 +@return TRUE if the IO was successful; TRUE is also returned, with
 +*message1 and *message2 set to NULL, when the server is shutting down
 +and no slot of the segment is reserved */
 +UNIV_INTERN
 +ibool
 +os_aio_linux_handle(
 +/*================*/
 +	ulint	global_seg,	/*!< in: segment number in the aio array
 +				to wait for; segment 0 is the ibuf
 +				i/o thread, segment 1 is log i/o thread,
 +				then follow the non-ibuf read threads,
 +				and the last are the non-ibuf write
 +				threads. */
 +	fil_node_t**message1,	/*!< out: the messages passed with the */
 +	void**	message2,	/*!< aio request; note that in case the
 +				aio operation failed, these output
 +				parameters are valid and can be used to
 +				restart the operation. */
 +	ulint*	type,		/*!< out: OS_FILE_WRITE or ..._READ */
 +	ulint*	space_id)	/*!< out: slot->space_id of the request */
 +{
 +	ulint		segment;
 +	os_aio_array_t*	array;
 +	os_aio_slot_t*	slot;
 +	ulint		n;
 +	ulint		i;
 +	ibool		ret = FALSE;
 +
 +	/* Should never be doing Sync IO here. */
 +	ut_a(global_seg != ULINT_UNDEFINED);
 +
 +	/* Find the array and the local segment. */
 +	segment = os_aio_get_array_and_local_segment(&array, global_seg);
 +	n = array->n_slots / array->n_segments;
 +
 + wait_for_event:
 +	/* Loop until we have found a completed request. */
 +	for (;;) {
 +		ibool	any_reserved = FALSE;
 +		os_mutex_enter(array->mutex);
 +		for (i = 0; i < n; ++i) {
 +			slot = os_aio_array_get_nth_slot(
 +				array, i + segment * n);
 +			if (!slot->reserved) {
 +				continue;
 +			} else if (slot->io_already_done) {
 +				/* Something for us to work on.
 +				array->mutex stays held across the jump. */
 +				goto found;
 +			} else {
 +				any_reserved = TRUE;
 +			}
 +		}
 +
 +		os_mutex_exit(array->mutex);
 +
 +		/* There is no completed request.
 +		If there is no pending request at all,
 +		and the system is being shut down, exit. */
 +		if (UNIV_UNLIKELY
 +		    (!any_reserved
 +		     && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) {
 +			*message1 = NULL;
 +			*message2 = NULL;
 +			return(TRUE);
 +		}
 +
 +		/* Wait for some request. Note that we return
 +		from wait iff we have found a request. */
 +
 +		srv_set_io_thread_op_info(global_seg,
 +			"waiting for completed aio requests");
 +		os_aio_linux_collect(array, segment, n);
 +	}
 +
 +found:
 +	/* Note that it may be that there are more than one completed
 +	IO requests. We process them one at a time. We may have a case
 +	here to improve the performance slightly by dealing with all
 +	requests in one sweep. */
 +	srv_set_io_thread_op_info(global_seg,
 +		"processing completed aio requests");
 +
 +	/* Ensure that we are scribbling only our segment. */
 +	ut_a(i < n);
 +
 +	ut_ad(slot != NULL);
 +	ut_ad(slot->reserved);
 +	ut_ad(slot->io_already_done);
 +
 +	*message1 = slot->message1;
 +	*message2 = slot->message2;
 +
 +	*type = slot->type;
 +	*space_id = slot->space_id;
 +
 +	if (slot->ret == 0 && slot->n_bytes == (long) slot->len) {
 +
 +		ret = TRUE;
 +	} else if ((slot->ret == 0) && (slot->n_bytes > 0)
 +		   && (slot->n_bytes < (long) slot->len)) {
 +		/* Partial read or write scenario: move buf and offset
 +		past the bytes already transferred, shrink len to the
 +		remainder and resubmit the same slot. */
 +		int submit_ret;
 +		struct iocb*    iocb;
 +		slot->buf = (byte*)slot->buf + slot->n_bytes;
 +		slot->offset = slot->offset + slot->n_bytes;
 +		slot->len = slot->len - slot->n_bytes;
 +		/* Resetting the bytes read/written */
 +		slot->n_bytes = 0;
 +		slot->io_already_done = FALSE;
 +		iocb = &(slot->control);
 +
 +		if (slot->type == OS_FILE_READ) {
 +			io_prep_pread(&slot->control, slot->file.m_file,
 +				      slot->buf, slot->len,
 +				      (off_t) slot->offset);
 +		} else {
 +			ut_a(slot->type == OS_FILE_WRITE);
 +			io_prep_pwrite(&slot->control, slot->file.m_file,
 +				       slot->buf, slot->len,
 +				       (off_t) slot->offset);
 +		}
 +		/* Resubmit an I/O request */
 +		submit_ret = io_submit(array->aio_ctx[segment], 1, &iocb);
 +		if (submit_ret < 0 ) {
 +			/* Aborting in case of submit failure */
 +			ib_logf(IB_LOG_LEVEL_FATAL,
 +				"Native Linux AIO interface. io_submit()"
 +				" call failed when resubmitting a partial"
 +				" I/O request on the file %s.",
 +				slot->name);
 +		} else {
 +			ret = FALSE;
 +			os_mutex_exit(array->mutex);
 +			goto wait_for_event;
 +		}
 +	} else {
 +		errno = -slot->ret;
 +
 +		/* os_file_handle_error does tell us if we should retry
 +		this IO. As it stands now, we don't do this retry when
 +		reaping requests from a different context than
 +		the dispatcher. This non-retry logic is the same for
 +		windows and linux native AIO.
 +		We should probably look into this to transparently
 +		re-submit the IO. */
 +		os_file_handle_error(slot->name, "Linux aio");
 +
 +		ret = FALSE;
 +	}
 +
 +	os_mutex_exit(array->mutex);
 +
 +	os_aio_array_free_slot(array, slot);
 +
 +	return(ret);
 +}
 +#endif /* LINUX_NATIVE_AIO */
 +
 +/**********************************************************************//**
 +Does simulated aio. This function should be called by an i/o-handler
 +thread.
 +Each call hands back one request of the segment: either a slot whose
 +i/o was already done as part of an earlier merged batch, or the first
 +slot of a new batch of up to OS_AIO_MERGE_N_CONSECUTIVE adjacent
 +requests that is performed here with synchronous file i/o.
 +@return TRUE if the aio operation succeeded */
 +UNIV_INTERN
 +ibool
 +os_aio_simulated_handle(
 +/*====================*/
 +	ulint	global_segment,	/*!< in: the number of the segment in the aio
 +				arrays to wait for; segment 0 is the ibuf
 +				i/o thread, segment 1 the log i/o thread,
 +				then follow the non-ibuf read threads, and as
 +				the last are the non-ibuf write threads */
 +	fil_node_t**message1,	/*!< out: the messages passed with the aio
 +				request; note that also in the case where
 +				the aio operation failed, these output
 +				parameters are valid and can be used to
 +				restart the operation, for example */
 +	void**	message2,	/*!< out: aio_slot->message2 of the request */
 +	ulint*	type,		/*!< out: OS_FILE_WRITE or ..._READ */
 +	ulint*	space_id)	/*!< out: aio_slot->space_id of the request */
 +{
 +	os_aio_array_t*	array;
 +	ulint		segment;
 +	os_aio_slot_t*	consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
 +	ulint		n_consecutive;
 +	ulint		total_len;
 +	ulint		offs;
 +	os_offset_t	lowest_offset;
 +	ulint		biggest_age;
 +	ulint		age;
 +	byte*		combined_buf;
 +	byte*		combined_buf2;
 +	ibool		ret;
 +	ibool		any_reserved;
 +	ulint		n;
 +	os_aio_slot_t*	aio_slot;
 +
 +	/* Fix compiler warning */
 +	*consecutive_ios = NULL;
 +
 +	segment = os_aio_get_array_and_local_segment(&array, global_segment);
 +
 +restart:
 +	/* NOTE! We only access constant fields in os_aio_array. Therefore
 +	we do not have to acquire the protecting mutex yet */
 +
 +	srv_set_io_thread_op_info(global_segment,
 +				  "looking for i/o requests (a)");
 +	ut_ad(os_aio_validate_skip());
 +	ut_ad(segment < array->n_segments);
 +
 +	n = array->n_slots / array->n_segments;
 +
 +	/* Look through n slots after the segment * n'th slot */
 +
 +	if (array == os_aio_read_array
 +	    && os_aio_recommend_sleep_for_read_threads) {
 +
 +		/* Give other threads chance to add several i/os to the array
 +		at once. */
 +
 +		goto recommended_sleep;
 +	}
 +
 +	srv_set_io_thread_op_info(global_segment,
 +				  "looking for i/o requests (b)");
 +
 +	/* Check if there is a slot for which the i/o has already been
 +	done */
 +	any_reserved = FALSE;
 +
 +	os_mutex_enter(array->mutex);
 +
 +	for (ulint i = 0; i < n; i++) {
 +		os_aio_slot_t*	slot;
 +
 +		slot = os_aio_array_get_nth_slot(array, i + segment * n);
 +
 +		if (!slot->reserved) {
 +			continue;
 +		} else if (slot->io_already_done) {
 +
 +			if (os_aio_print_debug) {
 +				fprintf(stderr,
 +					"InnoDB: i/o for slot %lu"
 +					" already done, returning\n",
 +					(ulong) i);
 +			}
 +
 +			aio_slot = slot;
 +			ret = TRUE;
 +			goto slot_io_done;
 +		} else {
 +			any_reserved = TRUE;
 +		}
 +	}
 +
 +	/* There is no completed request.
 +	If there is no pending request at all,
 +	and the system is being shut down, exit. */
 +	if (!any_reserved && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
 +		os_mutex_exit(array->mutex);
 +		*message1 = NULL;
 +		*message2 = NULL;
 +		return(TRUE);
 +	}
 +
 +	n_consecutive = 0;
 +
 +	/* If there are at least 2 seconds old requests, then pick the oldest
 +	one to prevent starvation. If several requests have the same age,
 +	then pick the one at the lowest offset. */
 +
 +	biggest_age = 0;
 +	lowest_offset = IB_UINT64_MAX;
 +
 +	for (ulint i = 0; i < n; i++) {
 +		os_aio_slot_t*	slot;
 +
 +		slot = os_aio_array_get_nth_slot(array, i + segment * n);
 +
 +		if (slot->reserved) {
 +
 +			age = (ulint) difftime(
 +				ut_time(), slot->reservation_time);
 +
 +			if ((age >= 2 && age > biggest_age)
 +			    || (age >= 2 && age == biggest_age
 +				&& slot->offset < lowest_offset)) {
 +
 +				/* Found an i/o request */
 +				consecutive_ios[0] = slot;
 +
 +				n_consecutive = 1;
 +
 +				biggest_age = age;
 +				lowest_offset = slot->offset;
 +			}
 +		}
 +	}
 +
 +	if (n_consecutive == 0) {
 +		/* There were no old requests. Look for an i/o request at the
 +		lowest offset in the array (we ignore the high 32 bits of the
 +		offset in these heuristics).
 +		NOTE(review): the comparison below is made on the whole
 +		slot->offset value, so the 32-bit remark looks stale -
 +		confirm against the declaration of slot->offset. */
 +
 +		lowest_offset = IB_UINT64_MAX;
 +
 +		for (ulint i = 0; i < n; i++) {
 +			os_aio_slot_t*	slot;
 +
 +			slot = os_aio_array_get_nth_slot(
 +				array, i + segment * n);
 +
 +			if (slot->reserved && slot->offset < lowest_offset) {
 +
 +				/* Found an i/o request */
 +				consecutive_ios[0] = slot;
 +
 +				n_consecutive = 1;
 +
 +				lowest_offset = slot->offset;
 +			}
 +		}
 +	}
 +
 +	if (n_consecutive == 0) {
 +
 +		/* No i/o requested at the moment */
 +
 +		goto wait_for_io;
 +	}
 +
 +	/* if n_consecutive != 0, then we have assigned
 +	something valid to consecutive_ios[0] */
 +	ut_ad(n_consecutive != 0);
 +	ut_ad(consecutive_ios[0] != NULL);
 +
 +	aio_slot = consecutive_ios[0];
 +
 +	/* Check if there are several consecutive blocks to read or write.
 +	Each hit becomes the new aio_slot and the scan starts over, so the
 +	chain grows at its tail until no successor is found or
 +	OS_AIO_MERGE_N_CONSECUTIVE requests have been collected. */
 +
 +consecutive_loop:
 +	for (ulint i = 0; i < n; i++) {
 +		os_aio_slot_t*	slot;
 +
 +		slot = os_aio_array_get_nth_slot(array, i + segment * n);
 +		if (slot->reserved
 +		    && slot != aio_slot
 +		    && slot->offset == aio_slot->offset + aio_slot->len
 +		    && slot->type == aio_slot->type
 +		    && slot->file.m_file == aio_slot->file.m_file) {
 +
 +			/* Found a consecutive i/o request */
 +
 +			consecutive_ios[n_consecutive] = slot;
 +			n_consecutive++;
 +
 +			aio_slot = slot;
 +
 +			if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
 +
 +				goto consecutive_loop;
 +			} else {
 +				break;
 +			}
 +		}
 +	}
 +
 +	srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
 +
 +	/* We have now collected n_consecutive i/o requests in the array;
 +	allocate a single buffer which can hold all data, and perform the
 +	i/o */
 +
 +	total_len = 0;
 +	aio_slot = consecutive_ios[0];
 +
 +	for (ulint i = 0; i < n_consecutive; i++) {
 +		total_len += consecutive_ios[i]->len;
 +	}
 +
 +	if (n_consecutive == 1) {
 +		/* We can use the buffer of the i/o request */
 +		combined_buf = aio_slot->buf;
 +		combined_buf2 = NULL;
 +	} else {
 +		combined_buf2 = static_cast<byte*>(
 +			ut_malloc(total_len + UNIV_PAGE_SIZE));
 +
 +		ut_a(combined_buf2);
 +
 +		combined_buf = static_cast<byte*>(
 +			ut_align(combined_buf2, UNIV_PAGE_SIZE));
 +	}
 +
 +	/* We release the array mutex for the time of the i/o: NOTE that
 +	this assumes that there is just one i/o-handler thread serving
 +	a single segment of slots! */
 +
 +	os_mutex_exit(array->mutex);
 +
 +	if (aio_slot->type == OS_FILE_WRITE && n_consecutive > 1) {
 +		/* Copy the buffers to the combined buffer */
 +		offs = 0;
 +
 +		for (ulint i = 0; i < n_consecutive; i++) {
 +
 +			ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
 +				  consecutive_ios[i]->len);
 +
 +			offs += consecutive_ios[i]->len;
 +		}
 +	}
 +
 +	srv_set_io_thread_op_info(global_segment, "doing file i/o");
 +
 +	/* Do the i/o with ordinary, synchronous i/o functions: */
 +	if (aio_slot->type == OS_FILE_WRITE) {
 +		ut_ad(!srv_read_only_mode);
 +		ret = os_file_write(
 +			aio_slot->name, aio_slot->file, combined_buf,
 +			aio_slot->offset, total_len);
 +
 +		DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28",
 +			os_has_said_disk_full = FALSE; ret = 0; errno = 28;);
 +
 +		if (!ret) {
 +			os_file_handle_error_cond_exit(aio_slot->name, "os_file_write_func", TRUE, FALSE);
 +		}
 +
 +	} else {
 +		ret = os_file_read(
 +			aio_slot->file, combined_buf,
 +			aio_slot->offset, total_len);
 +	}
 +
 +	srv_set_io_thread_op_info(global_segment, "file i/o done");
 +
 +	if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) {
 +		/* Copy the combined buffer to individual buffers */
 +		offs = 0;
 +
 +		for (ulint i = 0; i < n_consecutive; i++) {
 +
 +			ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
 +				  consecutive_ios[i]->len);
 +			offs += consecutive_ios[i]->len;
 +		}
 +	}
 +
 +	if (combined_buf2) {
 +		ut_free(combined_buf2);
 +	}
 +
 +	os_mutex_enter(array->mutex);
 +
 +	/* Mark the i/os done in slots */
 +
 +	for (ulint i = 0; i < n_consecutive; i++) {
 +		consecutive_ios[i]->io_already_done = TRUE;
 +	}
 +
 +	/* We return the messages for the first slot now, and if there were
 +	several slots, the messages will be returned with subsequent calls
 +	of this function.
 +	Those later calls go through the io_already_done branch above,
 +	which sets ret = TRUE without looking at the result of the
 +	merged i/o that completed the slot. */
 +
 +slot_io_done:
 +
 +	ut_a(aio_slot->reserved);
 +
 +	*message1 = aio_slot->message1;
 +	*message2 = aio_slot->message2;
 +
 +	*type = aio_slot->type;
 +	*space_id = aio_slot->space_id;
 +
 +	os_mutex_exit(array->mutex);
 +
 +	os_aio_array_free_slot(array, aio_slot);
 +
 +	return(ret);
 +
 +wait_for_io:
 +	srv_set_io_thread_op_info(global_segment, "resetting wait event");
 +
 +	/* We wait here until there again can be i/os in the segment
 +	of this thread */
 +
 +	os_event_reset(os_aio_segment_wait_events[global_segment]);
 +
 +	os_mutex_exit(array->mutex);
 +
 +recommended_sleep:
 +	srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
 +
 +	os_event_wait(os_aio_segment_wait_events[global_segment]);
 +
 +	goto restart;
 +}
 +
 +/**********************************************************************//**
 +Validates the consistency of an aio array.
+@return true if ok */
 +static
 +bool
 +os_aio_array_validate(
 +/*==================*/
 +	os_aio_array_t*	array)	/*!< in: aio wait array */
 +{
 +	/* Number of reserved slots found by walking the array; it has
 +	to agree with the counter kept in the array itself. */
 +	ulint	reserved_seen = 0;
 +
 +	os_mutex_enter(array->mutex);
 +
 +	ut_a(array->n_slots > 0);
 +	ut_a(array->n_segments > 0);
 +
 +	for (ulint pos = 0; pos < array->n_slots; ++pos) {
 +		os_aio_slot_t*	cur = os_aio_array_get_nth_slot(array, pos);
 +
 +		if (!cur->reserved) {
 +			continue;
 +		}
 +
 +		/* A reserved slot always describes a non-empty request. */
 +		++reserved_seen;
 +		ut_a(cur->len > 0);
 +	}
 +
 +	ut_a(array->n_reserved == reserved_seen);
 +
 +	os_mutex_exit(array->mutex);
 +
 +	return(true);
 +}
 +
 +/**********************************************************************//**
 +Validates the consistency of the aio system: the read array is always
 +checked, every other array only if it exists.
 +@return TRUE if ok */
 +UNIV_INTERN
 +ibool
 +os_aio_validate(void)
 +/*=================*/
 +{
 +	/* Arrays that may be NULL, in the order they are checked. */
 +	os_aio_array_t* const	optional_arrays[] = {
 +		os_aio_write_array,
 +		os_aio_ibuf_array,
 +		os_aio_log_array,
 +		os_aio_sync_array
 +	};
 +	const ulint	n_optional
 +		= sizeof(optional_arrays) / sizeof(optional_arrays[0]);
 +
 +	os_aio_array_validate(os_aio_read_array);
 +
 +	for (ulint k = 0; k < n_optional; ++k) {
 +		if (optional_arrays[k] != 0) {
 +			os_aio_array_validate(optional_arrays[k]);
 +		}
 +	}
 +
 +	return(TRUE);
 +}
 +
 +/**********************************************************************//**
 +Prints pending IO requests per segment of an aio array.
 +We probably don't need per segment statistics but they can help us
 +during development phase to see if the IO requests are being
 +distributed as expected.
*/
 +static
 +void
 +os_aio_print_segment_info(
 +/*======================*/
 +	FILE*		file,	/*!< in: file where to print */
 +	ulint*		n_seg,	/*!< in: pending IO array: one counter
 +				per segment of array */
 +	os_aio_array_t*	array)	/*!< in: array to process */
 +{
 +	ut_ad(array);
 +	ut_ad(n_seg);
 +	ut_ad(array->n_segments > 0);
 +
 +	/* With a single segment the per-segment list would only repeat
 +	the total that the caller has already printed. */
 +	if (array->n_segments == 1) {
 +		return;
 +	}
 +
 +	fprintf(file, " [");
 +	for (ulint i = 0; i < array->n_segments; i++) {
 +		if (i != 0) {
 +			fprintf(file, ", ");
 +		}
 +
 +		/* n_seg[] holds ulint values: print them with ULINTPF.
 +		"%lu" does not match ulint on platforms where ulint is
 +		wider than unsigned long. */
 +		fprintf(file, ULINTPF, n_seg[i]);
 +	}
 +	fprintf(file, "] ");
 +}
 +
 +/**********************************************************************//**
 +Prints info about the aio array: the number of reserved slots, followed
 +by the per-segment breakdown when the array has more than one segment.
 +The array mutex is held while the slots are counted and printed. */
 +UNIV_INTERN
 +void
 +os_aio_print_array(
 +/*==============*/
 +	FILE*		file,	/*!< in: file where to print */
 +	os_aio_array_t*	array)	/*!< in: aio array to print */
 +{
 +	ulint			n_reserved = 0;
 +	ulint			n_res_seg[SRV_MAX_N_IO_THREADS];
 +
 +	os_mutex_enter(array->mutex);
 +
 +	ut_a(array->n_slots > 0);
 +	ut_a(array->n_segments > 0);
 +
 +	memset(n_res_seg, 0x0, sizeof(n_res_seg));
 +
 +	for (ulint i = 0; i < array->n_slots; ++i) {
 +		os_aio_slot_t*	slot;
 +		ulint		seg_no;
 +
 +		slot = os_aio_array_get_nth_slot(array, i);
 +
 +		/* Slots are split evenly between the segments, so the
 +		slot position determines the segment it belongs to. */
 +		seg_no = (i * array->n_segments) / array->n_slots;
 +
 +		if (slot->reserved) {
 +			++n_reserved;
 +			++n_res_seg[seg_no];
 +
 +			ut_a(slot->len > 0);
 +		}
 +	}
 +
 +	ut_a(array->n_reserved == n_reserved);
 +
 +	fprintf(file, " %lu", (ulong) n_reserved);
 +
 +	os_aio_print_segment_info(file, n_res_seg, array);
 +
 +	os_mutex_exit(array->mutex);
 +}
 +
 +/**********************************************************************//**
 +Prints info of the aio arrays.
*/ +UNIV_INTERN +void +os_aio_print( +/*=========*/ + FILE* file) /*!< in: file where to print */ +{ + time_t current_time; + double time_elapsed; + double avg_bytes_read; + + for (ulint i = 0; i < srv_n_file_io_threads; ++i) { + fprintf(file, "I/O thread %lu state: %s (%s)", + (ulong) i, + srv_io_thread_op_info[i], + srv_io_thread_function[i]); + +#ifndef _WIN32 + if (!srv_use_native_aio + && os_aio_segment_wait_events[i]->is_set()) { + fprintf(file, " ev set"); + } +#endif /* _WIN32 */ + + fprintf(file, "\n"); + } + + fputs("Pending normal aio reads:", file); + + os_aio_print_array(file, os_aio_read_array); + + if (os_aio_write_array != 0) { + fputs(", aio writes:", file); + os_aio_print_array(file, os_aio_write_array); + } + + if (os_aio_ibuf_array != 0) { + fputs(",\n ibuf aio reads:", file); + os_aio_print_array(file, os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + fputs(", log i/o's:", file); + os_aio_print_array(file, os_aio_log_array); + } + + if (os_aio_sync_array != 0) { + fputs(", sync i/o's:", file); + os_aio_print_array(file, os_aio_sync_array); + } + + putc('\n', file); + current_time = ut_time(); + time_elapsed = 0.001 + difftime(current_time, os_last_printout); + + fprintf(file, + "Pending flushes (fsync) log: " ULINTPF + "; buffer pool: " ULINTPF "\n" + ULINTPF " OS file reads, " + ULINTPF " OS file writes, " + ULINTPF " OS fsyncs\n", + fil_n_pending_log_flushes, + fil_n_pending_tablespace_flushes, + os_n_file_reads, + os_n_file_writes, + os_n_fsyncs); + + const ulint n_reads = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_READS)); + const ulint n_writes = ulint(MONITOR_VALUE(MONITOR_OS_PENDING_WRITES)); + + if (n_reads != 0 || n_writes != 0) { + fprintf(file, + ULINTPF " pending reads, " ULINTPF " pending writes\n", + n_reads, n_writes); + } + + if (os_n_file_reads == os_n_file_reads_old) { + avg_bytes_read = 0.0; + } else { + avg_bytes_read = (double) os_bytes_read_since_printout + / (os_n_file_reads - os_n_file_reads_old); + } + + 
fprintf(file, + "%.2f reads/s, %lu avg bytes/read," + " %.2f writes/s, %.2f fsyncs/s\n", + (os_n_file_reads - os_n_file_reads_old) + / time_elapsed, + (ulong) avg_bytes_read, + (os_n_file_writes - os_n_file_writes_old) + / time_elapsed, + (os_n_fsyncs - os_n_fsyncs_old) + / time_elapsed); + + os_n_file_reads_old = os_n_file_reads; + os_n_file_writes_old = os_n_file_writes; + os_n_fsyncs_old = os_n_fsyncs; + os_bytes_read_since_printout = 0; + + os_last_printout = current_time; +} + +/**********************************************************************//** +Refreshes the statistics used to print per-second averages. */ +UNIV_INTERN +void +os_aio_refresh_stats(void) +/*======================*/ +{ + os_n_file_reads_old = os_n_file_reads; + os_n_file_writes_old = os_n_file_writes; + os_n_fsyncs_old = os_n_fsyncs; + os_bytes_read_since_printout = 0; + + os_last_printout = time(NULL); +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Checks that all slots in the system have been freed, that is, there are +no pending io operations. 
+@return TRUE if all free */ +UNIV_INTERN +ibool +os_aio_all_slots_free(void) +/*=======================*/ +{ + os_aio_array_t* array; + ulint n_res = 0; + + array = os_aio_read_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + if (!srv_read_only_mode) { + ut_a(os_aio_write_array == 0); + + array = os_aio_write_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + ut_a(os_aio_ibuf_array == 0); + + array = os_aio_ibuf_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + } + + ut_a(os_aio_log_array == 0); + + array = os_aio_log_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + array = os_aio_sync_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + if (n_res == 0) { + + return(TRUE); + } + + return(FALSE); +} +#endif /* UNIV_DEBUG */ + +#endif /* !UNIV_HOTBACKUP */ diff --cc support-files/mysql.server.sh index 9c4d8e35ec5,28f0c2f041b..34f3ca4af34 --- a/support-files/mysql.server.sh +++ b/support-files/mysql.server.sh @@@ -25,7 -25,14 +25,6 @@@ # Description: MariaDB is a very fast and reliable SQL database engine. 
### END INIT INFO - # If you install MariaDB on some other places than @prefix@, then you -# Prevent OpenSUSE's init scripts from calling systemd, so that -# both 'bootstrap' and 'start' are handled entirely within this -# script -SYSTEMD_NO_WRAP=1 - -# Prevent Debian's init scripts from calling systemctl -_SYSTEMCTL_SKIP_REDIRECT=true - # have to do one of the following things for this script to work: # # - Run this script from within the MariaDB installation directory @@@ -438,7 -452,7 +437,6 @@@ case "$mode" i *) # usage basename=`basename "$0"` - echo "Usage: $basename {start|stop|restart|reload|force-reload|status|configtest} [ MariaDB server options ]" - echo "Usage: $basename {start|stop|restart|reload|force-reload|status|configtest|bootstrap} [ MySQL server options ]" exit 1 ;; esac