[Maria-developers] bzr commit into MariaDB 5.1, with Maria 1.5:maria branch (knielsen:2717)
#At lp:maria 2717 knielsen@knielsen-hq.org 2009-08-03 [merge] Merge XtraDB 6 into MariaDB. modified: mysql-test/r/events_stress.result mysql-test/r/information_schema.result mysql-test/r/information_schema_all_engines.result mysql-test/r/innodb_bug36169.result mysql-test/r/innodb_xtradb_bug317074.result mysql-test/t/events_stress.test mysql-test/t/innodb-analyze.test mysql-test/t/innodb_bug36169.test mysql-test/t/innodb_bug36172.test mysql-test/t/innodb_xtradb_bug317074.test storage/xtradb/btr/btr0cur.c storage/xtradb/btr/btr0sea.c storage/xtradb/buf/buf0buddy.c storage/xtradb/buf/buf0buf.c storage/xtradb/buf/buf0flu.c storage/xtradb/buf/buf0lru.c storage/xtradb/buf/buf0rea.c storage/xtradb/dict/dict0boot.c storage/xtradb/dict/dict0crea.c storage/xtradb/dict/dict0dict.c storage/xtradb/dict/dict0load.c storage/xtradb/fil/fil0fil.c storage/xtradb/handler/ha_innodb.cc storage/xtradb/handler/i_s.cc storage/xtradb/handler/i_s.h storage/xtradb/handler/innodb_patch_info.h storage/xtradb/ibuf/ibuf0ibuf.c storage/xtradb/include/buf0buddy.h storage/xtradb/include/buf0buddy.ic storage/xtradb/include/buf0buf.h storage/xtradb/include/buf0buf.ic storage/xtradb/include/buf0flu.ic storage/xtradb/include/buf0lru.h storage/xtradb/include/dict0dict.h storage/xtradb/include/dict0dict.ic storage/xtradb/include/log0log.h storage/xtradb/include/rem0cmp.h storage/xtradb/include/rem0cmp.ic storage/xtradb/include/srv0srv.h storage/xtradb/include/sync0sync.h storage/xtradb/include/univ.i storage/xtradb/include/ut0auxconf.h storage/xtradb/log/log0log.c storage/xtradb/log/log0recv.c storage/xtradb/mtr/mtr0mtr.c storage/xtradb/os/os0file.c storage/xtradb/rem/rem0cmp.c storage/xtradb/row/row0mysql.c storage/xtradb/scripts/install_innodb_plugins.sql storage/xtradb/srv/srv0srv.c storage/xtradb/srv/srv0start.c storage/xtradb/sync/sync0sync.c storage/xtradb/ut/ut0ut.c === modified file 'mysql-test/r/events_stress.result' --- a/mysql-test/r/events_stress.result 2006-09-01 11:08:44 +0000 +++ b/mysql-test/r/events_stress.result 2009-08-03 20:09:53 +0000 @@ -32,6 +32,7 @@ USE events_conn1_test2; SELECT COUNT(*) FROM INFORMATION_SCHEMA.EVENTS WHERE EVENT_SCHEMA='events_conn1_test2'; COUNT(*) 50 +SET @old_event_scheduler=@@event_scheduler; SET GLOBAL event_scheduler=on; DROP DATABASE events_conn1_test2; SET GLOBAL event_scheduler=off; @@ -63,3 +64,4 @@ DROP TABLE fill_it1; DROP TABLE fill_it2; DROP TABLE fill_it3; DROP DATABASE events_test; +SET GLOBAL event_scheduler=@old_event_scheduler; === modified file 'mysql-test/r/information_schema.result' --- a/mysql-test/r/information_schema.result 2009-06-11 17:49:51 +0000 +++ b/mysql-test/r/information_schema.result 2009-08-03 20:09:53 +0000 @@ -61,9 +61,11 @@ INNODB_CMP INNODB_CMPMEM INNODB_CMPMEM_RESET INNODB_CMP_RESET +INNODB_INDEX_STATS INNODB_LOCKS INNODB_LOCK_WAITS INNODB_RSEG +INNODB_TABLE_STATS INNODB_TRX KEY_COLUMN_USAGE PARTITIONS @@ -863,6 +865,8 @@ TABLE_CONSTRAINTS TABLE_NAME select TABLE_PRIVILEGES TABLE_NAME select VIEWS TABLE_NAME select INNODB_BUFFER_POOL_PAGES_INDEX table_name select +INNODB_INDEX_STATS table_name select +INNODB_TABLE_STATS table_name select delete from mysql.user where user='mysqltest_4'; delete from mysql.db where user='mysqltest_4'; flush privileges; === modified file 'mysql-test/r/information_schema_all_engines.result' --- a/mysql-test/r/information_schema_all_engines.result 2009-06-11 12:53:26 +0000 +++ b/mysql-test/r/information_schema_all_engines.result 2009-08-03 20:09:53 +0000 @@ -35,13 +35,15 @@ INNODB_CMP INNODB_RSEG XTRADB_ENHANCEMENTS INNODB_BUFFER_POOL_PAGES_INDEX -INNODB_BUFFER_POOL_PAGES_BLOB +INNODB_INDEX_STATS INNODB_TRX INNODB_CMP_RESET INNODB_LOCK_WAITS INNODB_CMPMEM_RESET INNODB_LOCKS INNODB_CMPMEM +INNODB_TABLE_STATS +INNODB_BUFFER_POOL_PAGES_BLOB SELECT t.table_name, c1.column_name FROM information_schema.tables t INNER JOIN @@ -91,13 +93,15 @@ INNODB_CMP page_size INNODB_RSEG rseg_id XTRADB_ENHANCEMENTS name INNODB_BUFFER_POOL_PAGES_INDEX schema_name -INNODB_BUFFER_POOL_PAGES_BLOB space_id +INNODB_INDEX_STATS table_name INNODB_TRX trx_id INNODB_CMP_RESET page_size INNODB_LOCK_WAITS requesting_trx_id INNODB_CMPMEM_RESET page_size INNODB_LOCKS lock_id INNODB_CMPMEM page_size +INNODB_TABLE_STATS table_name +INNODB_BUFFER_POOL_PAGES_BLOB space_id SELECT t.table_name, c1.column_name FROM information_schema.tables t INNER JOIN @@ -147,13 +151,15 @@ INNODB_CMP page_size INNODB_RSEG rseg_id XTRADB_ENHANCEMENTS name INNODB_BUFFER_POOL_PAGES_INDEX schema_name -INNODB_BUFFER_POOL_PAGES_BLOB space_id +INNODB_INDEX_STATS table_name INNODB_TRX trx_id INNODB_CMP_RESET page_size INNODB_LOCK_WAITS requesting_trx_id INNODB_CMPMEM_RESET page_size INNODB_LOCKS lock_id INNODB_CMPMEM page_size +INNODB_TABLE_STATS table_name +INNODB_BUFFER_POOL_PAGES_BLOB space_id select 1 as f1 from information_schema.tables where "CHARACTER_SETS"= (select cast(table_name as char) from information_schema.tables order by table_name limit 1) limit 1; @@ -192,9 +198,11 @@ INNODB_CMP information_schema.INNODB_CMP INNODB_CMPMEM information_schema.INNODB_CMPMEM 1 INNODB_CMPMEM_RESET information_schema.INNODB_CMPMEM_RESET 1 INNODB_CMP_RESET information_schema.INNODB_CMP_RESET 1 +INNODB_INDEX_STATS information_schema.INNODB_INDEX_STATS 1 INNODB_LOCKS information_schema.INNODB_LOCKS 1 INNODB_LOCK_WAITS information_schema.INNODB_LOCK_WAITS 1 INNODB_RSEG information_schema.INNODB_RSEG 1 +INNODB_TABLE_STATS information_schema.INNODB_TABLE_STATS 1 INNODB_TRX information_schema.INNODB_TRX 1 KEY_COLUMN_USAGE information_schema.KEY_COLUMN_USAGE 1 PARTITIONS information_schema.PARTITIONS 1 @@ -254,13 +262,15 @@ Database: information_schema | INNODB_RSEG | | XTRADB_ENHANCEMENTS | | INNODB_BUFFER_POOL_PAGES_INDEX | -| INNODB_BUFFER_POOL_PAGES_BLOB | +| INNODB_INDEX_STATS | | INNODB_TRX | | INNODB_CMP_RESET | | INNODB_LOCK_WAITS | | INNODB_CMPMEM_RESET | | INNODB_LOCKS | | INNODB_CMPMEM | +| INNODB_TABLE_STATS | +| INNODB_BUFFER_POOL_PAGES_BLOB | +---------------------------------------+ Database: INFORMATION_SCHEMA +---------------------------------------+ @@ -300,13 +310,15 @@ Database: INFORMATION_SCHEMA | INNODB_RSEG | | XTRADB_ENHANCEMENTS | | INNODB_BUFFER_POOL_PAGES_INDEX | -| INNODB_BUFFER_POOL_PAGES_BLOB | +| INNODB_INDEX_STATS | | INNODB_TRX | | INNODB_CMP_RESET | | INNODB_LOCK_WAITS | | INNODB_CMPMEM_RESET | | INNODB_LOCKS | | INNODB_CMPMEM | +| INNODB_TABLE_STATS | +| INNODB_BUFFER_POOL_PAGES_BLOB | +---------------------------------------+ Wildcard: inf_rmation_schema +--------------------+ @@ -316,5 +328,5 @@ Wildcard: inf_rmation_schema +--------------------+ SELECT table_schema, count(*) FROM information_schema.TABLES WHERE table_schema IN ('mysql', 'INFORMATION_SCHEMA', 'test', 'mysqltest') AND table_name<>'ndb_binlog_index' AND table_name<>'ndb_apply_status' GROUP BY TABLE_SCHEMA; table_schema count(*) -information_schema 41 +information_schema 43 mysql 22 === modified file 'mysql-test/r/innodb_bug36169.result' --- a/mysql-test/r/innodb_bug36169.result 2009-06-11 12:53:26 +0000 +++ b/mysql-test/r/innodb_bug36169.result 2009-08-03 20:09:53 +0000 @@ -1,5 +1,5 @@ -SET @save_innodb_file_format=@@global.innodb_file_format; -SET @save_innodb_file_format_check=@@global.innodb_file_format_check; -SET @save_innodb_file_per_table=@@global.innodb_file_per_table; +set @old_innodb_file_per_table=@@innodb_file_per_table; +set @old_innodb_file_format=@@innodb_file_format; +set @old_innodb_file_format_check=@@innodb_file_format_check; SET GLOBAL innodb_file_format='Barracuda'; SET GLOBAL innodb_file_per_table=ON; === modified file 'mysql-test/r/innodb_xtradb_bug317074.result' --- a/mysql-test/r/innodb_xtradb_bug317074.result 2009-06-11 12:53:26 +0000 +++ b/mysql-test/r/innodb_xtradb_bug317074.result 2009-08-03 20:09:53 +0000 @@ -1,5 +1,5 @@ -SET @save_innodb_file_format=@@global.innodb_file_format; -SET @save_innodb_file_format_check=@@global.innodb_file_format_check; -SET @save_innodb_file_per_table=@@global.innodb_file_per_table; +SET @old_innodb_file_format=@@innodb_file_format; +SET @old_innodb_file_per_table=@@innodb_file_per_table; +SET @old_innodb_file_format_check=@@innodb_file_format_check; SET GLOBAL innodb_file_format='Barracuda'; SET GLOBAL innodb_file_per_table=ON; === modified file 'mysql-test/t/events_stress.test' --- a/mysql-test/t/events_stress.test 2007-05-26 14:36:38 +0000 +++ b/mysql-test/t/events_stress.test 2009-08-03 20:09:53 +0000 @@ -61,6 +61,7 @@ while ($1) } --enable_query_log SELECT COUNT(*) FROM INFORMATION_SCHEMA.EVENTS WHERE EVENT_SCHEMA='events_conn1_test2'; +SET @old_event_scheduler=@@event_scheduler; SET GLOBAL event_scheduler=on; --sleep 2.5 DROP DATABASE events_conn1_test2; @@ -135,3 +136,6 @@ DROP USER event_user3@localhost; # DROP DATABASE events_test; + +# Cleanup +SET GLOBAL event_scheduler=@old_event_scheduler; === modified file 'mysql-test/t/innodb-analyze.test' --- a/mysql-test/t/innodb-analyze.test 2009-06-09 15:08:46 +0000 +++ b/mysql-test/t/innodb-analyze.test 2009-08-03 20:09:53 +0000 @@ -11,7 +11,7 @@ -- disable_result_log -- enable_warnings -SET @save_innodb_stats_sample_pages=@@innodb_stats_sample_pages; +SET @old_innodb_stats_sample_pages=@@innodb_stats_sample_pages; SET GLOBAL innodb_stats_sample_pages=0; # check that the value has been adjusted to 1 @@ -61,5 +61,5 @@ ANALYZE TABLE innodb_analyze; SET GLOBAL innodb_stats_sample_pages=16; ANALYZE TABLE innodb_analyze; -SET GLOBAL innodb_stats_sample_pages=@save_innodb_stats_sample_pages; DROP TABLE innodb_analyze; +SET GLOBAL innodb_stats_sample_pages=@old_innodb_stats_sample_pages; === modified file 'mysql-test/t/innodb_bug36169.test' --- a/mysql-test/t/innodb_bug36169.test 2009-06-11 12:53:26 +0000 +++ b/mysql-test/t/innodb_bug36169.test 2009-08-03 20:09:53 +0000 @@ -4,10 +4,10 @@ # -- source include/have_innodb.inc +set @old_innodb_file_per_table=@@innodb_file_per_table; +set @old_innodb_file_format=@@innodb_file_format; +set @old_innodb_file_format_check=@@innodb_file_format_check; -SET @save_innodb_file_format=@@global.innodb_file_format; -SET @save_innodb_file_format_check=@@global.innodb_file_format_check; -SET @save_innodb_file_per_table=@@global.innodb_file_per_table; SET GLOBAL innodb_file_format='Barracuda'; SET GLOBAL innodb_file_per_table=ON; @@ -1148,10 +1148,6 @@ KEY `idx44` (`col176`(100),`col42`,`col7 KEY `idx45` (`col2`(27),`col27`(116)) )engine=innodb ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1; -SET GLOBAL innodb_file_format=@save_innodb_file_format; -SET GLOBAL innodb_file_format_check=@save_innodb_file_format_check; -SET GLOBAL innodb_file_per_table=@save_innodb_file_per_table; - DROP TABLE IF EXISTS table0; DROP TABLE IF EXISTS table1; DROP TABLE IF EXISTS table2; @@ -1160,3 +1156,7 @@ DROP TABLE IF EXISTS table4; DROP TABLE IF EXISTS table5; DROP TABLE IF EXISTS table6; +set global innodb_file_per_table=@old_innodb_file_per_table; +set global innodb_file_format=@old_innodb_file_format; +set global innodb_file_format_check=@old_innodb_file_format_check; + === modified file 'mysql-test/t/innodb_bug36172.test' --- a/mysql-test/t/innodb_bug36172.test 2009-06-11 12:53:26 +0000 +++ b/mysql-test/t/innodb_bug36172.test 2009-08-03 20:09:53 +0000 @@ -13,10 +13,10 @@ SET storage_engine=InnoDB; -- disable_query_log -- disable_result_log +set @old_innodb_file_per_table=@@innodb_file_per_table; +set @old_innodb_file_format=@@innodb_file_format; +set @old_innodb_file_format_check=@@innodb_file_format_check; -SET @save_innodb_file_format=@@global.innodb_file_format; -SET @save_innodb_file_format_check=@@global.innodb_file_format_check; -SET @save_innodb_file_per_table=@@global.innodb_file_per_table; SET GLOBAL innodb_file_format='Barracuda'; SET GLOBAL innodb_file_per_table=on; @@ -27,7 +27,8 @@ CHECK TABLE table0 EXTENDED; INSERT IGNORE INTO `table0` SET `col19` = '19940127002709', `col20` = 2383927.9055146948, `col21` = 4293243420.5621204000, `col22` = '20511211123705', `col23` = 4289899778.6573381000, `col24` = 4293449279.0540481000, `col25` = 'emphysemic', `col26` = 'dentally', `col27` = '2347406', `col28` = 'eruct', `col30` = 1222, `col31` = 4294372994.9941406000, `col32` = 4291385574.1173744000, `col33` = 'borrowing\'s', `col34` = 'septics', `col35` = 'ratter\'s', `col36` = 'Kaye', `col37` = 'Florentia', `col38` = 'allium', `col39` = 'barkeep', `col40` = '19510407003441', `col41` = 4293559200.4215522000, `col42` = 22482, `col43` = 'decussate', `col44` = 'Brom\'s', `col45` = 'violated', `col46` = 4925506.4635456400, `col47` = 930549, `col48` = '51296066', `col49` = 'voluminously', `col50` = '29306676', `col51` = -88, `col52` = -2153690, `col53` = 4290250202.1464887000, `col54` = 'expropriation', `col55` = 'Aberdeen\'s', `col56` = 20343, `col58` = '19640415171532', `col59` = 'extern', `col60` = 'Ubana', `col61` = 4290487961.8539081000, `col62` = '2147', `col63` = -24271, `col64` = '20750801194548', `col65` = 'Cunaxa\'s', `col66` = 'pasticcio', `col67` = 2795817, `col68` = 'Indore\'s', `col70` = 6864127, `col71` = '1817832', `col72` = '20540506114211', `col73` = '20040101012300', `col74` = 'rationalized', `col75` = '45522', `col76` = 'indene', `col77` = -6964559, `col78` = 4247535.5266884370, `col79` = '20720416124357', `col80` = '2143', `col81` = 4292060102.4466386000, `col82` = 'striving', `col83` = 'boneblack\'s', `col84` = 'redolent', `col85` = 6489697.9009369183, `col86` = 4287473465.9731131000, `col87` = 7726015, `col88` = 'perplexed', `col89` = '17153791', `col90` = 5478587.1108127078, `col91` = 4287091404.7004304000, `col92` = 'Boulez\'s', `col93` = '2931278'; CHECK TABLE table0 EXTENDED; -SET GLOBAL innodb_file_format=@save_innodb_file_format; -SET GLOBAL innodb_file_format_check=@save_innodb_file_format_check; -SET GLOBAL innodb_file_per_table=@save_innodb_file_per_table; DROP TABLE table0; +set global innodb_file_per_table=@old_innodb_file_per_table; +set global innodb_file_format=@old_innodb_file_format; +set global innodb_file_format_check=@old_innodb_file_format_check; + === modified file 'mysql-test/t/innodb_xtradb_bug317074.test' --- a/mysql-test/t/innodb_xtradb_bug317074.test 2009-06-11 12:53:26 +0000 +++ b/mysql-test/t/innodb_xtradb_bug317074.test 2009-08-03 20:09:53 +0000 @@ -1,8 +1,8 @@ -- source include/have_innodb.inc -SET @save_innodb_file_format=@@global.innodb_file_format; -SET @save_innodb_file_format_check=@@global.innodb_file_format_check; -SET @save_innodb_file_per_table=@@global.innodb_file_per_table; +SET @old_innodb_file_format=@@innodb_file_format; +SET @old_innodb_file_per_table=@@innodb_file_per_table; +SET @old_innodb_file_format_check=@@innodb_file_format_check; SET GLOBAL innodb_file_format='Barracuda'; SET GLOBAL innodb_file_per_table=ON; @@ -38,8 +38,7 @@ DROP PROCEDURE insert_many; # The bug is hangup at the following statement ALTER TABLE test1 ENGINE=MyISAM; -SET GLOBAL innodb_file_format=@save_innodb_file_format; -SET GLOBAL innodb_file_format_check=@save_innodb_file_format_check; -SET GLOBAL innodb_file_per_table=@save_innodb_file_per_table; - DROP TABLE test1; +SET GLOBAL innodb_file_format=@old_innodb_file_format; +SET GLOBAL innodb_file_per_table=@old_innodb_file_per_table; +SET GLOBAL innodb_file_format_check=@old_innodb_file_format_check; === modified file 'storage/xtradb/btr/btr0cur.c' --- a/storage/xtradb/btr/btr0cur.c 2009-05-04 02:45:47 +0000 +++ b/storage/xtradb/btr/btr0cur.c 2009-06-25 01:43:25 +0000 @@ -3202,7 +3202,9 @@ btr_estimate_number_of_different_key_val ulint n_cols; ulint matched_fields; ulint matched_bytes; + ib_int64_t n_recs = 0; ib_int64_t* n_diff; + ib_int64_t* n_not_nulls; ullint n_sample_pages; /* number of pages to sample */ ulint not_empty_flag = 0; ulint total_external_size = 0; @@ -3215,6 +3217,7 @@ btr_estimate_number_of_different_key_val ulint offsets_next_rec_[REC_OFFS_NORMAL_SIZE]; ulint* offsets_rec = offsets_rec_; ulint* offsets_next_rec= offsets_next_rec_; + ulint stats_method = srv_stats_method; rec_offs_init(offsets_rec_); rec_offs_init(offsets_next_rec_); @@ -3222,6 +3225,10 @@ btr_estimate_number_of_different_key_val n_diff = mem_zalloc((n_cols + 1) * sizeof(ib_int64_t)); + if (stats_method == SRV_STATS_METHOD_IGNORE_NULLS) { + n_not_nulls = mem_zalloc((n_cols + 1) * sizeof(ib_int64_t)); + } + /* It makes no sense to test more pages than are contained in the index, thus we lower the number if it is too high */ if (srv_stats_sample_pages > index->stat_index_size) { @@ -3260,6 +3267,20 @@ btr_estimate_number_of_different_key_val } while (rec != supremum) { + /* count recs */ + if (stats_method == SRV_STATS_METHOD_IGNORE_NULLS) { + n_recs++; + for (j = 0; j <= n_cols; j++) { + ulint f_len; + rec_get_nth_field(rec, offsets_rec, + j, &f_len); + if (f_len == UNIV_SQL_NULL) + break; + + n_not_nulls[j]++; + } + } + rec_t* next_rec = page_rec_get_next(rec); if (next_rec == supremum) { break; @@ -3274,7 +3295,7 @@ btr_estimate_number_of_different_key_val cmp_rec_rec_with_match(rec, next_rec, offsets_rec, offsets_next_rec, index, &matched_fields, - &matched_bytes); + &matched_bytes, srv_stats_method); for (j = matched_fields + 1; j <= n_cols; j++) { /* We add one if this index record has @@ -3359,9 +3380,21 @@ btr_estimate_number_of_different_key_val } index->stat_n_diff_key_vals[j] += add_on; + + /* revision for 'nulls_ignored' */ + if (stats_method == SRV_STATS_METHOD_IGNORE_NULLS) { + if (!n_not_nulls[j]) + n_not_nulls[j] = 1; + index->stat_n_diff_key_vals[j] = + index->stat_n_diff_key_vals[j] * n_recs + / n_not_nulls[j]; + } } mem_free(n_diff); + if (stats_method == SRV_STATS_METHOD_IGNORE_NULLS) { + mem_free(n_not_nulls); + } if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); } @@ -3733,7 +3766,8 @@ btr_blob_free( mtr_commit(mtr); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); mutex_enter(&block->mutex); /* Only free the block if it is still allocated to @@ -3744,17 +3778,22 @@ btr_blob_free( && buf_block_get_space(block) == space && buf_block_get_page_no(block) == page_no) { - if (buf_LRU_free_block(&block->page, all, NULL) + if (buf_LRU_free_block(&block->page, all, NULL, TRUE) != BUF_LRU_FREED - && all && block->page.zip.data) { + && all && block->page.zip.data + /* Now, buf_LRU_free_block() may release mutex temporarily */ + && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE + && buf_block_get_space(block) == space + && buf_block_get_page_no(block) == page_no) { /* Attempt to deallocate the uncompressed page if the whole block cannot be deallocted. */ - buf_LRU_free_block(&block->page, FALSE, NULL); + buf_LRU_free_block(&block->page, FALSE, NULL, TRUE); } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); mutex_exit(&block->mutex); } === modified file 'storage/xtradb/btr/btr0sea.c' --- a/storage/xtradb/btr/btr0sea.c 2009-05-04 02:45:47 +0000 +++ b/storage/xtradb/btr/btr0sea.c 2009-06-25 01:43:25 +0000 @@ -1731,7 +1731,8 @@ btr_search_validate(void) rec_offs_init(offsets_); rw_lock_x_lock(&btr_search_latch); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_x_lock(&page_hash_latch); cell_count = hash_get_n_cells(btr_search_sys->hash_index); @@ -1739,11 +1740,13 @@ btr_search_validate(void) /* We release btr_search_latch every once in a while to give other queries a chance to run. */ if ((i != 0) && ((i % chunk_size) == 0)) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_x_unlock(&page_hash_latch); rw_lock_x_unlock(&btr_search_latch); os_thread_yield(); rw_lock_x_lock(&btr_search_latch); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_x_lock(&page_hash_latch); } node = hash_get_nth_cell(btr_search_sys->hash_index, i)->node; @@ -1850,11 +1853,13 @@ btr_search_validate(void) /* We release btr_search_latch every once in a while to give other queries a chance to run. */ if (i != 0) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_x_unlock(&page_hash_latch); rw_lock_x_unlock(&btr_search_latch); os_thread_yield(); rw_lock_x_lock(&btr_search_latch); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_x_lock(&page_hash_latch); } if (!ha_validate(btr_search_sys->hash_index, i, end_index)) { @@ -1862,7 +1867,8 @@ btr_search_validate(void) } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_x_unlock(&page_hash_latch); rw_lock_x_unlock(&btr_search_latch); if (UNIV_LIKELY_NULL(heap)) { mem_heap_free(heap); === modified file 'storage/xtradb/buf/buf0buddy.c' --- a/storage/xtradb/buf/buf0buddy.c 2009-05-04 04:32:30 +0000 +++ b/storage/xtradb/buf/buf0buddy.c 2009-06-25 01:43:25 +0000 @@ -82,7 +82,7 @@ buf_buddy_add_to_free( #endif /* UNIV_DEBUG_VALGRIND */ ut_ad(buf_pool->zip_free[i].start != bpage); - UT_LIST_ADD_FIRST(list, buf_pool->zip_free[i], bpage); + UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_free[i], bpage); #ifdef UNIV_DEBUG_VALGRIND if (b) UNIV_MEM_FREE(b, BUF_BUDDY_LOW << i); @@ -100,8 +100,8 @@ buf_buddy_remove_from_free( ulint i) /* in: index of buf_pool->zip_free[] */ { #ifdef UNIV_DEBUG_VALGRIND - buf_page_t* prev = UT_LIST_GET_PREV(list, bpage); - buf_page_t* next = UT_LIST_GET_NEXT(list, bpage); + buf_page_t* prev = UT_LIST_GET_PREV(zip_list, bpage); + buf_page_t* next = UT_LIST_GET_NEXT(zip_list, bpage); if (prev) UNIV_MEM_VALID(prev, BUF_BUDDY_LOW << i); if (next) UNIV_MEM_VALID(next, BUF_BUDDY_LOW << i); @@ -111,7 +111,7 @@ buf_buddy_remove_from_free( #endif /* UNIV_DEBUG_VALGRIND */ ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE); - UT_LIST_REMOVE(list, buf_pool->zip_free[i], bpage); + UT_LIST_REMOVE(zip_list, buf_pool->zip_free[i], bpage); #ifdef UNIV_DEBUG_VALGRIND if (prev) UNIV_MEM_FREE(prev, BUF_BUDDY_LOW << i); @@ -131,12 +131,13 @@ buf_buddy_alloc_zip( { buf_page_t* bpage; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&zip_free_mutex)); ut_a(i < BUF_BUDDY_SIZES); #if defined UNIV_DEBUG && !defined UNIV_DEBUG_VALGRIND /* Valgrind would complain about accessing free memory. */ - UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i]); + UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i]); #endif /* UNIV_DEBUG && !UNIV_DEBUG_VALGRIND */ bpage = UT_LIST_GET_LAST(buf_pool->zip_free[i]); @@ -177,16 +178,19 @@ static void buf_buddy_block_free( /*=================*/ - void* buf) /* in: buffer frame to deallocate */ + void* buf, /* in: buffer frame to deallocate */ + ibool have_page_hash_mutex) { const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf); buf_page_t* bpage; buf_block_t* block; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(!mutex_own(&buf_pool_zip_mutex)); ut_a(!ut_align_offset(buf, UNIV_PAGE_SIZE)); + mutex_enter(&zip_hash_mutex); + HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage, ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY && bpage->in_zip_hash && !bpage->in_page_hash), @@ -198,12 +202,14 @@ buf_buddy_block_free( ut_d(bpage->in_zip_hash = FALSE); HASH_DELETE(buf_page_t, hash, buf_pool->zip_hash, fold, bpage); + mutex_exit(&zip_hash_mutex); + ut_d(memset(buf, 0, UNIV_PAGE_SIZE)); UNIV_MEM_INVALID(buf, UNIV_PAGE_SIZE); block = (buf_block_t*) bpage; mutex_enter(&block->mutex); - buf_LRU_block_free_non_file_page(block); + buf_LRU_block_free_non_file_page(block, have_page_hash_mutex); mutex_exit(&block->mutex); ut_ad(buf_buddy_n_frames > 0); @@ -219,7 +225,7 @@ buf_buddy_block_register( buf_block_t* block) /* in: buffer frame to allocate */ { const ulint fold = BUF_POOL_ZIP_FOLD(block); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(!mutex_own(&buf_pool_zip_mutex)); buf_block_set_state(block, BUF_BLOCK_MEMORY); @@ -230,7 +236,10 @@ buf_buddy_block_register( ut_ad(!block->page.in_page_hash); ut_ad(!block->page.in_zip_hash); ut_d(block->page.in_zip_hash = TRUE); + + mutex_enter(&zip_hash_mutex); HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page); + mutex_exit(&zip_hash_mutex); ut_d(buf_buddy_n_frames++); } @@ -264,7 +273,7 @@ buf_buddy_alloc_from( bpage->state = BUF_BLOCK_ZIP_FREE; #if defined UNIV_DEBUG && !defined UNIV_DEBUG_VALGRIND /* Valgrind would complain about accessing free memory. */ - UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[j]); + UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[j]); #endif /* UNIV_DEBUG && !UNIV_DEBUG_VALGRIND */ buf_buddy_add_to_free(bpage, j); } @@ -284,24 +293,28 @@ buf_buddy_alloc_low( possibly NULL if lru==NULL */ ulint i, /* in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ - ibool* lru) /* in: pointer to a variable that will be assigned + ibool* lru, /* in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list and buf_pool_mutex was temporarily released, or NULL if the LRU list should not be used */ + ibool have_page_hash_mutex) { buf_block_t* block; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(!mutex_own(&buf_pool_zip_mutex)); if (i < BUF_BUDDY_SIZES) { /* Try to allocate from the buddy system. */ + mutex_enter(&zip_free_mutex); block = buf_buddy_alloc_zip(i); if (block) { goto func_exit; } + + mutex_exit(&zip_free_mutex); } /* Try allocating from the buf_pool->free list. */ @@ -318,18 +331,29 @@ buf_buddy_alloc_low( } /* Try replacing an uncompressed page in the buffer pool. */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + if (have_page_hash_mutex) { + rw_lock_x_unlock(&page_hash_latch); + } block = buf_LRU_get_free_block(0); *lru = TRUE; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + if (have_page_hash_mutex) { + rw_lock_x_lock(&page_hash_latch); + } alloc_big: buf_buddy_block_register(block); + mutex_enter(&zip_free_mutex); block = buf_buddy_alloc_from(block->frame, i, BUF_BUDDY_SIZES); func_exit: buf_buddy_stat[i].used++; + mutex_exit(&zip_free_mutex); + return(block); } @@ -345,7 +369,10 @@ buf_buddy_relocate_block( { buf_page_t* b; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX)); +#endif switch (buf_page_get_state(bpage)) { case BUF_BLOCK_ZIP_FREE: @@ -354,7 +381,7 @@ buf_buddy_relocate_block( case BUF_BLOCK_FILE_PAGE: case BUF_BLOCK_MEMORY: case BUF_BLOCK_REMOVE_HASH: - ut_error; + /* ut_error; */ /* optimistic */ case BUF_BLOCK_ZIP_DIRTY: /* Cannot relocate dirty pages. */ return(FALSE); @@ -364,9 +391,17 @@ buf_buddy_relocate_block( } mutex_enter(&buf_pool_zip_mutex); + mutex_enter(&zip_free_mutex); if (!buf_page_can_relocate(bpage)) { mutex_exit(&buf_pool_zip_mutex); + mutex_exit(&zip_free_mutex); + return(FALSE); + } + + if (bpage != buf_page_hash_get(bpage->space, bpage->offset)) { + mutex_exit(&buf_pool_zip_mutex); + mutex_exit(&zip_free_mutex); return(FALSE); } @@ -374,16 +409,19 @@ buf_buddy_relocate_block( ut_d(bpage->state = BUF_BLOCK_ZIP_FREE); /* relocate buf_pool->zip_clean */ - b = UT_LIST_GET_PREV(list, dpage); - UT_LIST_REMOVE(list, buf_pool->zip_clean, dpage); + mutex_enter(&flush_list_mutex); + b = UT_LIST_GET_PREV(zip_list, dpage); + UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, dpage); if (b) { - UT_LIST_INSERT_AFTER(list, buf_pool->zip_clean, b, dpage); + UT_LIST_INSERT_AFTER(zip_list, buf_pool->zip_clean, b, dpage); } else { - UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, dpage); + UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_clean, dpage); } + mutex_exit(&flush_list_mutex); mutex_exit(&buf_pool_zip_mutex); + mutex_exit(&zip_free_mutex); return(TRUE); } @@ -396,13 +434,15 @@ buf_buddy_relocate( /* out: TRUE if relocated */ void* src, /* in: block to relocate */ void* dst, /* in: free block to relocate to */ - ulint i) /* in: index of buf_pool->zip_free[] */ + ulint i, /* in: index of buf_pool->zip_free[] */ + ibool have_page_hash_mutex) { buf_page_t* bpage; const ulint size = BUF_BUDDY_LOW << i; ullint usec = ut_time_us(NULL); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&zip_free_mutex)); ut_ad(!mutex_own(&buf_pool_zip_mutex)); ut_ad(!ut_align_offset(src, size)); ut_ad(!ut_align_offset(dst, size)); @@ -421,9 +461,16 @@ buf_buddy_relocate( actually is a properly initialized buf_page_t object. */ if (size >= PAGE_ZIP_MIN_SIZE) { + if (!have_page_hash_mutex) + mutex_exit(&zip_free_mutex); + /* This is a compressed page. */ mutex_t* mutex; + if (!have_page_hash_mutex) { + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); + } /* The src block may be split into smaller blocks, some of which may be free. Thus, the mach_read_from_4() calls below may attempt to read @@ -444,6 +491,11 @@ buf_buddy_relocate( added to buf_pool->page_hash yet. Obviously, it cannot be relocated. */ + if (!have_page_hash_mutex) { + mutex_enter(&zip_free_mutex); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + } return(FALSE); } @@ -453,16 +505,32 @@ buf_buddy_relocate( For the sake of simplicity, give up. */ ut_ad(page_zip_get_size(&bpage->zip) < size); + if (!have_page_hash_mutex) { + mutex_enter(&zip_free_mutex); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + } return(FALSE); } + /* To keep latch order */ + if (have_page_hash_mutex) + mutex_exit(&zip_free_mutex); + /* The block must have been allocated, but it may contain uninitialized data. */ UNIV_MEM_ASSERT_W(src, size); mutex = buf_page_get_mutex(bpage); +retry_lock: mutex_enter(mutex); + if (mutex != buf_page_get_mutex(bpage)) { + mutex_exit(mutex); + mutex = buf_page_get_mutex(bpage); + goto retry_lock; + } + mutex_enter(&zip_free_mutex); if (buf_page_can_relocate(bpage)) { /* Relocate the compressed page. */ @@ -479,17 +547,48 @@ success: buddy_stat->relocated_usec += ut_time_us(NULL) - usec; } + + if (!have_page_hash_mutex) { + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + } return(TRUE); } + if (!have_page_hash_mutex) { + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + } + mutex_exit(mutex); } else if (i == buf_buddy_get_slot(sizeof(buf_page_t))) { /* This must be a buf_page_t object. */ UNIV_MEM_ASSERT_RW(src, size); + + mutex_exit(&zip_free_mutex); + + if (!have_page_hash_mutex) { + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); + } + if (buf_buddy_relocate_block(src, dst)) { + mutex_enter(&zip_free_mutex); + + if (!have_page_hash_mutex) { + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + } goto success; } + + mutex_enter(&zip_free_mutex); + + if (!have_page_hash_mutex) { + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + } } return(FALSE); @@ -503,12 +602,14 @@ buf_buddy_free_low( /*===============*/ void* buf, /* in: block to be freed, must not be pointed to by the buffer pool */ - ulint i) /* in: index of buf_pool->zip_free[] */ + ulint i, /* in: index of buf_pool->zip_free[] */ + ibool have_page_hash_mutex) { buf_page_t* bpage; buf_page_t* buddy; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&zip_free_mutex)); ut_ad(!mutex_own(&buf_pool_zip_mutex)); ut_ad(i <= BUF_BUDDY_SIZES); ut_ad(buf_buddy_stat[i].used > 0); @@ -519,7 +620,9 @@ recombine: ut_d(((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE); if (i == BUF_BUDDY_SIZES) { - buf_buddy_block_free(buf); + mutex_exit(&zip_free_mutex); + buf_buddy_block_free(buf, have_page_hash_mutex); + mutex_enter(&zip_free_mutex); return; } @@ -564,7 +667,7 @@ buddy_free2: ut_a(bpage != buf); { - buf_page_t* next = UT_LIST_GET_NEXT(list, bpage); + buf_page_t* next = UT_LIST_GET_NEXT(zip_list, bpage); UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i); bpage = next; } @@ -573,11 +676,11 @@ buddy_free2: #ifndef UNIV_DEBUG_VALGRIND buddy_nonfree: /* Valgrind would complain about accessing free memory. */ - ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->zip_free[i])); + ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i])); #endif /* UNIV_DEBUG_VALGRIND */ /* The buddy is not free. Is there a free block of this size? */ - bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); + bpage = UT_LIST_GET_LAST(buf_pool->zip_free[i]); if (bpage) { /* Remove the block from the free list, because a successful @@ -587,7 +690,7 @@ buddy_nonfree: buf_buddy_remove_from_free(bpage, i); /* Try to relocate the buddy of buf to the free block. */ - if (buf_buddy_relocate(buddy, bpage, i)) { + if (buf_buddy_relocate(buddy, bpage, i, have_page_hash_mutex)) { ut_d(buddy->state = BUF_BLOCK_ZIP_FREE); goto buddy_free2; @@ -608,14 +711,14 @@ buddy_nonfree: (Parts of the buddy can be free in buf_pool->zip_free[j] with j < i.)*/ for (b = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); - b; b = UT_LIST_GET_NEXT(list, b)) { + b; b = UT_LIST_GET_NEXT(zip_list, b)) { ut_a(b != buddy); } } #endif /* UNIV_DEBUG && !UNIV_DEBUG_VALGRIND */ - if (buf_buddy_relocate(buddy, buf, i)) { + if (buf_buddy_relocate(buddy, buf, i, have_page_hash_mutex)) { buf = bpage; UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i); === modified file 'storage/xtradb/buf/buf0buf.c' --- a/storage/xtradb/buf/buf0buf.c 2009-05-04 04:32:30 +0000 +++ b/storage/xtradb/buf/buf0buf.c 2009-06-25 01:43:25 +0000 @@ -244,6 +244,12 @@ UNIV_INTERN buf_pool_t* buf_pool = NULL; /* mutex protecting the buffer pool struct and control blocks, except the read-write lock in them */ UNIV_INTERN mutex_t buf_pool_mutex; +UNIV_INTERN mutex_t LRU_list_mutex; +UNIV_INTERN mutex_t flush_list_mutex; +UNIV_INTERN rw_lock_t page_hash_latch; +UNIV_INTERN mutex_t free_list_mutex; +UNIV_INTERN mutex_t zip_free_mutex; +UNIV_INTERN mutex_t zip_hash_mutex; /* mutex protecting the control blocks of compressed-only pages (of type buf_page_t, not buf_block_t) */ UNIV_INTERN mutex_t buf_pool_zip_mutex; @@ -664,9 +670,9 @@ buf_block_init( block->page.in_zip_hash = FALSE; block->page.in_flush_list = FALSE; block->page.in_free_list = FALSE; - block->in_unzip_LRU_list = FALSE; #endif /* UNIV_DEBUG */ block->page.in_LRU_list = FALSE; + block->in_unzip_LRU_list = FALSE; #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG block->n_pointers = 0; #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ @@ -751,8 +757,10 @@ buf_chunk_init( memset(block->frame, '\0', UNIV_PAGE_SIZE); #endif /* Add the block to the free list */ - UT_LIST_ADD_LAST(list, buf_pool->free, (&block->page)); + mutex_enter(&free_list_mutex); + UT_LIST_ADD_LAST(free, buf_pool->free, (&block->page)); ut_d(block->page.in_free_list = TRUE); + mutex_exit(&free_list_mutex); block++; frame += UNIV_PAGE_SIZE; @@ -778,7 +786,7 @@ buf_chunk_contains_zip( ulint i; ut_ad(buf_pool); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); block = chunk->blocks; @@ -832,7 +840,7 @@ buf_chunk_not_freed( ulint i; ut_ad(buf_pool); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); /*optimistic...*/ block = chunk->blocks; @@ -865,7 +873,7 @@ buf_chunk_all_free( ulint i; ut_ad(buf_pool); - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own()); /* but we need all mutex here */ block = chunk->blocks; @@ -891,7 +899,7 @@ buf_chunk_free( buf_block_t* block; const buf_block_t* block_end; - ut_ad(buf_pool_mutex_own()); + ut_ad(buf_pool_mutex_own()); /* but we need all mutex here */ block_end = chunk->blocks + chunk->size; @@ -903,8 +911,10 @@ buf_chunk_free( ut_ad(!block->in_unzip_LRU_list); ut_ad(!block->page.in_flush_list); /* Remove the block from the free list. */ + mutex_enter(&free_list_mutex); ut_ad(block->page.in_free_list); - UT_LIST_REMOVE(list, buf_pool->free, (&block->page)); + UT_LIST_REMOVE(free, buf_pool->free, (&block->page)); + mutex_exit(&free_list_mutex); /* Free the latches. */ mutex_free(&block->mutex); @@ -935,8 +945,17 @@ buf_pool_init(void) /* 1. Initialize general fields ------------------------------- */ mutex_create(&buf_pool_mutex, SYNC_BUF_POOL); + mutex_create(&LRU_list_mutex, SYNC_BUF_LRU_LIST); + mutex_create(&flush_list_mutex, SYNC_BUF_FLUSH_LIST); + rw_lock_create(&page_hash_latch, SYNC_BUF_PAGE_HASH); + mutex_create(&free_list_mutex, SYNC_BUF_FREE_LIST); + mutex_create(&zip_free_mutex, SYNC_BUF_ZIP_FREE); + mutex_create(&zip_hash_mutex, SYNC_BUF_ZIP_HASH); + mutex_create(&buf_pool_zip_mutex, SYNC_BUF_BLOCK); + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); buf_pool_mutex_enter(); buf_pool->n_chunks = 1; @@ -973,6 +992,8 @@ buf_pool_init(void) --------------------------- */ /* All fields are initialized by mem_zalloc(). */ + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); buf_pool_mutex_exit(); btr_search_sys_create(buf_pool->curr_size @@ -1105,7 +1126,11 @@ buf_relocate( buf_page_t* b; ulint fold; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX)); +#endif ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE); ut_a(bpage->buf_fix_count == 0); @@ -1186,7 +1211,8 @@ buf_pool_shrink( try_again: btr_search_disable(); /* Empty the adaptive hash index again */ - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); shrink_again: if (buf_pool->n_chunks <= 1) { @@ -1257,7 +1283,7 @@ shrink_again: buf_LRU_make_block_old(&block->page); dirty++; - } else if (buf_LRU_free_block(&block->page, TRUE, NULL) + } else if (buf_LRU_free_block(&block->page, TRUE, NULL, FALSE) != BUF_LRU_FREED) { nonfree++; } @@ -1265,7 +1291,8 @@ shrink_again: mutex_exit(&block->mutex); } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); /* Request for a flush of the chunk if it helps. Do not flush if there are non-free blocks, since @@ -1314,7 +1341,8 @@ shrink_again: func_done: srv_buf_pool_old_size = srv_buf_pool_size; func_exit: - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); btr_search_enable(); } @@ -1332,7 +1360,11 @@ buf_pool_page_hash_rebuild(void) hash_table_t* zip_hash; buf_page_t* b; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); + mutex_enter(&flush_list_mutex); + /* Free, create, and populate the hash table. */ hash_table_free(buf_pool->page_hash); @@ -1374,7 +1406,7 @@ buf_pool_page_hash_rebuild(void) in buf_pool->flush_list. */ for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; - b = UT_LIST_GET_NEXT(list, b)) { + b = UT_LIST_GET_NEXT(zip_list, b)) { ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); ut_ad(!b->in_flush_list); ut_ad(b->in_LRU_list); @@ -1386,7 +1418,7 @@ buf_pool_page_hash_rebuild(void) } for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; - b = UT_LIST_GET_NEXT(list, b)) { + b = UT_LIST_GET_NEXT(flush_list, b)) { ut_ad(b->in_flush_list); ut_ad(b->in_LRU_list); ut_ad(b->in_page_hash); @@ -1412,7 +1444,10 @@ buf_pool_page_hash_rebuild(void) } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + mutex_exit(&flush_list_mutex); } /************************************************************************ @@ -1422,17 +1457,20 @@ void buf_pool_resize(void) /*=================*/ { - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); if (srv_buf_pool_old_size == srv_buf_pool_size) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); return; } if (srv_buf_pool_curr_size + 1048576 > srv_buf_pool_size) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); /* Disable adaptive hash indexes and empty the index in order to free up memory in the buffer pool chunks. */ @@ -1466,7 +1504,8 @@ buf_pool_resize(void) } srv_buf_pool_old_size = srv_buf_pool_size; - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); } buf_pool_page_hash_rebuild(); @@ -1488,12 +1527,14 @@ buf_block_make_young( if (buf_page_peek_if_too_old(bpage)) { - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); /* There has been freeing activity in the LRU list: best to move to the head of the LRU list */ buf_LRU_make_block_young(bpage); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); } } @@ -1507,13 +1548,15 @@ buf_page_make_young( /*================*/ buf_page_t* bpage) /* in: buffer block of a file page */ { - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); ut_a(buf_page_in_file(bpage)); buf_LRU_make_block_young(bpage); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); } /************************************************************************ @@ -1528,7 +1571,8 @@ buf_reset_check_index_page_at_flush( { buf_block_t* block; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_s_lock(&page_hash_latch); block = (buf_block_t*) buf_page_hash_get(space, offset); @@ -1536,7 +1580,8 @@ buf_reset_check_index_page_at_flush( block->check_index_page_at_flush = FALSE; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); } /************************************************************************ @@ -1555,7 +1600,8 @@ buf_page_peek_if_search_hashed( buf_block_t* block; ibool is_hashed; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_s_lock(&page_hash_latch); block = (buf_block_t*) buf_page_hash_get(space, offset); @@ -1565,7 +1611,8 @@ buf_page_peek_if_search_hashed( is_hashed = block->is_hashed; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); return(is_hashed); } @@ -1587,7 +1634,8 @@ buf_page_set_file_page_was_freed( { buf_page_t* bpage; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_s_lock(&page_hash_latch); bpage = buf_page_hash_get(space, offset); @@ -1595,7 +1643,8 @@ buf_page_set_file_page_was_freed( bpage->file_page_was_freed = TRUE; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); return(bpage); } @@ -1616,7 +1665,8 @@ buf_page_reset_file_page_was_freed( { buf_page_t* bpage; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_s_lock(&page_hash_latch); bpage = buf_page_hash_get(space, offset); @@ -1624,7 +1674,8 @@ buf_page_reset_file_page_was_freed( bpage->file_page_was_freed = FALSE; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); return(bpage); } @@ -1657,8 +1708,9 @@ buf_page_get_zip( buf_pool->n_page_gets++; for (;;) { - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); lookup: + rw_lock_s_lock(&page_hash_latch); bpage = buf_page_hash_get(space, offset); if (bpage) { break; @@ -1666,7 +1718,8 @@ lookup: /* Page not in buf_pool: needs to be read from file */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); buf_read_page(space, zip_size, offset); @@ -1677,12 +1730,21 @@ lookup: if (UNIV_UNLIKELY(!bpage->zip.data)) { /* There is no compressed page. */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); return(NULL); } block_mutex = buf_page_get_mutex(bpage); +retry_lock: mutex_enter(block_mutex); + if (block_mutex != buf_page_get_mutex(bpage)) { + mutex_exit(block_mutex); + block_mutex = buf_page_get_mutex(bpage); + goto retry_lock; + } + + rw_lock_s_unlock(&page_hash_latch); switch (buf_page_get_state(bpage)) { case BUF_BLOCK_NOT_USED: @@ -1698,7 +1760,7 @@ lookup: break; case BUF_BLOCK_FILE_PAGE: /* Discard the uncompressed page frame if possible. */ - if (buf_LRU_free_block(bpage, FALSE, NULL) + if (buf_LRU_free_block(bpage, FALSE, NULL, FALSE) == BUF_LRU_FREED) { mutex_exit(block_mutex); @@ -1712,7 +1774,7 @@ lookup: must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ; - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); buf_page_set_accessed(bpage, TRUE); @@ -1943,7 +2005,7 @@ buf_block_is_uncompressed( const buf_chunk_t* chunk = buf_pool->chunks; const buf_chunk_t* const echunk = chunk + buf_pool->n_chunks; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); if (UNIV_UNLIKELY((((ulint) block) % sizeof *block) != 0)) { /* The pointer should be aligned. */ @@ -1986,6 +2048,7 @@ buf_page_get_gen( ibool accessed; ulint fix_type; ibool must_read; + mutex_t* block_mutex; ut_ad(mtr); ut_ad((rw_latch == RW_S_LATCH) @@ -2001,9 +2064,18 @@ buf_page_get_gen( buf_pool->n_page_gets++; loop: block = guess; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); if (block) { + block_mutex = buf_page_get_mutex((buf_page_t*)block); +retry_lock_1: + mutex_enter(block_mutex); + if (block_mutex != buf_page_get_mutex((buf_page_t*)block)) { + mutex_exit(block_mutex); + block_mutex = buf_page_get_mutex((buf_page_t*)block); + goto retry_lock_1; + } + /* If the guess is a compressed page descriptor that has been allocated by buf_buddy_alloc(), it may have been invalidated by buf_buddy_relocate(). In that @@ -2017,6 +2089,8 @@ loop: || space != block->page.space || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { + mutex_exit(block_mutex); + block = guess = NULL; } else { ut_ad(!block->page.in_zip_hash); @@ -2025,14 +2099,26 @@ loop: } if (block == NULL) { + rw_lock_s_lock(&page_hash_latch); block = (buf_block_t*) buf_page_hash_get(space, offset); + if (block) { + block_mutex = buf_page_get_mutex((buf_page_t*)block); +retry_lock_2: + mutex_enter(block_mutex); + if (block_mutex != buf_page_get_mutex((buf_page_t*)block)) { + mutex_exit(block_mutex); + block_mutex = buf_page_get_mutex((buf_page_t*)block); + goto retry_lock_2; + } + } + rw_lock_s_unlock(&page_hash_latch); } loop2: if (block == NULL) { /* Page not in buf_pool: needs to be read from file */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); if (mode == BUF_GET_IF_IN_POOL) { @@ -2053,7 +2139,8 @@ loop2: if (must_read && mode == BUF_GET_IF_IN_POOL) { /* The page is only being read to buffer */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(block_mutex); return(NULL); } @@ -2063,10 +2150,16 @@ loop2: ibool success; case BUF_BLOCK_FILE_PAGE: + if (block_mutex == &buf_pool_zip_mutex) { + /* it is wrong mutex... */ + mutex_exit(block_mutex); + goto loop; + } break; case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_DIRTY: + ut_ad(block_mutex == &buf_pool_zip_mutex); bpage = &block->page; if (bpage->buf_fix_count @@ -2077,20 +2170,25 @@ loop2: wait_until_unfixed: /* The block is buffer-fixed or I/O-fixed. Try again later. */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(block_mutex); os_thread_sleep(WAIT_FOR_READ); goto loop; } /* Allocate an uncompressed page. */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(block_mutex); block = buf_LRU_get_free_block(0); ut_a(block); + block_mutex = &block->mutex; - buf_pool_mutex_enter(); - mutex_enter(&block->mutex); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); + mutex_enter(block_mutex); { buf_page_t* hash_bpage @@ -2101,35 +2199,55 @@ wait_until_unfixed: while buf_pool_mutex was released. Free the block that was allocated. */ - buf_LRU_block_free_non_file_page(block); - mutex_exit(&block->mutex); + buf_LRU_block_free_non_file_page(block, TRUE); + mutex_exit(block_mutex); block = (buf_block_t*) hash_bpage; + if (block) { + block_mutex = buf_page_get_mutex((buf_page_t*)block); +retry_lock_3: + mutex_enter(block_mutex); + if (block_mutex != buf_page_get_mutex((buf_page_t*)block)) { + mutex_exit(block_mutex); + block_mutex = buf_page_get_mutex((buf_page_t*)block); + goto retry_lock_3; + } + } + rw_lock_x_unlock(&page_hash_latch); + mutex_exit(&LRU_list_mutex); goto loop2; } } + mutex_enter(&buf_pool_zip_mutex); + if (UNIV_UNLIKELY (bpage->buf_fix_count || buf_page_get_io_fix(bpage) != BUF_IO_NONE)) { + mutex_exit(&buf_pool_zip_mutex); /* The block was buffer-fixed or I/O-fixed while buf_pool_mutex was not held by this thread. Free the block that was allocated and try again. This should be extremely unlikely. */ - buf_LRU_block_free_non_file_page(block); - mutex_exit(&block->mutex); + buf_LRU_block_free_non_file_page(block, TRUE); + //mutex_exit(&block->mutex); + rw_lock_x_unlock(&page_hash_latch); + mutex_exit(&LRU_list_mutex); goto wait_until_unfixed; } /* Move the compressed page from bpage to block, and uncompress it. */ - mutex_enter(&buf_pool_zip_mutex); + mutex_enter(&flush_list_mutex); buf_relocate(bpage, &block->page); + + rw_lock_x_unlock(&page_hash_latch); + buf_block_init_low(block); block->lock_hash_val = lock_rec_hash(space, offset); @@ -2138,29 +2256,31 @@ wait_until_unfixed: if (buf_page_get_state(&block->page) == BUF_BLOCK_ZIP_PAGE) { - UT_LIST_REMOVE(list, buf_pool->zip_clean, + UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, &block->page); ut_ad(!block->page.in_flush_list); } else { /* Relocate buf_pool->flush_list. */ buf_page_t* b; - b = UT_LIST_GET_PREV(list, &block->page); + b = UT_LIST_GET_PREV(flush_list, &block->page); ut_ad(block->page.in_flush_list); - UT_LIST_REMOVE(list, buf_pool->flush_list, + UT_LIST_REMOVE(flush_list, buf_pool->flush_list, &block->page); if (b) { UT_LIST_INSERT_AFTER( - list, buf_pool->flush_list, b, + flush_list, buf_pool->flush_list, b, &block->page); } else { UT_LIST_ADD_FIRST( - list, buf_pool->flush_list, + flush_list, buf_pool->flush_list, &block->page); } } + mutex_exit(&flush_list_mutex); + /* Buffer-fix, I/O-fix, and X-latch the block for the duration of the decompression. Also add the block to the unzip_LRU list. */ @@ -2169,16 +2289,22 @@ wait_until_unfixed: /* Insert at the front of unzip_LRU list */ buf_unzip_LRU_add_block(block, FALSE); + mutex_exit(&LRU_list_mutex); + block->page.buf_fix_count = 1; buf_block_set_io_fix(block, BUF_IO_READ); + + mutex_enter(&buf_pool_mutex); buf_pool->n_pend_unzip++; + mutex_exit(&buf_pool_mutex); + rw_lock_x_lock(&block->lock); - mutex_exit(&block->mutex); + mutex_exit(block_mutex); mutex_exit(&buf_pool_zip_mutex); - buf_buddy_free(bpage, sizeof *bpage); + buf_buddy_free(bpage, sizeof *bpage, FALSE); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); /* Decompress the page and apply buffered operations while not holding buf_pool_mutex or block->mutex. */ @@ -2190,17 +2316,21 @@ wait_until_unfixed: } /* Unfix and unlatch the block. */ - buf_pool_mutex_enter(); - mutex_enter(&block->mutex); + //buf_pool_mutex_enter(); + block_mutex = &block->mutex; + mutex_enter(block_mutex); + mutex_enter(&buf_pool_mutex); buf_pool->n_pend_unzip--; + mutex_exit(&buf_pool_mutex); block->page.buf_fix_count--; buf_block_set_io_fix(block, BUF_IO_NONE); - mutex_exit(&block->mutex); + //mutex_exit(&block->mutex); rw_lock_x_unlock(&block->lock); if (UNIV_UNLIKELY(!success)) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(block_mutex); return(NULL); } @@ -2217,11 +2347,11 @@ wait_until_unfixed: ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); - mutex_enter(&block->mutex); + //mutex_enter(&block->mutex); UNIV_MEM_ASSERT_RW(&block->page, sizeof block->page); buf_block_buf_fix_inc(block, file, line); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); /* Check if this is the first access to the page */ @@ -2229,7 +2359,7 @@ wait_until_unfixed: buf_page_set_accessed(&block->page, TRUE); - mutex_exit(&block->mutex); + mutex_exit(block_mutex); buf_block_make_young(&block->page); @@ -2515,16 +2645,19 @@ buf_page_try_get_func( ibool success; ulint fix_type; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_s_lock(&page_hash_latch); block = buf_block_hash_get(space_id, page_no); if (!block) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); return(NULL); } mutex_enter(&block->mutex); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); @@ -2644,7 +2777,10 @@ buf_page_init( { buf_page_t* hash_page; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX)); +#endif ut_ad(mutex_own(&(block->mutex))); ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE); @@ -2677,7 +2813,8 @@ buf_page_init( (const void*) hash_page, (const void*) block); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG mutex_exit(&block->mutex); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_x_unlock(&page_hash_latch); buf_print(); buf_LRU_print(); buf_validate(); @@ -2756,16 +2893,24 @@ buf_page_init_for_read( ut_ad(block); } - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); if (buf_page_hash_get(space, offset)) { /* The page is already in the buffer pool. */ err_exit: if (block) { mutex_enter(&block->mutex); - buf_LRU_block_free_non_file_page(block); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + buf_LRU_block_free_non_file_page(block, FALSE); mutex_exit(&block->mutex); } + else { + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + } bpage = NULL; goto func_exit; @@ -2785,6 +2930,8 @@ err_exit: mutex_enter(&block->mutex); buf_page_init(space, offset, block); + rw_lock_x_unlock(&page_hash_latch); + /* The block must be put to the LRU list, to the old blocks */ buf_LRU_add_block(bpage, TRUE/* to old blocks */); @@ -2812,7 +2959,7 @@ err_exit: been added to buf_pool->LRU and buf_pool->page_hash. */ mutex_exit(&block->mutex); - data = buf_buddy_alloc(zip_size, &lru); + data = buf_buddy_alloc(zip_size, &lru, FALSE); mutex_enter(&block->mutex); block->page.zip.data = data; @@ -2825,6 +2972,7 @@ err_exit: buf_unzip_LRU_add_block(block, TRUE); } + mutex_exit(&LRU_list_mutex); mutex_exit(&block->mutex); } else { /* Defer buf_buddy_alloc() until after the block has @@ -2836,8 +2984,8 @@ err_exit: control block (bpage), in order to avoid the invocation of buf_buddy_relocate_block() on uninitialized data. */ - data = buf_buddy_alloc(zip_size, &lru); - bpage = buf_buddy_alloc(sizeof *bpage, &lru); + data = buf_buddy_alloc(zip_size, &lru, TRUE); + bpage = buf_buddy_alloc(sizeof *bpage, &lru, TRUE); /* If buf_buddy_alloc() allocated storage from the LRU list, it released and reacquired buf_pool_mutex. Thus, we must @@ -2846,8 +2994,11 @@ err_exit: && UNIV_LIKELY_NULL(buf_page_hash_get(space, offset))) { /* The block was added by some other thread. */ - buf_buddy_free(bpage, sizeof *bpage); - buf_buddy_free(data, zip_size); + buf_buddy_free(bpage, sizeof *bpage, TRUE); + buf_buddy_free(data, zip_size, TRUE); + + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); bpage = NULL; goto func_exit; @@ -2877,18 +3028,26 @@ err_exit: HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, buf_page_address_fold(space, offset), bpage); + rw_lock_x_unlock(&page_hash_latch); + /* The block must be put to the LRU list, to the old blocks */ buf_LRU_add_block(bpage, TRUE/* to old blocks */); + mutex_enter(&flush_list_mutex); buf_LRU_insert_zip_clean(bpage); + mutex_exit(&flush_list_mutex); + + mutex_exit(&LRU_list_mutex); buf_page_set_io_fix(bpage, BUF_IO_READ); mutex_exit(&buf_pool_zip_mutex); } + mutex_enter(&buf_pool_mutex); buf_pool->n_pend_reads++; + mutex_exit(&buf_pool_mutex); func_exit: - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); if (mode == BUF_READ_IBUF_PAGES_ONLY) { @@ -2924,7 +3083,9 @@ buf_page_create( free_block = buf_LRU_get_free_block(0); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); block = (buf_block_t*) buf_page_hash_get(space, offset); @@ -2937,7 +3098,9 @@ buf_page_create( #endif /* UNIV_DEBUG_FILE_ACCESSES */ /* Page can be found in buf_pool */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); buf_block_free(free_block); @@ -2959,6 +3122,7 @@ buf_page_create( mutex_enter(&block->mutex); buf_page_init(space, offset, block); + rw_lock_x_unlock(&page_hash_latch); /* The block must be put to the LRU list */ buf_LRU_add_block(&block->page, FALSE); @@ -2985,7 +3149,7 @@ buf_page_create( the reacquisition of buf_pool_mutex. We also must defer this operation until after the block descriptor has been added to buf_pool->LRU and buf_pool->page_hash. */ - data = buf_buddy_alloc(zip_size, &lru); + data = buf_buddy_alloc(zip_size, &lru, FALSE); mutex_enter(&block->mutex); block->page.zip.data = data; @@ -3001,7 +3165,8 @@ buf_page_create( rw_lock_x_unlock(&block->lock); } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX); @@ -3053,6 +3218,8 @@ buf_page_io_complete( enum buf_io_fix io_type; const ibool uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); + enum buf_flush flush_type; + mutex_t* block_mutex; ut_a(buf_page_in_file(bpage)); @@ -3187,8 +3354,23 @@ corrupt: } } - buf_pool_mutex_enter(); - mutex_enter(buf_page_get_mutex(bpage)); + //buf_pool_mutex_enter(); + if (io_type == BUF_IO_WRITE) { + flush_type = buf_page_get_flush_type(bpage); + /* to keep consistency at buf_LRU_insert_zip_clean() */ + //if (flush_type == BUF_FLUSH_LRU) { /* optimistic! */ + mutex_enter(&LRU_list_mutex); + //} + } + block_mutex = buf_page_get_mutex(bpage); +retry_lock: + mutex_enter(block_mutex); + if (block_mutex != buf_page_get_mutex(bpage)) { + mutex_exit(block_mutex); + block_mutex = buf_page_get_mutex(bpage); + goto retry_lock; + } + mutex_enter(&buf_pool_mutex); #ifdef UNIV_IBUF_COUNT_DEBUG if (io_type == BUF_IO_WRITE || uncompressed) { @@ -3228,6 +3410,11 @@ corrupt: buf_flush_write_complete(bpage); + /* to keep consistency at buf_LRU_insert_zip_clean() */ + //if (flush_type == BUF_FLUSH_LRU) { /* optimistic! */ + mutex_exit(&LRU_list_mutex); + //} + if (uncompressed) { rw_lock_s_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_WRITE); @@ -3250,8 +3437,9 @@ corrupt: } #endif /* UNIV_DEBUG */ - mutex_exit(buf_page_get_mutex(bpage)); - buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); + mutex_exit(block_mutex); + //buf_pool_mutex_exit(); } /************************************************************************* @@ -3273,12 +3461,14 @@ buf_pool_invalidate(void) freed = buf_LRU_search_and_free_block(100); } - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0); ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); } #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG @@ -3302,7 +3492,10 @@ buf_validate(void) ut_ad(buf_pool); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); + /* for keep the new latch order, it cannot validate correctly... */ chunk = buf_pool->chunks; @@ -3401,7 +3594,7 @@ buf_validate(void) /* Check clean compressed-only blocks. */ for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; - b = UT_LIST_GET_NEXT(list, b)) { + b = UT_LIST_GET_NEXT(zip_list, b)) { ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); switch (buf_page_get_io_fix(b)) { case BUF_IO_NONE: @@ -3426,8 +3619,9 @@ buf_validate(void) /* Check dirty compressed-only blocks. */ + mutex_enter(&flush_list_mutex); for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; - b = UT_LIST_GET_NEXT(list, b)) { + b = UT_LIST_GET_NEXT(flush_list, b)) { ut_ad(b->in_flush_list); switch (buf_page_get_state(b)) { @@ -3472,6 +3666,7 @@ buf_validate(void) } ut_a(buf_page_hash_get(b->space, b->offset) == b); } + mutex_exit(&flush_list_mutex); mutex_exit(&buf_pool_zip_mutex); @@ -3483,19 +3678,27 @@ buf_validate(void) } ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru); + /* because of latching order with block->mutex, we cannot get free_list_mutex before that */ +/* if (UT_LIST_GET_LEN(buf_pool->free) != n_free) { fprintf(stderr, "Free list len %lu, free blocks %lu\n", (ulong) UT_LIST_GET_LEN(buf_pool->free), (ulong) n_free); ut_error; } +*/ + /* because of latching order with block->mutex, we cannot get flush_list_mutex before that */ +/* ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush); ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush); ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush); ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush); +*/ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); ut_a(buf_LRU_validate()); ut_a(buf_flush_validate()); @@ -3529,7 +3732,10 @@ buf_print(void) index_ids = mem_alloc(sizeof(dulint) * size); counts = mem_alloc(sizeof(ulint) * size); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + mutex_enter(&free_list_mutex); + mutex_enter(&flush_list_mutex); fprintf(stderr, "buf_pool size %lu\n" @@ -3592,7 +3798,10 @@ buf_print(void) } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + mutex_exit(&free_list_mutex); + mutex_exit(&flush_list_mutex); for (i = 0; i < n_found; i++) { index = dict_index_get_if_in_cache(index_ids[i]); @@ -3630,7 +3839,7 @@ buf_get_latched_pages_number(void) ulint i; ulint fixed_pages_number = 0; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); chunk = buf_pool->chunks; @@ -3664,7 +3873,7 @@ buf_get_latched_pages_number(void) /* Traverse the lists of clean and dirty compressed-only blocks. */ for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b; - b = UT_LIST_GET_NEXT(list, b)) { + b = UT_LIST_GET_NEXT(zip_list, b)) { ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE); @@ -3674,8 +3883,9 @@ buf_get_latched_pages_number(void) } } + mutex_enter(&flush_list_mutex); for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b; - b = UT_LIST_GET_NEXT(list, b)) { + b = UT_LIST_GET_NEXT(flush_list, b)) { ut_ad(b->in_flush_list); switch (buf_page_get_state(b)) { @@ -3698,9 +3908,10 @@ buf_get_latched_pages_number(void) break; } } + mutex_exit(&flush_list_mutex); mutex_exit(&buf_pool_zip_mutex); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); return(fixed_pages_number); } @@ -3757,7 +3968,11 @@ buf_print_io( ut_ad(buf_pool); size = buf_pool->curr_size; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + mutex_enter(&free_list_mutex); + mutex_enter(&buf_pool_mutex); + mutex_enter(&flush_list_mutex); fprintf(file, "Buffer pool size %lu\n" @@ -3824,7 +4039,11 @@ buf_print_io( buf_LRU_stat_sum.io, buf_LRU_stat_cur.io, buf_LRU_stat_sum.unzip, buf_LRU_stat_cur.unzip); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + mutex_exit(&free_list_mutex); + mutex_exit(&buf_pool_mutex); + mutex_exit(&flush_list_mutex); } /************************************************************************** @@ -3853,7 +4072,7 @@ buf_all_freed(void) ut_ad(buf_pool); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); /* optimistic */ chunk = buf_pool->chunks; @@ -3870,7 +4089,7 @@ buf_all_freed(void) } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); /* optimistic */ return(TRUE); } @@ -3886,7 +4105,8 @@ buf_pool_check_no_pending_io(void) { ibool ret; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&buf_pool_mutex); if (buf_pool->n_pend_reads + buf_pool->n_flush[BUF_FLUSH_LRU] + buf_pool->n_flush[BUF_FLUSH_LIST] @@ -3896,7 +4116,8 @@ buf_pool_check_no_pending_io(void) ret = TRUE; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); return(ret); } @@ -3910,11 +4131,13 @@ buf_get_free_list_len(void) { ulint len; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&free_list_mutex); len = UT_LIST_GET_LEN(buf_pool->free); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&free_list_mutex); return(len); } === modified file 'storage/xtradb/buf/buf0flu.c' --- a/storage/xtradb/buf/buf0flu.c 2009-05-04 04:32:30 +0000 +++ b/storage/xtradb/buf/buf0flu.c 2009-06-25 01:43:25 +0000 @@ -61,7 +61,9 @@ buf_flush_insert_into_flush_list( /*=============================*/ buf_block_t* block) /* in/out: block which is modified */ { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&block->mutex)); + ut_ad(mutex_own(&flush_list_mutex)); ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL) || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification <= block->page.oldest_modification)); @@ -72,7 +74,7 @@ buf_flush_insert_into_flush_list( ut_ad(!block->page.in_zip_hash); ut_ad(!block->page.in_flush_list); ut_d(block->page.in_flush_list = TRUE); - UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page); + UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page); #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG ut_a(buf_flush_validate_low()); @@ -92,7 +94,9 @@ buf_flush_insert_sorted_into_flush_list( buf_page_t* prev_b; buf_page_t* b; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&block->mutex)); + ut_ad(mutex_own(&flush_list_mutex)); ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(block->page.in_LRU_list); @@ -107,13 +111,13 @@ buf_flush_insert_sorted_into_flush_list( while (b && b->oldest_modification > block->page.oldest_modification) { ut_ad(b->in_flush_list); prev_b = b; - b = UT_LIST_GET_NEXT(list, b); + b = UT_LIST_GET_NEXT(flush_list, b); } if (prev_b == NULL) { - UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page); + UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page); } else { - UT_LIST_INSERT_AFTER(list, buf_pool->flush_list, + UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list, prev_b, &block->page); } @@ -134,7 +138,7 @@ buf_flush_ready_for_replace( buf_page_in_file(bpage) and in the LRU list */ { //ut_ad(buf_pool_mutex_own()); - //ut_ad(mutex_own(buf_page_get_mutex(bpage))); + ut_ad(mutex_own(buf_page_get_mutex(bpage))); //ut_ad(bpage->in_LRU_list); /* optimistic use */ if (UNIV_LIKELY(bpage->in_LRU_list && buf_page_in_file(bpage))) { @@ -169,12 +173,12 @@ buf_flush_ready_for_flush( buf_page_in_file(bpage) */ enum buf_flush flush_type)/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */ { - ut_a(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own()); + //ut_a(buf_page_in_file(bpage)); + //ut_ad(buf_pool_mutex_own()); /*optimistic...*/ ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(flush_type == BUF_FLUSH_LRU || BUF_FLUSH_LIST); - if (bpage->oldest_modification != 0 + if (buf_page_in_file(bpage) && bpage->oldest_modification != 0 && buf_page_get_io_fix(bpage) == BUF_IO_NONE) { ut_ad(bpage->in_flush_list); @@ -203,8 +207,11 @@ buf_flush_remove( /*=============*/ buf_page_t* bpage) /* in: pointer to the block in question */ { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(buf_page_get_mutex(bpage))); + + mutex_enter(&flush_list_mutex); + ut_ad(bpage->in_flush_list); ut_d(bpage->in_flush_list = FALSE); @@ -216,21 +223,23 @@ buf_flush_remove( case BUF_BLOCK_READY_FOR_USE: case BUF_BLOCK_MEMORY: case BUF_BLOCK_REMOVE_HASH: + mutex_exit(&flush_list_mutex); ut_error; return; case BUF_BLOCK_ZIP_DIRTY: buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE); - UT_LIST_REMOVE(list, buf_pool->flush_list, bpage); + UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage); buf_LRU_insert_zip_clean(bpage); break; case BUF_BLOCK_FILE_PAGE: - UT_LIST_REMOVE(list, buf_pool->flush_list, bpage); + UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage); break; } bpage->oldest_modification = 0; - ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list)); + ut_d(UT_LIST_VALIDATE(flush_list, buf_page_t, buf_pool->flush_list)); + mutex_exit(&flush_list_mutex); } /************************************************************************ @@ -678,7 +687,9 @@ buf_flush_write_block_low( io_fixed and oldest_modification != 0. Thus, it cannot be relocated in the buffer pool or removed from flush_list or LRU_list. */ - ut_ad(!buf_pool_mutex_own()); + //ut_ad(!buf_pool_mutex_own()); + ut_ad(!mutex_own(&LRU_list_mutex)); + ut_ad(!mutex_own(&flush_list_mutex)); ut_ad(!mutex_own(buf_page_get_mutex(bpage))); ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE); ut_ad(bpage->oldest_modification != 0); @@ -762,12 +773,19 @@ buf_flush_page( ibool is_uncompressed; ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX) + || rw_lock_own(&page_hash_latch, RW_LOCK_SHARED)); +#endif ut_ad(buf_page_in_file(bpage)); block_mutex = buf_page_get_mutex(bpage); ut_ad(mutex_own(block_mutex)); + mutex_enter(&buf_pool_mutex); + rw_lock_s_unlock(&page_hash_latch); + ut_ad(buf_flush_ready_for_flush(bpage, flush_type)); buf_page_set_io_fix(bpage, BUF_IO_WRITE); @@ -798,7 +816,8 @@ buf_flush_page( } mutex_exit(block_mutex); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); /* Even though bpage is not protected by any mutex at this point, it is safe to access bpage, because it is @@ -835,7 +854,8 @@ buf_flush_page( immediately. */ mutex_exit(block_mutex); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); break; default: @@ -899,7 +919,8 @@ buf_flush_try_neighbors( high = fil_space_get_size(space); } - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_s_lock(&page_hash_latch); for (i = low; i < high; i++) { @@ -920,7 +941,13 @@ buf_flush_try_neighbors( || buf_page_is_old(bpage)) { mutex_t* block_mutex = buf_page_get_mutex(bpage); +retry_lock: mutex_enter(block_mutex); + if (block_mutex != buf_page_get_mutex(bpage)) { + mutex_exit(block_mutex); + block_mutex = buf_page_get_mutex(bpage); + goto retry_lock; + } if (buf_flush_ready_for_flush(bpage, flush_type) && (i == offset || !bpage->buf_fix_count)) { @@ -936,14 +963,16 @@ buf_flush_try_neighbors( ut_ad(!mutex_own(block_mutex)); count++; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_s_lock(&page_hash_latch); } else { mutex_exit(block_mutex); } } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); return(count); } @@ -980,6 +1009,7 @@ buf_flush_batch( ulint old_page_count; ulint space; ulint offset; + ulint remaining = 0; ut_ad((flush_type == BUF_FLUSH_LRU) || (flush_type == BUF_FLUSH_LIST)); @@ -987,20 +1017,28 @@ buf_flush_batch( ut_ad((flush_type != BUF_FLUSH_LIST) || sync_thread_levels_empty_gen(TRUE)); #endif /* UNIV_SYNC_DEBUG */ - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&buf_pool_mutex); if ((buf_pool->n_flush[flush_type] > 0) || (buf_pool->init_flush[flush_type] == TRUE)) { /* There is already a flush batch of the same type running */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); return(ULINT_UNDEFINED); } buf_pool->init_flush[flush_type] = TRUE; + mutex_exit(&buf_pool_mutex); + + if (flush_type == BUF_FLUSH_LRU) { + mutex_enter(&LRU_list_mutex); + } + for (;;) { flush_next: /* If we have flushed enough, leave the loop */ @@ -1017,7 +1055,10 @@ flush_next: } else { ut_ad(flush_type == BUF_FLUSH_LIST); + mutex_enter(&flush_list_mutex); + remaining = UT_LIST_GET_LEN(buf_pool->flush_list); bpage = UT_LIST_GET_LAST(buf_pool->flush_list); + mutex_exit(&flush_list_mutex); if (!bpage || bpage->oldest_modification >= lsn_limit) { /* We have flushed enough */ @@ -1037,9 +1078,15 @@ flush_next: mutex_t*block_mutex = buf_page_get_mutex(bpage); ibool ready; +retry_lock_1: ut_a(buf_page_in_file(bpage)); mutex_enter(block_mutex); + if (block_mutex != buf_page_get_mutex(bpage)) { + mutex_exit(block_mutex); + block_mutex = buf_page_get_mutex(bpage); + goto retry_lock_1; + } ready = buf_flush_ready_for_flush(bpage, flush_type); mutex_exit(block_mutex); @@ -1047,7 +1094,10 @@ flush_next: space = buf_page_get_space(bpage); offset = buf_page_get_page_no(bpage); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + if (flush_type == BUF_FLUSH_LRU) { + mutex_exit(&LRU_list_mutex); + } old_page_count = page_count; @@ -1057,10 +1107,17 @@ flush_next: space, offset, flush_type); } else { /* Try to flush the page only */ - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_s_lock(&page_hash_latch); mutex_t* block_mutex = buf_page_get_mutex(bpage); +retry_lock_2: mutex_enter(block_mutex); + if (block_mutex != buf_page_get_mutex(bpage)) { + mutex_exit(block_mutex); + block_mutex = buf_page_get_mutex(bpage); + goto retry_lock_2; + } buf_page_t* bpage_tmp = buf_page_hash_get(space, offset); if (bpage_tmp) { @@ -1073,7 +1130,10 @@ flush_next: flush_type, offset, page_count - old_page_count); */ - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + if (flush_type == BUF_FLUSH_LRU) { + mutex_enter(&LRU_list_mutex); + } goto flush_next; } else if (flush_type == BUF_FLUSH_LRU) { @@ -1081,16 +1141,28 @@ flush_next: } else { ut_ad(flush_type == BUF_FLUSH_LIST); - bpage = UT_LIST_GET_PREV(list, bpage); - ut_ad(!bpage || bpage->in_flush_list); + mutex_enter(&flush_list_mutex); + bpage = UT_LIST_GET_PREV(flush_list, bpage); + //ut_ad(!bpage || bpage->in_flush_list); /* optimistic */ + mutex_exit(&flush_list_mutex); + remaining--; } } while (bpage != NULL); + if (remaining) + goto flush_next; + /* If we could not find anything to flush, leave the loop */ break; } + if (flush_type == BUF_FLUSH_LRU) { + mutex_exit(&LRU_list_mutex); + } + + mutex_enter(&buf_pool_mutex); + buf_pool->init_flush[flush_type] = FALSE; if (buf_pool->n_flush[flush_type] == 0) { @@ -1100,7 +1172,8 @@ flush_next: os_event_set(buf_pool->no_flush[flush_type]); } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); buf_flush_buffered_writes(); @@ -1154,7 +1227,7 @@ buf_flush_LRU_recommendation(void) //buf_pool_mutex_enter(); if (have_LRU_mutex) - buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); n_replaceable = UT_LIST_GET_LEN(buf_pool->free); @@ -1173,7 +1246,13 @@ buf_flush_LRU_recommendation(void) mutex_t* block_mutex = buf_page_get_mutex(bpage); +retry_lock: mutex_enter(block_mutex); + if (block_mutex != buf_page_get_mutex(bpage)) { + mutex_exit(block_mutex); + block_mutex = buf_page_get_mutex(bpage); + goto retry_lock; + } if (buf_flush_ready_for_replace(bpage)) { n_replaceable++; @@ -1188,7 +1267,7 @@ buf_flush_LRU_recommendation(void) //buf_pool_mutex_exit(); if (have_LRU_mutex) - buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) { @@ -1238,17 +1317,17 @@ buf_flush_validate_low(void) { buf_page_t* bpage; - UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list); + UT_LIST_VALIDATE(flush_list, buf_page_t, buf_pool->flush_list); bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); while (bpage != NULL) { const ib_uint64_t om = bpage->oldest_modification; ut_ad(bpage->in_flush_list); - ut_a(buf_page_in_file(bpage)); + //ut_a(buf_page_in_file(bpage)); /* optimistic */ ut_a(om > 0); - bpage = UT_LIST_GET_NEXT(list, bpage); + bpage = UT_LIST_GET_NEXT(flush_list, bpage); ut_a(!bpage || om >= bpage->oldest_modification); } @@ -1266,11 +1345,13 @@ buf_flush_validate(void) { ibool ret; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&flush_list_mutex); ret = buf_flush_validate_low(); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&flush_list_mutex); return(ret); } === modified file 'storage/xtradb/buf/buf0lru.c' --- a/storage/xtradb/buf/buf0lru.c 2009-05-04 04:32:30 +0000 +++ b/storage/xtradb/buf/buf0lru.c 2009-06-25 01:43:25 +0000 @@ -129,25 +129,31 @@ static void buf_LRU_block_free_hashed_page( /*===========================*/ - buf_block_t* block); /* in: block, must contain a file page and + buf_block_t* block, /* in: block, must contain a file page and be in a state where it can be freed */ + ibool have_page_hash_mutex); /********************************************************************** Determines if the unzip_LRU list should be used for evicting a victim instead of the general LRU list. */ UNIV_INLINE ibool -buf_LRU_evict_from_unzip_LRU(void) +buf_LRU_evict_from_unzip_LRU( + ibool have_LRU_mutex) /*==============================*/ /* out: TRUE if should use unzip_LRU */ { ulint io_avg; ulint unzip_avg; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + if (!have_LRU_mutex) + mutex_enter(&LRU_list_mutex); /* If the unzip_LRU list is empty, we can only use the LRU. */ if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0) { + if (!have_LRU_mutex) + mutex_exit(&LRU_list_mutex); return(FALSE); } @@ -156,14 +162,20 @@ buf_LRU_evict_from_unzip_LRU(void) decompressed pages in the buffer pool. */ if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) <= UT_LIST_GET_LEN(buf_pool->LRU) / 10) { + if (!have_LRU_mutex) + mutex_exit(&LRU_list_mutex); return(FALSE); } /* If eviction hasn't started yet, we assume by default that a workload is disk bound. */ if (buf_pool->freed_page_clock == 0) { + if (!have_LRU_mutex) + mutex_exit(&LRU_list_mutex); return(TRUE); } + if (!have_LRU_mutex) + mutex_exit(&LRU_list_mutex); /* Calculate the average over past intervals, and add the values of the current interval. */ @@ -229,7 +241,8 @@ buf_LRU_drop_page_hash_for_tablespace( page_arr = ut_malloc(sizeof(ulint) * BUF_LRU_DROP_SEARCH_HASH_SIZE); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); scan_again: num_entries = 0; @@ -239,7 +252,13 @@ scan_again: mutex_t* block_mutex = buf_page_get_mutex(bpage); buf_page_t* prev_bpage; +retry_lock: mutex_enter(block_mutex); + if (block_mutex != buf_page_get_mutex(bpage)) { + mutex_exit(block_mutex); + block_mutex = buf_page_get_mutex(bpage); + goto retry_lock; + } prev_bpage = UT_LIST_GET_PREV(LRU, bpage); ut_a(buf_page_in_file(bpage)); @@ -269,12 +288,14 @@ scan_again: } /* Array full. We release the buf_pool_mutex to obey the latching order. */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, num_entries); num_entries = 0; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); } else { mutex_exit(block_mutex); } @@ -299,7 +320,8 @@ next_page: } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); /* Drop any remaining batch of search hashed pages. */ buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, num_entries); @@ -327,7 +349,9 @@ buf_LRU_invalidate_tablespace( buf_LRU_drop_page_hash_for_tablespace(id); scan_again: - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); all_freed = TRUE; @@ -339,7 +363,13 @@ scan_again: ut_a(buf_page_in_file(bpage)); +retry_lock: mutex_enter(block_mutex); + if (block_mutex != buf_page_get_mutex(bpage)) { + mutex_exit(block_mutex); + block_mutex = buf_page_get_mutex(bpage); + goto retry_lock; + } prev_bpage = UT_LIST_GET_PREV(LRU, bpage); if (buf_page_get_space(bpage) == id) { @@ -369,7 +399,9 @@ scan_again: ulint page_no; ulint zip_size; - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); zip_size = buf_page_get_zip_size(bpage); page_no = buf_page_get_page_no(bpage); @@ -393,7 +425,7 @@ scan_again: if (buf_LRU_block_remove_hashed_page(bpage, TRUE) != BUF_BLOCK_ZIP_FREE) { buf_LRU_block_free_hashed_page((buf_block_t*) - bpage); + bpage, TRUE); } else { /* The block_mutex should have been released by buf_LRU_block_remove_hashed_page() @@ -416,7 +448,9 @@ next_page: bpage = prev_bpage; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); if (!all_freed) { os_thread_sleep(20000); @@ -439,14 +473,16 @@ buf_LRU_get_recent_limit(void) ulint len; ulint limit; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); len = UT_LIST_GET_LEN(buf_pool->LRU); if (len < BUF_LRU_OLD_MIN_LEN) { /* The LRU list is too short to do read-ahead */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); return(0); } @@ -455,7 +491,8 @@ buf_LRU_get_recent_limit(void) limit = buf_page_get_LRU_position(bpage) - len / BUF_LRU_INITIAL_RATIO; - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); return(limit); } @@ -470,7 +507,9 @@ buf_LRU_insert_zip_clean( { buf_page_t* b; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); + ut_ad(mutex_own(&flush_list_mutex)); ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE); /* Find the first successor of bpage in the LRU list @@ -478,17 +517,17 @@ buf_LRU_insert_zip_clean( b = bpage; do { b = UT_LIST_GET_NEXT(LRU, b); - } while (b && buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE); + } while (b && (buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE || !b->in_LRU_list)); /* Insert bpage before b, i.e., after the predecessor of b. */ if (b) { - b = UT_LIST_GET_PREV(list, b); + b = UT_LIST_GET_PREV(zip_list, b); } if (b) { - UT_LIST_INSERT_AFTER(list, buf_pool->zip_clean, b, bpage); + UT_LIST_INSERT_AFTER(zip_list, buf_pool->zip_clean, b, bpage); } else { - UT_LIST_ADD_FIRST(list, buf_pool->zip_clean, bpage); + UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_clean, bpage); } } @@ -500,16 +539,17 @@ ibool buf_LRU_free_from_unzip_LRU_list( /*=============================*/ /* out: TRUE if freed */ - ulint n_iterations) /* in: how many times this has been called + ulint n_iterations, /* in: how many times this has been called repeatedly without result: a high value means that we should search farther; we will search n_iterations / 5 of the unzip_LRU list, or nothing if n_iterations >= 5 */ + ibool have_LRU_mutex) { buf_block_t* block; ulint distance; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); /* optimistic */ /* Theoratically it should be much easier to find a victim from unzip_LRU as we can choose even a dirty block (as we'll @@ -519,7 +559,7 @@ buf_LRU_free_from_unzip_LRU_list( if we have done five iterations so far. */ if (UNIV_UNLIKELY(n_iterations >= 5) - || !buf_LRU_evict_from_unzip_LRU()) { + || !buf_LRU_evict_from_unzip_LRU(have_LRU_mutex)) { return(FALSE); } @@ -527,18 +567,25 @@ buf_LRU_free_from_unzip_LRU_list( distance = 100 + (n_iterations * UT_LIST_GET_LEN(buf_pool->unzip_LRU)) / 5; +restart: for (block = UT_LIST_GET_LAST(buf_pool->unzip_LRU); UNIV_LIKELY(block != NULL) && UNIV_LIKELY(distance > 0); block = UT_LIST_GET_PREV(unzip_LRU, block), distance--) { enum buf_lru_free_block_status freed; + mutex_enter(&block->mutex); + if (!block->in_unzip_LRU_list || !block->page.in_LRU_list + || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { + mutex_exit(&block->mutex); + goto restart; + } + ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(block->in_unzip_LRU_list); ut_ad(block->page.in_LRU_list); - mutex_enter(&block->mutex); - freed = buf_LRU_free_block(&block->page, FALSE, NULL); + freed = buf_LRU_free_block(&block->page, FALSE, NULL, have_LRU_mutex); mutex_exit(&block->mutex); switch (freed) { @@ -571,20 +618,22 @@ ibool buf_LRU_free_from_common_LRU_list( /*==============================*/ /* out: TRUE if freed */ - ulint n_iterations) /* in: how many times this has been called + ulint n_iterations, /* in: how many times this has been called repeatedly without result: a high value means that we should search farther; if n_iterations < 10, then we search n_iterations / 10 * buf_pool->curr_size pages from the end of the LRU list */ + ibool have_LRU_mutex) { buf_page_t* bpage; ulint distance; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); /* optimistic */ distance = 100 + (n_iterations * buf_pool->curr_size) / 10; +restart: for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); UNIV_LIKELY(bpage != NULL) && UNIV_LIKELY(distance > 0); bpage = UT_LIST_GET_PREV(LRU, bpage), distance--) { @@ -593,11 +642,25 @@ buf_LRU_free_from_common_LRU_list( mutex_t* block_mutex = buf_page_get_mutex(bpage); +retry_lock: + mutex_enter(block_mutex); + + if (block_mutex != buf_page_get_mutex(bpage)) { + mutex_exit(block_mutex); + block_mutex = buf_page_get_mutex(bpage); + goto retry_lock; + } + + if (!bpage->in_LRU_list + || !buf_page_in_file(bpage)) { + mutex_exit(block_mutex); + goto restart; + } + ut_ad(buf_page_in_file(bpage)); ut_ad(bpage->in_LRU_list); - mutex_enter(block_mutex); - freed = buf_LRU_free_block(bpage, TRUE, NULL); + freed = buf_LRU_free_block(bpage, TRUE, NULL, have_LRU_mutex); mutex_exit(block_mutex); switch (freed) { @@ -640,22 +703,33 @@ buf_LRU_search_and_free_block( n_iterations / 5 of the unzip_LRU list. */ { ibool freed = FALSE; + ibool have_LRU_mutex = FALSE; + + if (UT_LIST_GET_LEN(buf_pool->unzip_LRU)) + have_LRU_mutex = TRUE; - buf_pool_mutex_enter(); + /* optimistic search... */ + //buf_pool_mutex_enter(); + if (have_LRU_mutex) + mutex_enter(&LRU_list_mutex); - freed = buf_LRU_free_from_unzip_LRU_list(n_iterations); + freed = buf_LRU_free_from_unzip_LRU_list(n_iterations, have_LRU_mutex); if (!freed) { - freed = buf_LRU_free_from_common_LRU_list(n_iterations); + freed = buf_LRU_free_from_common_LRU_list(n_iterations, have_LRU_mutex); } + mutex_enter(&buf_pool_mutex); if (!freed) { buf_pool->LRU_flush_ended = 0; } else if (buf_pool->LRU_flush_ended > 0) { buf_pool->LRU_flush_ended--; } + mutex_exit(&buf_pool_mutex); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + if (have_LRU_mutex) + mutex_exit(&LRU_list_mutex); return(freed); } @@ -673,18 +747,22 @@ void buf_LRU_try_free_flushed_blocks(void) /*=================================*/ { - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&buf_pool_mutex); while (buf_pool->LRU_flush_ended > 0) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); buf_LRU_search_and_free_block(1); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&buf_pool_mutex); } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); } /********************************************************************** @@ -700,7 +778,9 @@ buf_LRU_buf_pool_running_out(void) { ibool ret = FALSE; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); + mutex_enter(&free_list_mutex); if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 4) { @@ -708,7 +788,9 @@ buf_LRU_buf_pool_running_out(void) ret = TRUE; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + mutex_exit(&free_list_mutex); return(ret); } @@ -725,9 +807,10 @@ buf_LRU_get_free_only(void) { buf_block_t* block; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); - block = (buf_block_t*) UT_LIST_GET_FIRST(buf_pool->free); + mutex_enter(&free_list_mutex); + block = (buf_block_t*) UT_LIST_GET_LAST(buf_pool->free); if (block) { ut_ad(block->page.in_free_list); @@ -735,7 +818,9 @@ buf_LRU_get_free_only(void) ut_ad(!block->page.in_flush_list); ut_ad(!block->page.in_LRU_list); ut_a(!buf_page_in_file(&block->page)); - UT_LIST_REMOVE(list, buf_pool->free, (&block->page)); + UT_LIST_REMOVE(free, buf_pool->free, (&block->page)); + + mutex_exit(&free_list_mutex); mutex_enter(&block->mutex); @@ -743,6 +828,8 @@ buf_LRU_get_free_only(void) UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE); mutex_exit(&block->mutex); + } else { + mutex_exit(&free_list_mutex); } return(block); @@ -767,7 +854,7 @@ buf_LRU_get_free_block( ibool mon_value_was = FALSE; ibool started_monitor = FALSE; loop: - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 20) { @@ -847,14 +934,16 @@ loop: if (UNIV_UNLIKELY(zip_size)) { ibool lru; page_zip_set_size(&block->page.zip, zip_size); - block->page.zip.data = buf_buddy_alloc(zip_size, &lru); + mutex_enter(&LRU_list_mutex); + block->page.zip.data = buf_buddy_alloc(zip_size, &lru, FALSE); + mutex_exit(&LRU_list_mutex); UNIV_MEM_DESC(block->page.zip.data, zip_size, block); } else { page_zip_set_size(&block->page.zip, 0); block->page.zip.data = NULL; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); if (started_monitor) { srv_print_innodb_monitor = mon_value_was; @@ -866,7 +955,7 @@ loop: /* If no block was in the free list, search from the end of the LRU list and try to free a block there */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); freed = buf_LRU_search_and_free_block(n_iterations); @@ -915,18 +1004,21 @@ loop: os_aio_simulated_wake_handler_threads(); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&buf_pool_mutex); if (buf_pool->LRU_flush_ended > 0) { /* We have written pages in an LRU flush. To make the insert buffer more efficient, we try to move these pages to the free list. */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); buf_LRU_try_free_flushed_blocks(); } else { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); } if (n_iterations > 10) { @@ -951,7 +1043,8 @@ buf_LRU_old_adjust_len(void) ulint new_len; ut_a(buf_pool->LRU_old); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); #if 3 * (BUF_LRU_OLD_MIN_LEN / 8) <= BUF_LRU_OLD_TOLERANCE + 5 # error "3 * (BUF_LRU_OLD_MIN_LEN / 8) <= BUF_LRU_OLD_TOLERANCE + 5" #endif @@ -1009,7 +1102,8 @@ buf_LRU_old_init(void) { buf_page_t* bpage; - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN); /* We first initialize all blocks in the LRU list as old and then use @@ -1041,13 +1135,14 @@ buf_unzip_LRU_remove_block_if_needed( ut_ad(buf_pool); ut_ad(bpage); ut_ad(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); if (buf_page_belongs_to_unzip_LRU(bpage)) { buf_block_t* block = (buf_block_t*) bpage; ut_ad(block->in_unzip_LRU_list); - ut_d(block->in_unzip_LRU_list = FALSE); + block->in_unzip_LRU_list = FALSE; UT_LIST_REMOVE(unzip_LRU, buf_pool->unzip_LRU, block); } @@ -1063,7 +1158,8 @@ buf_LRU_remove_block( { ut_ad(buf_pool); ut_ad(bpage); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); ut_a(buf_page_in_file(bpage)); @@ -1126,12 +1222,13 @@ buf_unzip_LRU_add_block( { ut_ad(buf_pool); ut_ad(block); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); ut_ad(!block->in_unzip_LRU_list); - ut_d(block->in_unzip_LRU_list = TRUE); + block->in_unzip_LRU_list = TRUE; if (old) { UT_LIST_ADD_LAST(unzip_LRU, buf_pool->unzip_LRU, block); @@ -1152,7 +1249,8 @@ buf_LRU_add_block_to_end_low( ut_ad(buf_pool); ut_ad(bpage); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); ut_a(buf_page_in_file(bpage)); @@ -1212,7 +1310,8 @@ buf_LRU_add_block_low( { ut_ad(buf_pool); ut_ad(bpage); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); ut_a(buf_page_in_file(bpage)); ut_ad(!bpage->in_LRU_list); @@ -1331,22 +1430,23 @@ buf_LRU_free_block( buf_page_t* bpage, /* in: block to be freed */ ibool zip, /* in: TRUE if should remove also the compressed page of an uncompressed page */ - ibool* buf_pool_mutex_released) + ibool* buf_pool_mutex_released, /* in: pointer to a variable that will be assigned TRUE if buf_pool_mutex was temporarily released, or NULL */ + ibool have_LRU_mutex) { buf_page_t* b = NULL; mutex_t* block_mutex = buf_page_get_mutex(bpage); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(block_mutex)); ut_ad(buf_page_in_file(bpage)); - ut_ad(bpage->in_LRU_list); + //ut_ad(bpage->in_LRU_list); ut_ad(!bpage->in_flush_list == !bpage->oldest_modification); UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage); - if (!buf_page_can_relocate(bpage)) { + if (!bpage->in_LRU_list || !block_mutex || !buf_page_can_relocate(bpage)) { /* Do not free buffer-fixed or I/O-fixed blocks. */ return(BUF_LRU_NOT_FREED); @@ -1378,15 +1478,15 @@ buf_LRU_free_block( If it cannot be allocated (without freeing a block from the LRU list), refuse to free bpage. */ alloc: - buf_pool_mutex_exit_forbid(); - b = buf_buddy_alloc(sizeof *b, NULL); - buf_pool_mutex_exit_allow(); + //buf_pool_mutex_exit_forbid(); + b = buf_buddy_alloc(sizeof *b, NULL, FALSE); + //buf_pool_mutex_exit_allow(); if (UNIV_UNLIKELY(!b)) { return(BUF_LRU_CANNOT_RELOCATE); } - memcpy(b, bpage, sizeof *b); + //memcpy(b, bpage, sizeof *b); } #ifdef UNIV_DEBUG @@ -1397,6 +1497,39 @@ alloc: } #endif /* UNIV_DEBUG */ + /* not to break latch order, must re-enter block_mutex */ + mutex_exit(block_mutex); + + if (!have_LRU_mutex) + mutex_enter(&LRU_list_mutex); /* optimistic */ + rw_lock_x_lock(&page_hash_latch); + mutex_enter(block_mutex); + + /* recheck states of block */ + if (!bpage->in_LRU_list || block_mutex != buf_page_get_mutex(bpage) + || !buf_page_can_relocate(bpage)) { +not_freed: + if (b) { + buf_buddy_free(b, sizeof *b, TRUE); + } + if (!have_LRU_mutex) + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + return(BUF_LRU_NOT_FREED); + } else if (zip || !bpage->zip.data) { + if (bpage->oldest_modification) + goto not_freed; + } else if (bpage->oldest_modification) { + if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { + ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY); + goto not_freed; + } + } + + if (b) { + memcpy(b, bpage, sizeof *b); + } + if (buf_LRU_block_remove_hashed_page(bpage, zip) != BUF_BLOCK_ZIP_FREE) { ut_a(bpage->buf_fix_count == 0); @@ -1408,6 +1541,10 @@ alloc: ut_a(!buf_page_hash_get(bpage->space, bpage->offset)); + while (prev_b && !prev_b->in_LRU_list) { + prev_b = UT_LIST_GET_PREV(LRU, prev_b); + } + b->state = b->oldest_modification ? BUF_BLOCK_ZIP_DIRTY : BUF_BLOCK_ZIP_PAGE; @@ -1482,6 +1619,7 @@ alloc: buf_LRU_add_block_low(b, buf_page_is_old(b)); } + mutex_enter(&flush_list_mutex); if (b->state == BUF_BLOCK_ZIP_PAGE) { buf_LRU_insert_zip_clean(b); } else { @@ -1490,22 +1628,23 @@ alloc: ut_ad(b->in_flush_list); ut_d(bpage->in_flush_list = FALSE); - prev = UT_LIST_GET_PREV(list, b); - UT_LIST_REMOVE(list, buf_pool->flush_list, b); + prev = UT_LIST_GET_PREV(flush_list, b); + UT_LIST_REMOVE(flush_list, buf_pool->flush_list, b); if (prev) { ut_ad(prev->in_flush_list); UT_LIST_INSERT_AFTER( - list, + flush_list, buf_pool->flush_list, prev, b); } else { UT_LIST_ADD_FIRST( - list, + flush_list, buf_pool->flush_list, b); } } + mutex_exit(&flush_list_mutex); bpage->zip.data = NULL; page_zip_set_size(&bpage->zip, 0); @@ -1521,7 +1660,9 @@ alloc: *buf_pool_mutex_released = TRUE; } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); mutex_exit(block_mutex); /* Remove possible adaptive hash index on the page. @@ -1553,7 +1694,9 @@ alloc: : BUF_NO_CHECKSUM_MAGIC); } - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + if (have_LRU_mutex) + mutex_enter(&LRU_list_mutex); mutex_enter(block_mutex); if (b) { @@ -1563,13 +1706,17 @@ alloc: mutex_exit(&buf_pool_zip_mutex); } - buf_LRU_block_free_hashed_page((buf_block_t*) bpage); + buf_LRU_block_free_hashed_page((buf_block_t*) bpage, FALSE); } else { /* The block_mutex should have been released by buf_LRU_block_remove_hashed_page() when it returns BUF_BLOCK_ZIP_FREE. */ ut_ad(block_mutex == &buf_pool_zip_mutex); mutex_enter(block_mutex); + + if (!have_LRU_mutex) + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); } return(BUF_LRU_FREED); @@ -1581,12 +1728,13 @@ UNIV_INTERN void buf_LRU_block_free_non_file_page( /*=============================*/ - buf_block_t* block) /* in: block, must not contain a file page */ + buf_block_t* block, /* in: block, must not contain a file page */ + ibool have_page_hash_mutex) { void* data; ut_ad(block); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(&block->mutex)); switch (buf_block_get_state(block)) { @@ -1620,15 +1768,17 @@ buf_LRU_block_free_non_file_page( if (data) { block->page.zip.data = NULL; mutex_exit(&block->mutex); - buf_pool_mutex_exit_forbid(); - buf_buddy_free(data, page_zip_get_size(&block->page.zip)); - buf_pool_mutex_exit_allow(); + //buf_pool_mutex_exit_forbid(); + buf_buddy_free(data, page_zip_get_size(&block->page.zip), have_page_hash_mutex); + //buf_pool_mutex_exit_allow(); mutex_enter(&block->mutex); page_zip_set_size(&block->page.zip, 0); } - UT_LIST_ADD_FIRST(list, buf_pool->free, (&block->page)); + mutex_enter(&free_list_mutex); + UT_LIST_ADD_FIRST(free, buf_pool->free, (&block->page)); ut_d(block->page.in_free_list = TRUE); + mutex_exit(&free_list_mutex); UNIV_MEM_ASSERT_AND_FREE(block->frame, UNIV_PAGE_SIZE); } @@ -1657,7 +1807,11 @@ buf_LRU_block_remove_hashed_page( { const buf_page_t* hashed_bpage; ut_ad(bpage); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX)); +#endif ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE); @@ -1758,7 +1912,9 @@ buf_LRU_block_remove_hashed_page( #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG mutex_exit(buf_page_get_mutex(bpage)); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); buf_print(); buf_LRU_print(); buf_validate(); @@ -1781,14 +1937,14 @@ buf_LRU_block_remove_hashed_page( ut_a(bpage->zip.data); ut_a(buf_page_get_zip_size(bpage)); - UT_LIST_REMOVE(list, buf_pool->zip_clean, bpage); + UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, bpage); mutex_exit(&buf_pool_zip_mutex); - buf_pool_mutex_exit_forbid(); + //buf_pool_mutex_exit_forbid(); buf_buddy_free(bpage->zip.data, - page_zip_get_size(&bpage->zip)); - buf_buddy_free(bpage, sizeof(*bpage)); - buf_pool_mutex_exit_allow(); + page_zip_get_size(&bpage->zip), TRUE); + buf_buddy_free(bpage, sizeof(*bpage), TRUE); + //buf_pool_mutex_exit_allow(); UNIV_MEM_UNDESC(bpage); return(BUF_BLOCK_ZIP_FREE); @@ -1807,9 +1963,9 @@ buf_LRU_block_remove_hashed_page( bpage->zip.data = NULL; mutex_exit(&((buf_block_t*) bpage)->mutex); - buf_pool_mutex_exit_forbid(); - buf_buddy_free(data, page_zip_get_size(&bpage->zip)); - buf_pool_mutex_exit_allow(); + //buf_pool_mutex_exit_forbid(); + buf_buddy_free(data, page_zip_get_size(&bpage->zip), TRUE); + //buf_pool_mutex_exit_allow(); mutex_enter(&((buf_block_t*) bpage)->mutex); page_zip_set_size(&bpage->zip, 0); } @@ -1835,15 +1991,16 @@ static void buf_LRU_block_free_hashed_page( /*===========================*/ - buf_block_t* block) /* in: block, must contain a file page and + buf_block_t* block, /* in: block, must contain a file page and be in a state where it can be freed */ + ibool have_page_hash_mutex) { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(&block->mutex)); buf_block_set_state(block, BUF_BLOCK_MEMORY); - buf_LRU_block_free_non_file_page(block); + buf_LRU_block_free_non_file_page(block, have_page_hash_mutex); } /************************************************************************ @@ -1861,7 +2018,8 @@ buf_LRU_stat_update(void) goto func_exit; } - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&buf_pool_mutex); /* Update the index. */ item = &buf_LRU_stat_arr[buf_LRU_stat_arr_ind]; @@ -1875,7 +2033,8 @@ buf_LRU_stat_update(void) /* Put current entry in the array. */ memcpy(item, &buf_LRU_stat_cur, sizeof *item); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); func_exit: /* Clear the current entry. */ @@ -1897,7 +2056,8 @@ buf_LRU_validate(void) ulint LRU_pos; ut_ad(buf_pool); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { @@ -1956,15 +2116,21 @@ buf_LRU_validate(void) ut_a(buf_pool->LRU_old_len == old_len); } - UT_LIST_VALIDATE(list, buf_page_t, buf_pool->free); + mutex_exit(&LRU_list_mutex); + mutex_enter(&free_list_mutex); + + UT_LIST_VALIDATE(free, buf_page_t, buf_pool->free); for (bpage = UT_LIST_GET_FIRST(buf_pool->free); bpage != NULL; - bpage = UT_LIST_GET_NEXT(list, bpage)) { + bpage = UT_LIST_GET_NEXT(free, bpage)) { ut_a(buf_page_get_state(bpage) == BUF_BLOCK_NOT_USED); } + mutex_exit(&free_list_mutex); + mutex_enter(&LRU_list_mutex); + UT_LIST_VALIDATE(unzip_LRU, buf_block_t, buf_pool->unzip_LRU); for (block = UT_LIST_GET_FIRST(buf_pool->unzip_LRU); @@ -1976,7 +2142,8 @@ buf_LRU_validate(void) ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); return(TRUE); } #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ @@ -1992,7 +2159,8 @@ buf_LRU_print(void) const buf_page_t* bpage; ut_ad(buf_pool); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&LRU_list_mutex); fprintf(stderr, "Pool ulint clock %lu\n", (ulong) buf_pool->ulint_clock); @@ -2055,6 +2223,7 @@ buf_LRU_print(void) bpage = UT_LIST_GET_NEXT(LRU, bpage); } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&LRU_list_mutex); } #endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ === modified file 'storage/xtradb/buf/buf0rea.c' --- a/storage/xtradb/buf/buf0rea.c 2009-05-04 02:45:47 +0000 +++ b/storage/xtradb/buf/buf0rea.c 2009-07-06 05:47:15 +0000 @@ -134,6 +134,46 @@ buf_read_page_low( bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip, tablespace_version, offset); if (bpage == NULL) { + /* bugfix: http://bugs.mysql.com/bug.php?id=43948 */ + if (recv_recovery_is_on() && *err == DB_TABLESPACE_DELETED) { + /* hashed log recs must be treated here */ + recv_addr_t* recv_addr; + + mutex_enter(&(recv_sys->mutex)); + + if (recv_sys->apply_log_recs == FALSE) { + mutex_exit(&(recv_sys->mutex)); + goto not_to_recover; + } + + /* recv_get_fil_addr_struct() */ + recv_addr = HASH_GET_FIRST(recv_sys->addr_hash, + hash_calc_hash(ut_fold_ulint_pair(space, offset), + recv_sys->addr_hash)); + while (recv_addr) { + if ((recv_addr->space == space) + && (recv_addr->page_no == offset)) { + break; + } + recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); + } + + if ((recv_addr == NULL) + || (recv_addr->state == RECV_BEING_PROCESSED) + || (recv_addr->state == RECV_PROCESSED)) { + mutex_exit(&(recv_sys->mutex)); + goto not_to_recover; + } + + fprintf(stderr, " (cannot find space: %lu)", space); + recv_addr->state = RECV_PROCESSED; + + ut_a(recv_sys->n_addrs); + recv_sys->n_addrs--; + + mutex_exit(&(recv_sys->mutex)); + } +not_to_recover: return(0); } @@ -246,18 +286,22 @@ buf_read_ahead_random( LRU_recent_limit = buf_LRU_get_recent_limit(); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&buf_pool_mutex); if (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); return(0); } + mutex_exit(&buf_pool_mutex); /* Count how many blocks in the area have been recently accessed, that is, reside near the start of the LRU list. */ + rw_lock_s_lock(&page_hash_latch); for (i = low; i < high; i++) { const buf_page_t* bpage = buf_page_hash_get(space, i); @@ -269,13 +313,15 @@ buf_read_ahead_random( if (recent_blocks >= BUF_READ_AHEAD_RANDOM_THRESHOLD) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); goto read_ahead; } } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); /* Do nothing */ return(0); @@ -469,10 +515,12 @@ buf_read_ahead_linear( tablespace_version = fil_space_get_version(space); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&buf_pool_mutex); if (high > fil_space_get_size(space)) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); /* The area is not whole, return */ return(0); @@ -480,10 +528,12 @@ buf_read_ahead_linear( if (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&buf_pool_mutex); return(0); } + mutex_exit(&buf_pool_mutex); /* Check that almost all pages in the area have been accessed; if offset == low, the accesses must be in a descending order, otherwise, @@ -497,6 +547,7 @@ buf_read_ahead_linear( fail_count = 0; + rw_lock_s_lock(&page_hash_latch); for (i = low; i < high; i++) { bpage = buf_page_hash_get(space, i); @@ -520,7 +571,8 @@ buf_read_ahead_linear( * LINEAR_AREA_THRESHOLD_COEF) { /* Too many failures: return */ - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); return(0); } @@ -531,7 +583,8 @@ buf_read_ahead_linear( bpage = buf_page_hash_get(space, offset); if (bpage == NULL) { - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); return(0); } @@ -557,7 +610,8 @@ buf_read_ahead_linear( pred_offset = fil_page_get_prev(frame); succ_offset = fil_page_get_next(frame); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); if ((offset == low) && (succ_offset == offset + 1)) { @@ -770,11 +824,11 @@ buf_read_recv_pages( while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) { os_aio_simulated_wake_handler_threads(); - os_thread_sleep(500000); + os_thread_sleep(10000); count++; - if (count > 100) { + if (count > 5000) { fprintf(stderr, "InnoDB: Error: InnoDB has waited for" " 50 seconds for pending\n" === modified file 'storage/xtradb/dict/dict0boot.c' --- a/storage/xtradb/dict/dict0boot.c 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/dict/dict0boot.c 2009-06-25 01:43:25 +0000 @@ -265,6 +265,7 @@ dict_boot(void) system tables */ /*-------------------------*/ table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, 0); + table->n_mysql_handles_opened = 1; /* for pin */ dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0); dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0); @@ -314,6 +315,7 @@ dict_boot(void) /*-------------------------*/ table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, 0); + table->n_mysql_handles_opened = 1; /* for pin */ dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0); dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4); @@ -346,6 +348,7 @@ dict_boot(void) /*-------------------------*/ table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, 0); + table->n_mysql_handles_opened = 1; /* for pin */ dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0); dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0); @@ -388,6 +391,7 @@ dict_boot(void) /*-------------------------*/ table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, 0); + table->n_mysql_handles_opened = 1; /* for pin */ dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 0); dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4); === modified file 'storage/xtradb/dict/dict0crea.c' --- a/storage/xtradb/dict/dict0crea.c 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/dict/dict0crea.c 2009-06-25 01:43:25 +0000 @@ -1184,6 +1184,9 @@ dict_create_or_check_foreign_constraint_ /* Foreign constraint system tables have already been created, and they are ok */ + table1->n_mysql_handles_opened = 1; /* for pin */ + table2->n_mysql_handles_opened = 1; /* for pin */ + mutex_exit(&(dict_sys->mutex)); return(DB_SUCCESS); @@ -1265,6 +1268,11 @@ dict_create_or_check_foreign_constraint_ trx_commit_for_mysql(trx); + table1 = dict_table_get_low("SYS_FOREIGN"); + table2 = dict_table_get_low("SYS_FOREIGN_COLS"); + table1->n_mysql_handles_opened = 1; /* for pin */ + table2->n_mysql_handles_opened = 1; /* for pin */ + row_mysql_unlock_data_dictionary(trx); trx_free_for_mysql(trx); === modified file 'storage/xtradb/dict/dict0dict.c' --- a/storage/xtradb/dict/dict0dict.c 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/dict/dict0dict.c 2009-08-03 07:14:02 +0000 @@ -545,6 +545,8 @@ dict_table_get_on_id( table = dict_table_get_on_id_low(table_id); + dict_table_LRU_trim(table); + mutex_exit(&(dict_sys->mutex)); return(table); @@ -659,6 +661,8 @@ dict_table_get( table->n_mysql_handles_opened++; } + dict_table_LRU_trim(table); + mutex_exit(&(dict_sys->mutex)); if (table != NULL) { @@ -1153,6 +1157,64 @@ dict_table_remove_from_cache( dict_mem_table_free(table); } +/************************************************************************** +Frees tables from the end of table_LRU if the dictionary cache occupies +too much space. */ +UNIV_INTERN +void +dict_table_LRU_trim( +/*================*/ + dict_table_t* self) +{ + dict_table_t* table; + dict_table_t* prev_table; + dict_foreign_t* foreign; + ulint n_removed; + ulint n_have_parent; + ulint cached_foreign_tables; + +#ifdef UNIV_SYNC_DEBUG + ut_ad(mutex_own(&(dict_sys->mutex))); +#endif /* UNIV_SYNC_DEBUG */ + +retry: + n_removed = n_have_parent = 0; + table = UT_LIST_GET_LAST(dict_sys->table_LRU); + + while ( srv_dict_size_limit && table + && ((dict_sys->table_hash->n_cells + + dict_sys->table_id_hash->n_cells) * sizeof(hash_cell_t) + + dict_sys->size) > srv_dict_size_limit ) { + prev_table = UT_LIST_GET_PREV(table_LRU, table); + + if (table == self || table->n_mysql_handles_opened) + goto next_loop; + + cached_foreign_tables = 0; + foreign = UT_LIST_GET_FIRST(table->foreign_list); + while (foreign != NULL) { + if (foreign->referenced_table) + cached_foreign_tables++; + foreign = UT_LIST_GET_NEXT(foreign_list, foreign); + } + + if (cached_foreign_tables == 0) { + dict_table_remove_from_cache(table); + n_removed++; + } else { + n_have_parent++; + } +next_loop: + table = prev_table; + } + + if ( srv_dict_size_limit && n_have_parent && n_removed + && ((dict_sys->table_hash->n_cells + + dict_sys->table_id_hash->n_cells) * sizeof(hash_cell_t) + + dict_sys->size) > srv_dict_size_limit ) + goto retry; +} + /******************************************************************** If the given column name is reserved for InnoDB system columns, return TRUE. */ @@ -2987,7 +3049,7 @@ scan_more: } else if (quote) { /* Within quotes: do not look for starting quotes or comments. */ - } else if (*sptr == '"' || *sptr == '`') { + } else if (*sptr == '"' || *sptr == '`' || *sptr == '\'') { /* Starting quote: remember the quote character. */ quote = *sptr; } else if (*sptr == '#' @@ -4276,7 +4338,8 @@ dict_table_print_low( ut_ad(mutex_own(&(dict_sys->mutex))); - dict_update_statistics_low(table, TRUE); + if (srv_stats_auto_update) + dict_update_statistics_low(table, TRUE); fprintf(stderr, "--------------------------------------\n" === modified file 'storage/xtradb/dict/dict0load.c' --- a/storage/xtradb/dict/dict0load.c 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/dict/dict0load.c 2009-06-25 01:43:25 +0000 @@ -223,7 +223,7 @@ loop: /* The table definition was corrupt if there is no index */ - if (dict_table_get_first_index(table)) { + if (srv_stats_auto_update && dict_table_get_first_index(table)) { dict_update_statistics_low(table, TRUE); } === modified file 'storage/xtradb/fil/fil0fil.c' --- a/storage/xtradb/fil/fil0fil.c 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/fil/fil0fil.c 2009-06-25 01:43:25 +0000 @@ -42,6 +42,10 @@ Created 10/25/1995 Heikki Tuuri #include "mtr0log.h" #include "dict0dict.h" #include "page0zip.h" +#include "trx0trx.h" +#include "trx0sys.h" +#include "pars0pars.h" +#include "row0mysql.h" /* @@ -2977,7 +2981,7 @@ fil_open_single_table_tablespace( ut_a(flags != DICT_TF_COMPACT); file = os_file_create_simple_no_error_handling( - filepath, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success); + filepath, OS_FILE_OPEN, OS_FILE_READ_WRITE, &success); if (!success) { /* The following call prints an error message */ os_file_get_last_error(TRUE); @@ -3025,6 +3029,275 @@ fil_open_single_table_tablespace( space_id = fsp_header_get_space_id(page); space_flags = fsp_header_get_flags(page); + if (srv_expand_import && (space_id != id || space_flags != flags)) { + dulint old_id[31]; + dulint new_id[31]; + ulint root_page[31]; + ulint n_index; + os_file_t info_file = -1; + char* info_file_path; + ulint i; + int len; + ib_uint64_t current_lsn; + + current_lsn = log_get_lsn(); + + /* overwrite fsp header */ + fsp_header_init_fields(page, id, flags); + mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id); + space_id = id; + space_flags = flags; + if (mach_read_ull(page + FIL_PAGE_FILE_FLUSH_LSN) > current_lsn) + mach_write_ull(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn); + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + srv_use_checksums + ? buf_calc_page_new_checksum(page) + : BUF_NO_CHECKSUM_MAGIC); + mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, + srv_use_checksums + ? buf_calc_page_old_checksum(page) + : BUF_NO_CHECKSUM_MAGIC); + success = os_file_write(filepath, file, page, 0, 0, UNIV_PAGE_SIZE); + + /* get file size */ + ulint size_low, size_high, size; + ib_int64_t size_bytes; + os_file_get_size(file, &size_low, &size_high); + size_bytes = (((ib_int64_t)size_high) << 32) + + (ib_int64_t)size_low; + + /* get cruster index information */ + dict_table_t* table; + dict_index_t* index; + table = dict_table_get_low(name); + index = dict_table_get_first_index(table); + ut_a(index->page==3); + + + /* read metadata from .exp file */ + n_index = 0; + bzero(old_id, sizeof(old_id)); + bzero(new_id, sizeof(new_id)); + bzero(root_page, sizeof(root_page)); + + info_file_path = fil_make_ibd_name(name, FALSE); + len = strlen(info_file_path); + info_file_path[len - 3] = 'e'; + info_file_path[len - 2] = 'x'; + info_file_path[len - 1] = 'p'; + + info_file = os_file_create_simple_no_error_handling( + info_file_path, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success); + if (!success) { + fprintf(stderr, "InnoDB: cannot open %s\n", info_file_path); + goto skip_info; + } + success = os_file_read(info_file, page, 0, 0, UNIV_PAGE_SIZE); + if (!success) { + fprintf(stderr, "InnoDB: cannot read %s\n", info_file_path); + goto skip_info; + } + if (mach_read_from_4(page) != 0x78706f72UL + || mach_read_from_4(page + 4) != 0x74696e66UL) { + fprintf(stderr, "InnoDB: %s seems not to be a correct .exp file\n", info_file_path); + goto skip_info; + } + + fprintf(stderr, "InnoDB: import: extended import of %s is started.\n", name); + + n_index = mach_read_from_4(page + 8); + fprintf(stderr, "InnoDB: import: %lu indexes are detected.\n", (ulong)n_index); + for (i = 0; i < n_index; i++) { + new_id[i] = + dict_table_get_index_on_name(table, + (page + (i + 1) * 512 + 12))->id; + old_id[i] = mach_read_from_8(page + (i + 1) * 512); + root_page[i] = mach_read_from_4(page + (i + 1) * 512 + 8); + } + +skip_info: + if (info_file != -1) + os_file_close(info_file); + + /* + if (size_bytes >= 1024 * 1024) { + size_bytes = ut_2pow_round(size_bytes, 1024 * 1024); + } + */ + if (!(flags & DICT_TF_ZSSIZE_MASK)) { + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + size = (ulint) (size_bytes / UNIV_PAGE_SIZE); + /* over write space id of all pages */ + ib_int64_t offset; + + rec_offs_init(offsets_); + + fprintf(stderr, "InnoDB: Progress in %:"); + + for (offset = 0; offset < size_bytes; offset += UNIV_PAGE_SIZE) { + success = os_file_read(file, page, + (ulint)(offset & 0xFFFFFFFFUL), + (ulint)(offset >> 32), UNIV_PAGE_SIZE); + if (mach_read_from_4(page + FIL_PAGE_OFFSET) || !offset) { + mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id); + + for (i = 0; i < n_index; i++) { + if (offset / UNIV_PAGE_SIZE == root_page[i]) { + /* this is index root page */ + mach_write_to_4(page + FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + FSEG_HDR_SPACE, id); + mach_write_to_4(page + FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + FSEG_HDR_SPACE, id); + break; + } + } + + if (fil_page_get_type(page) == FIL_PAGE_INDEX) { + dulint tmp = mach_read_from_8(page + (PAGE_HEADER + PAGE_INDEX_ID)); + + if (mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL) == 0 + && ut_dulint_cmp(old_id[0], tmp) == 0) { + /* leaf page of cluster index, reset trx_id of records */ + rec_t* rec; + rec_t* supremum; + ulint n_recs; + + supremum = page_get_supremum_rec(page); + rec = page_rec_get_next(page_get_infimum_rec(page)); + n_recs = page_get_n_recs(page); + + while (rec && rec != supremum && n_recs > 0) { + ulint offset = index->trx_id_offset; + if (!offset) { + offsets = rec_get_offsets(rec, index, offsets, + ULINT_UNDEFINED, &heap); + offset = row_get_trx_id_offset(rec, index, offsets); + } + trx_write_trx_id(rec + offset, ut_dulint_create(0, 1)); + rec = page_rec_get_next(rec); + n_recs--; + } + } + + for (i = 0; i < n_index; i++) { + if (ut_dulint_cmp(old_id[i], tmp) == 0) { + mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), new_id[i]); + break; + } + } + } + + if (mach_read_ull(page + FIL_PAGE_LSN) > current_lsn) { + mach_write_ull(page + FIL_PAGE_LSN, current_lsn); + mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, + current_lsn); + } + + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + srv_use_checksums + ? buf_calc_page_new_checksum(page) + : BUF_NO_CHECKSUM_MAGIC); + mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, + srv_use_checksums + ? buf_calc_page_old_checksum(page) + : BUF_NO_CHECKSUM_MAGIC); + + success = os_file_write(filepath, file, page, + (ulint)(offset & 0xFFFFFFFFUL), + (ulint)(offset >> 32), UNIV_PAGE_SIZE); + } + + if (size_bytes + && ((ib_int64_t)((offset + UNIV_PAGE_SIZE) * 100) / size_bytes) + != ((offset * 100) / size_bytes)) { + fprintf(stderr, " %lu", + (ulong)((ib_int64_t)((offset + UNIV_PAGE_SIZE) * 100) / size_bytes)); + } + } + + fprintf(stderr, " done.\n"); + + /* update SYS_INDEXES set root page */ + index = dict_table_get_first_index(table); + while (index) { + for (i = 0; i < n_index; i++) { + if (ut_dulint_cmp(new_id[i], index->id) == 0) { + break; + } + } + + if (i != n_index + && root_page[i] != index->page) { + /* must update */ + ulint error; + trx_t* trx; + pars_info_t* info = NULL; + + trx = trx_allocate_for_mysql(); + trx->op_info = "extended import"; + + info = pars_info_create(); + + pars_info_add_dulint_literal(info, "indexid", new_id[i]); + pars_info_add_int4_literal(info, "new_page", (lint) root_page[i]); + + error = que_eval_sql(info, + "PROCEDURE UPDATE_INDEX_PAGE () IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES" + " SET PAGE_NO = :new_page" + " WHERE ID = :indexid;\n" + "COMMIT WORK;\n" + "END;\n", + FALSE, trx); + + if (error != DB_SUCCESS) { + fprintf(stderr, "InnoDB: failed to update SYS_INDEXES\n"); + } + + trx_commit_for_mysql(trx); + + trx_free_for_mysql(trx); + + index->page = root_page[i]; + } + + index = dict_table_get_next_index(index); + } + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } else { + /* zip page? */ + size = (ulint) + (size_bytes + / dict_table_flags_to_zip_size(flags)); + fprintf(stderr, "InnoDB: import: table %s seems to be in newer format." + " It may not be able to treated for now.\n", name); + } + /* .exp file should be removed */ + success = os_file_delete(info_file_path); + if (!success) { + success = os_file_delete_if_exists(info_file_path); + } + mem_free(info_file_path); + + fil_system_t* system = fil_system; + mutex_enter(&(system->mutex)); + fil_node_t* node = NULL; + fil_space_t* space; + space = fil_space_get_by_id(id); + if (space) + node = UT_LIST_GET_FIRST(space->chain); + if (node && node->size < size) { + space->size += (size - node->size); + node->size = size; + } + mutex_exit(&(system->mutex)); + } + ut_free(buf2); if (UNIV_UNLIKELY(space_id != id || space_flags != flags)) { === modified file 'storage/xtradb/handler/ha_innodb.cc' --- a/storage/xtradb/handler/ha_innodb.cc 2009-06-18 12:39:21 +0000 +++ b/storage/xtradb/handler/ha_innodb.cc 2009-08-03 20:09:53 +0000 @@ -157,6 +157,7 @@ static long innobase_mirrored_log_groups innobase_autoinc_lock_mode; static unsigned long innobase_read_io_threads, innobase_write_io_threads; +static my_bool innobase_thread_concurrency_timer_based; static long long innobase_buffer_pool_size, innobase_log_file_size; /* The default values for the following char* start-up parameters @@ -488,6 +489,8 @@ static SHOW_VAR innodb_status_variables[ (char*) &export_vars.innodb_dblwr_pages_written, SHOW_LONG}, {"dblwr_writes", (char*) &export_vars.innodb_dblwr_writes, SHOW_LONG}, + {"dict_tables", + (char*) &export_vars.innodb_dict_tables, SHOW_LONG}, {"have_atomic_builtins", (char*) &export_vars.innodb_have_atomic_builtins, SHOW_BOOL}, {"log_waits", @@ -2100,77 +2103,6 @@ mem_free_and_error: goto error; } -#ifdef HAVE_REPLICATION -#ifdef MYSQL_SERVER - if(innobase_overwrite_relay_log_info) { - /* If InnoDB progressed from relay-log.info, overwrite it */ - if (fname[0] == '\0') { - fprintf(stderr, - "InnoDB: something wrong with relay-info.log. InnoDB will not overwrite it.\n"); - } else if (0 != strcmp(fname, trx_sys_mysql_master_log_name) - || pos != trx_sys_mysql_master_log_pos) { - /* Overwrite relay-log.info */ - bzero((char*) &info_file, sizeof(info_file)); - fn_format(fname, relay_log_info_file, mysql_data_home, "", 4+32); - - int error = 0; - - if (!access(fname,F_OK)) { - /* exist */ - if ((info_fd = my_open(fname, O_RDWR|O_BINARY, MYF(MY_WME))) < 0) { - error = 1; - } else if (init_io_cache(&info_file, info_fd, IO_SIZE*2, - WRITE_CACHE, 0L, 0, MYF(MY_WME))) { - error = 1; - } - - if (error) { - if (info_fd >= 0) - my_close(info_fd, MYF(0)); - goto skip_overwrite; - } - } else { - error = 1; - goto skip_overwrite; - } - - char buff[FN_REFLEN*2+22*2+4], *pos; - - my_b_seek(&info_file, 0L); - pos=strmov(buff, trx_sys_mysql_relay_log_name); - *pos++='\n'; - pos=longlong2str(trx_sys_mysql_relay_log_pos, pos, 10); - *pos++='\n'; - pos=strmov(pos, trx_sys_mysql_master_log_name); - *pos++='\n'; - pos=longlong2str(trx_sys_mysql_master_log_pos, pos, 10); - *pos='\n'; - - if (my_b_write(&info_file, (uchar*) buff, (size_t) (pos-buff)+1)) - error = 1; - if (flush_io_cache(&info_file)) - error = 1; - - end_io_cache(&info_file); - if (info_fd >= 0) - my_close(info_fd, MYF(0)); -skip_overwrite: - if (error) { - fprintf(stderr, - "InnoDB: ERROR: error occured during overwriting relay-log.info.\n"); - } else { - fprintf(stderr, - "InnoDB: relay-log.info was overwritten.\n"); - } - } else { - fprintf(stderr, - "InnoDB: InnoDB and relay-log.info are synchronized. InnoDB will not overwrite it.\n"); - } - } -#endif /* MYSQL_SERVER */ -#endif /* HAVE_REPLICATION */ - - srv_extra_undoslots = (ibool) innobase_extra_undoslots; /* -------------- Log files ---------------------------*/ @@ -2266,6 +2198,9 @@ skip_overwrite: srv_n_log_files = (ulint) innobase_log_files_in_group; srv_log_file_size = (ulint) innobase_log_file_size; + srv_thread_concurrency_timer_based = + (ibool) innobase_thread_concurrency_timer_based; + #ifdef UNIV_LOG_ARCHIVE srv_log_archive_on = (ulint) innobase_log_archive; #endif /* UNIV_LOG_ARCHIVE */ @@ -2280,6 +2215,7 @@ skip_overwrite: srv_n_write_io_threads = (ulint) innobase_write_io_threads; srv_read_ahead &= 3; + srv_adaptive_checkpoint %= 3; srv_force_recovery = (ulint) innobase_force_recovery; @@ -2329,6 +2265,76 @@ skip_overwrite: goto mem_free_and_error; } +#ifdef HAVE_REPLICATION +#ifdef MYSQL_SERVER + if(innobase_overwrite_relay_log_info) { + /* If InnoDB progressed from relay-log.info, overwrite it */ + if (fname[0] == '\0') { + fprintf(stderr, + "InnoDB: something wrong with relay-info.log. InnoDB will not overwrite it.\n"); + } else if (0 != strcmp(fname, trx_sys_mysql_master_log_name) + || pos != trx_sys_mysql_master_log_pos) { + /* Overwrite relay-log.info */ + bzero((char*) &info_file, sizeof(info_file)); + fn_format(fname, relay_log_info_file, mysql_data_home, "", 4+32); + + int error = 0; + + if (!access(fname,F_OK)) { + /* exist */ + if ((info_fd = my_open(fname, O_RDWR|O_BINARY, MYF(MY_WME))) < 0) { + error = 1; + } else if (init_io_cache(&info_file, info_fd, IO_SIZE*2, + WRITE_CACHE, 0L, 0, MYF(MY_WME))) { + error = 1; + } + + if (error) { + if (info_fd >= 0) + my_close(info_fd, MYF(0)); + goto skip_overwrite; + } + } else { + error = 1; + goto skip_overwrite; + } + + char buff[FN_REFLEN*2+22*2+4], *pos; + + my_b_seek(&info_file, 0L); + pos=strmov(buff, trx_sys_mysql_relay_log_name); + *pos++='\n'; + pos=longlong2str(trx_sys_mysql_relay_log_pos, pos, 10); + *pos++='\n'; + pos=strmov(pos, trx_sys_mysql_master_log_name); + *pos++='\n'; + pos=longlong2str(trx_sys_mysql_master_log_pos, pos, 10); + *pos='\n'; + + if (my_b_write(&info_file, (uchar*) buff, (size_t) (pos-buff)+1)) + error = 1; + if (flush_io_cache(&info_file)) + error = 1; + + end_io_cache(&info_file); + if (info_fd >= 0) + my_close(info_fd, MYF(0)); +skip_overwrite: + if (error) { + fprintf(stderr, + "InnoDB: ERROR: error occured during overwriting relay-log.info.\n"); + } else { + fprintf(stderr, + "InnoDB: relay-log.info was overwritten.\n"); + } + } else { + fprintf(stderr, + "InnoDB: InnoDB and relay-log.info are synchronized. InnoDB will not overwrite it.\n"); + } + } +#endif /* MYSQL_SERVER */ +#endif /* HAVE_REPLICATION */ + innobase_open_tables = hash_create(200); pthread_mutex_init(&innobase_share_mutex, MY_MUTEX_INIT_FAST); pthread_mutex_init(&prepare_commit_mutex, MY_MUTEX_INIT_FAST); @@ -7079,7 +7085,9 @@ ha_innobase::info( ib_table = prebuilt->table; if (flag & HA_STATUS_TIME) { - if (innobase_stats_on_metadata) { + if (innobase_stats_on_metadata + && (thd_sql_command(user_thd) == SQLCOM_ANALYZE + || srv_stats_auto_update)) { /* In sql_show we call with this flag: update then statistics so that they are up-to-date */ @@ -9321,7 +9329,8 @@ ha_innobase::check_if_incompatible_data( if (info_row_type == ROW_TYPE_DEFAULT) info_row_type = ROW_TYPE_COMPACT; if ((info->used_fields & HA_CREATE_USED_ROW_FORMAT) && - row_type != info_row_type) { + get_row_type() != ((info->row_type == ROW_TYPE_DEFAULT) + ? ROW_TYPE_COMPACT : info->row_type)) { DBUG_PRINT("info", ("get_row_type()=%d != info->row_type=%d -> " "COMPATIBLE_DATA_NO", @@ -9830,6 +9839,31 @@ static MYSQL_SYSVAR_ULONGLONG(stats_samp "The number of index pages to sample when calculating statistics (default 8)", NULL, NULL, 8, 1, ~0ULL, 0); +const char *innobase_stats_method_names[]= +{ + "nulls_equal", + "nulls_unequal", + "nulls_ignored", + NullS +}; +TYPELIB innobase_stats_method_typelib= +{ + array_elements(innobase_stats_method_names) - 1, "innobase_stats_method_typelib", + innobase_stats_method_names, NULL +}; +static MYSQL_SYSVAR_ENUM(stats_method, srv_stats_method, + PLUGIN_VAR_RQCMDARG, + "Specifies how InnoDB index statistics collection code should threat NULLs. " + "Possible values of name are same to for 'myisam_stats_method'. " + "This is startup parameter.", + NULL, NULL, 0, &innobase_stats_method_typelib); + +static MYSQL_SYSVAR_ULONG(stats_auto_update, srv_stats_auto_update, + PLUGIN_VAR_RQCMDARG, + "Enable/Disable InnoDB's auto update statistics of indexes. " + "(except for ANALYZE TABLE command) 0:disable 1:enable", + NULL, NULL, 1, 0, 1, 0); + static MYSQL_SYSVAR_BOOL(adaptive_hash_index, btr_search_enabled, PLUGIN_VAR_OPCMDARG, "Enable InnoDB adaptive hash index (enabled by default). " @@ -9907,6 +9941,12 @@ static MYSQL_SYSVAR_ULONG(sync_spin_loop "Count of spin-loop rounds in InnoDB mutexes", NULL, NULL, 20L, 0L, ~0L, 0); +static MYSQL_SYSVAR_BOOL(thread_concurrency_timer_based, + innobase_thread_concurrency_timer_based, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Use InnoDB timer based concurrency throttling. ", + NULL, NULL, FALSE); + static MYSQL_SYSVAR_ULONG(thread_concurrency, srv_thread_concurrency, PLUGIN_VAR_RQCMDARG, "Helps in performance tuning in heavily concurrent environments. Sets the maximum number of threads allowed inside InnoDB. Value 0 will disable the thread throttling.", @@ -9953,7 +9993,7 @@ static MYSQL_SYSVAR_STR(change_buffering static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity, PLUGIN_VAR_RQCMDARG, "Number of IO operations per second the server can do. Tunes background IO rate.", - NULL, NULL, 100, 100, 999999999, 0); + NULL, NULL, 200, 100, 999999999, 0); static MYSQL_SYSVAR_LONGLONG(ibuf_max_size, srv_ibuf_max_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -10008,10 +10048,36 @@ static MYSQL_SYSVAR_ENUM(read_ahead, srv "Control read ahead activity. (none, random, linear, [both])", NULL, innodb_read_ahead_update, 3, &read_ahead_typelib); -static MYSQL_SYSVAR_ULONG(adaptive_checkpoint, srv_adaptive_checkpoint, +static +void +innodb_adaptive_checkpoint_update( + THD* thd, + struct st_mysql_sys_var* var, + void* var_ptr, + const void* save) +{ + *(long *)var_ptr= (*(long *)save) % 3; +} +const char *adaptive_checkpoint_names[]= +{ + "none", /* 0 */ + "reflex", /* 1 */ + "estimate", /* 2 */ + /* For compatibility of the older patch */ + "0", /* 3 ("none" + 3) */ + "1", /* 4 ("reflex" + 3) */ + "2", /* 5 ("estimate" + 3) */ + NullS +}; +TYPELIB adaptive_checkpoint_typelib= +{ + array_elements(adaptive_checkpoint_names) - 1, "adaptive_checkpoint_typelib", + adaptive_checkpoint_names, NULL +}; +static MYSQL_SYSVAR_ENUM(adaptive_checkpoint, srv_adaptive_checkpoint, PLUGIN_VAR_RQCMDARG, - "Enable/Disable flushing along modified age. 0:disable 1:enable", - NULL, NULL, 0, 0, 1, 0); + "Enable/Disable flushing along modified age. ([none], reflex, estimate)", + NULL, innodb_adaptive_checkpoint_update, 0, &adaptive_checkpoint_typelib); static MYSQL_SYSVAR_ULONG(enable_unsafe_group_commit, srv_enable_unsafe_group_commit, PLUGIN_VAR_RQCMDARG, @@ -10021,18 +10087,28 @@ static MYSQL_SYSVAR_ULONG(enable_unsafe_ static MYSQL_SYSVAR_ULONG(read_io_threads, innobase_read_io_threads, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Number of background read I/O threads in InnoDB.", - NULL, NULL, 1, 1, 64, 0); + NULL, NULL, 8, 1, 64, 0); static MYSQL_SYSVAR_ULONG(write_io_threads, innobase_write_io_threads, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Number of background write I/O threads in InnoDB.", - NULL, NULL, 1, 1, 64, 0); + NULL, NULL, 8, 1, 64, 0); + +static MYSQL_SYSVAR_ULONG(expand_import, srv_expand_import, + PLUGIN_VAR_RQCMDARG, + "Enable/Disable converting automatically *.ibd files when import tablespace.", + NULL, NULL, 0, 0, 1, 0); static MYSQL_SYSVAR_ULONG(extra_rsegments, srv_extra_rsegments, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Number of extra user rollback segments when create new database.", NULL, NULL, 0, 0, 127, 0); +static MYSQL_SYSVAR_ULONG(dict_size_limit, srv_dict_size_limit, + PLUGIN_VAR_RQCMDARG, + "Limit the allocated memory for dictionary cache. (0: unlimited)", + NULL, NULL, 0, 0, LONG_MAX, 0); + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(additional_mem_pool_size), MYSQL_SYSVAR(autoextend_increment), @@ -10069,6 +10145,8 @@ static struct st_mysql_sys_var* innobase MYSQL_SYSVAR(overwrite_relay_log_info), MYSQL_SYSVAR(rollback_on_timeout), MYSQL_SYSVAR(stats_on_metadata), + MYSQL_SYSVAR(stats_method), + MYSQL_SYSVAR(stats_auto_update), MYSQL_SYSVAR(stats_sample_pages), MYSQL_SYSVAR(adaptive_hash_index), MYSQL_SYSVAR(replication_delay), @@ -10078,6 +10156,7 @@ static struct st_mysql_sys_var* innobase MYSQL_SYSVAR(sync_spin_loops), MYSQL_SYSVAR(table_locks), MYSQL_SYSVAR(thread_concurrency), + MYSQL_SYSVAR(thread_concurrency_timer_based), MYSQL_SYSVAR(thread_sleep_delay), MYSQL_SYSVAR(autoinc_lock_mode), MYSQL_SYSVAR(show_verbose_locks), @@ -10093,7 +10172,9 @@ static struct st_mysql_sys_var* innobase MYSQL_SYSVAR(enable_unsafe_group_commit), MYSQL_SYSVAR(read_io_threads), MYSQL_SYSVAR(write_io_threads), + MYSQL_SYSVAR(expand_import), MYSQL_SYSVAR(extra_rsegments), + MYSQL_SYSVAR(dict_size_limit), MYSQL_SYSVAR(use_sys_malloc), MYSQL_SYSVAR(change_buffering), NULL @@ -10287,6 +10368,8 @@ i_s_innodb_cmp, i_s_innodb_cmp_reset, i_s_innodb_cmpmem, i_s_innodb_cmpmem_reset, +i_s_innodb_table_stats, +i_s_innodb_index_stats, i_s_innodb_patches mysql_declare_plugin_end; === modified file 'storage/xtradb/handler/i_s.cc' --- a/storage/xtradb/handler/i_s.cc 2009-05-04 02:45:47 +0000 +++ b/storage/xtradb/handler/i_s.cc 2009-06-25 01:43:25 +0000 @@ -45,6 +45,7 @@ extern "C" { #include "dict0dict.h" /* for dict_index_get_if_in_cache */ #include "trx0rseg.h" /* for trx_rseg_struct */ #include "trx0sys.h" /* for trx_sys */ +#include "dict0dict.h" /* for dict_sys */ /* from buf0buf.c */ struct buf_chunk_struct{ ulint mem_size; /* allocated size of the chunk */ @@ -2282,7 +2283,8 @@ i_s_cmpmem_fill_low( RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + mutex_enter(&zip_free_mutex); for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) { buf_buddy_stat_t* buddy_stat = &buf_buddy_stat[x]; @@ -2308,7 +2310,8 @@ i_s_cmpmem_fill_low( } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&zip_free_mutex); DBUG_RETURN(status); } @@ -2653,3 +2656,299 @@ UNIV_INTERN struct st_mysql_plugin i_s_i /* void* */ STRUCT_FLD(__reserved1, NULL) }; + +/*********************************************************************** +*/ +static ST_FIELD_INFO i_s_innodb_table_stats_info[] = +{ + {STRUCT_FLD(field_name, "table_name"), + STRUCT_FLD(field_length, NAME_LEN), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "rows"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "clust_size"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "other_size"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "modified"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +static ST_FIELD_INFO i_s_innodb_index_stats_info[] = +{ + {STRUCT_FLD(field_name, "table_name"), + STRUCT_FLD(field_length, NAME_LEN), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "index_name"), + STRUCT_FLD(field_length, NAME_LEN), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "fields"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "row_per_keys"), + STRUCT_FLD(field_length, 256), + STRUCT_FLD(field_type, MYSQL_TYPE_STRING), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, 0), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "index_size"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + {STRUCT_FLD(field_name, "leaf_pages"), + STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS), + STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG), + STRUCT_FLD(value, 0), + STRUCT_FLD(field_flags, MY_I_S_UNSIGNED), + STRUCT_FLD(old_name, ""), + STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}, + + END_OF_ST_FIELD_INFO +}; + +static +int +i_s_innodb_table_stats_fill( +/*========================*/ + THD* thd, + TABLE_LIST* tables, + COND* cond) +{ + TABLE* i_s_table = (TABLE *) tables->table; + int status = 0; + dict_table_t* table; + + DBUG_ENTER("i_s_innodb_table_stats_fill"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + mutex_enter(&(dict_sys->mutex)); + + table = UT_LIST_GET_FIRST(dict_sys->table_LRU); + + while (table) { + if (table->stat_clustered_index_size == 0) { + table = UT_LIST_GET_NEXT(table_LRU, table); + continue; + } + + field_store_string(i_s_table->field[0], table->name); + i_s_table->field[1]->store(table->stat_n_rows); + i_s_table->field[2]->store(table->stat_clustered_index_size); + i_s_table->field[3]->store(table->stat_sum_of_other_index_sizes); + i_s_table->field[4]->store(table->stat_modified_counter); + + if (schema_table_store_record(thd, i_s_table)) { + status = 1; + break; + } + + table = UT_LIST_GET_NEXT(table_LRU, table); + } + + mutex_exit(&(dict_sys->mutex)); + + DBUG_RETURN(status); +} + +static +int +i_s_innodb_index_stats_fill( +/*========================*/ + THD* thd, + TABLE_LIST* tables, + COND* cond) +{ + TABLE* i_s_table = (TABLE *) tables->table; + int status = 0; + dict_table_t* table; + dict_index_t* index; + + DBUG_ENTER("i_s_innodb_index_stats_fill"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + mutex_enter(&(dict_sys->mutex)); + + table = UT_LIST_GET_FIRST(dict_sys->table_LRU); + + while (table) { + if (table->stat_clustered_index_size == 0) { + table = UT_LIST_GET_NEXT(table_LRU, table); + continue; + } + + ib_int64_t n_rows = table->stat_n_rows; + + if (n_rows < 0) { + n_rows = 0; + } + + index = dict_table_get_first_index(table); + + while (index) { + char buff[256+1]; + char row_per_keys[256+1]; + ulint i; + + field_store_string(i_s_table->field[0], table->name); + field_store_string(i_s_table->field[1], index->name); + i_s_table->field[2]->store(index->n_uniq); + + row_per_keys[0] = '\0'; + if (index->stat_n_diff_key_vals) { + for (i = 1; i <= index->n_uniq; i++) { + ib_int64_t rec_per_key; + if (index->stat_n_diff_key_vals[i]) { + rec_per_key = n_rows / index->stat_n_diff_key_vals[i]; + } else { + rec_per_key = n_rows; + } + snprintf(buff, 256, (i == index->n_uniq)?"%llu":"%llu, ", + rec_per_key); + strncat(row_per_keys, buff, 256 - strlen(row_per_keys)); + } + } + field_store_string(i_s_table->field[3], row_per_keys); + + i_s_table->field[4]->store(index->stat_index_size); + i_s_table->field[5]->store(index->stat_n_leaf_pages); + + if (schema_table_store_record(thd, i_s_table)) { + status = 1; + break; + } + + index = dict_table_get_next_index(index); + } + + if (status == 1) { + break; + } + + table = UT_LIST_GET_NEXT(table_LRU, table); + } + + mutex_exit(&(dict_sys->mutex)); + + DBUG_RETURN(status); +} + +static +int +i_s_innodb_table_stats_init( +/*========================*/ + void* p) +{ + DBUG_ENTER("i_s_innodb_table_stats_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_innodb_table_stats_info; + schema->fill_table = i_s_innodb_table_stats_fill; + + DBUG_RETURN(0); +} + +static +int +i_s_innodb_index_stats_init( +/*========================*/ + void* p) +{ + DBUG_ENTER("i_s_innodb_index_stats_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = i_s_innodb_index_stats_info; + schema->fill_table = i_s_innodb_index_stats_fill; + + DBUG_RETURN(0); +} + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_table_stats = +{ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + STRUCT_FLD(info, &i_s_info), + STRUCT_FLD(name, "INNODB_TABLE_STATS"), + STRUCT_FLD(author, plugin_author), + STRUCT_FLD(descr, "InnoDB table statistics in memory"), + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + STRUCT_FLD(init, i_s_innodb_table_stats_init), + STRUCT_FLD(deinit, i_s_common_deinit), + STRUCT_FLD(version, 0x0100 /* 1.0 */), + STRUCT_FLD(status_vars, NULL), + STRUCT_FLD(system_vars, NULL), + STRUCT_FLD(__reserved1, NULL) +}; + +UNIV_INTERN struct st_mysql_plugin i_s_innodb_index_stats = +{ + STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN), + STRUCT_FLD(info, &i_s_info), + STRUCT_FLD(name, "INNODB_INDEX_STATS"), + STRUCT_FLD(author, plugin_author), + STRUCT_FLD(descr, "InnoDB index statistics in memory"), + STRUCT_FLD(license, PLUGIN_LICENSE_GPL), + STRUCT_FLD(init, i_s_innodb_index_stats_init), + STRUCT_FLD(deinit, i_s_common_deinit), + STRUCT_FLD(version, 0x0100 /* 1.0 */), + STRUCT_FLD(status_vars, NULL), + STRUCT_FLD(system_vars, NULL), + STRUCT_FLD(__reserved1, NULL) +}; === modified file 'storage/xtradb/handler/i_s.h' --- a/storage/xtradb/handler/i_s.h 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/handler/i_s.h 2009-06-25 01:43:25 +0000 @@ -37,5 +37,7 @@ extern struct st_mysql_plugin i_s_innodb extern struct st_mysql_plugin i_s_innodb_cmpmem_reset; extern struct st_mysql_plugin i_s_innodb_patches; extern struct st_mysql_plugin i_s_innodb_rseg; +extern struct st_mysql_plugin i_s_innodb_table_stats; +extern struct st_mysql_plugin i_s_innodb_index_stats; #endif /* i_s_h */ === modified file 'storage/xtradb/handler/innodb_patch_info.h' --- a/storage/xtradb/handler/innodb_patch_info.h 2009-05-04 02:45:47 +0000 +++ b/storage/xtradb/handler/innodb_patch_info.h 2009-07-06 05:47:15 +0000 @@ -31,5 +31,12 @@ struct innodb_enhancement { {"innodb_expand_undo_slots","expandable maximum number of undo slots","from 1024 (default) to about 4000","http://www.percona.com/docs/wiki/percona-xtradb"}, {"innodb_extra_rseg","allow to create extra rollback segments","When create new db, the new parameter allows to create more rollback segments","http://www.percona.com/docs/wiki/percona-xtradb"}, {"innodb_overwrite_relay_log_info","overwrite relay-log.info when slave recovery","Building as plugin, it is not used.","http://www.percona.com/docs/wiki/percona-xtradb:innodb_overwrite_relay_log_info"}, +{"innodb_pause_in_spin","use 'pause' instruction during spin loop for x86 (gcc)","","http://www.percona.com/docs/wiki/percona-xtradb"}, +{"innodb_thread_concurrency_timer_based","use InnoDB timer based concurrency throttling (backport from MySQL 5.4.0)","",""}, +{"innodb_expand_import","convert .ibd file automatically when import tablespace","the files are generated by xtrabackup export mode.","http://www.percona.com/docs/wiki/percona-xtradb"}, +{"innodb_dict_size_limit","Limit dictionary cache size","Variable innodb_dict_size_limit in bytes","http://www.percona.com/docs/wiki/percona-xtradb"}, +{"innodb_split_buf_pool_mutex","More fix of buffer_pool mutex","Spliting buf_pool_mutex and optimizing based on innodb_opt_lru_count","http://www.percona.com/docs/wiki/percona-xtradb"}, +{"innodb_stats","Additional features about InnoDB statistics/optimizer","","http://www.percona.com/docs/wiki/percona-xtradb"}, +{"innodb_recovery_patches","Bugfixes and adjustments about recovery process","","http://www.percona.com/docs/wiki/percona-xtradb"}, {NULL, NULL, NULL, NULL} }; === modified file 'storage/xtradb/ibuf/ibuf0ibuf.c' --- a/storage/xtradb/ibuf/ibuf0ibuf.c 2009-06-22 08:06:35 +0000 +++ b/storage/xtradb/ibuf/ibuf0ibuf.c 2009-08-03 20:09:53 +0000 @@ -472,6 +472,7 @@ ibuf_init_at_db_start(void) /* Use old-style record format for the insert buffer. */ table = dict_mem_table_create(IBUF_TABLE_NAME, IBUF_SPACE_ID, 1, 0); + table->n_mysql_handles_opened = 1; /* for pin */ dict_mem_table_add_col(table, heap, "DUMMY_COLUMN", DATA_BINARY, 0, 0); === modified file 'storage/xtradb/include/buf0buddy.h' --- a/storage/xtradb/include/buf0buddy.h 2009-05-04 02:45:47 +0000 +++ b/storage/xtradb/include/buf0buddy.h 2009-06-25 01:43:25 +0000 @@ -49,10 +49,11 @@ buf_buddy_alloc( /* out: allocated block, possibly NULL if lru == NULL */ ulint size, /* in: block size, up to UNIV_PAGE_SIZE */ - ibool* lru) /* in: pointer to a variable that will be assigned + ibool* lru, /* in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list and buf_pool_mutex was temporarily released, or NULL if the LRU list should not be used */ + ibool have_page_hash_mutex) __attribute__((malloc)); /************************************************************************** @@ -63,7 +64,8 @@ buf_buddy_free( /*===========*/ void* buf, /* in: block to be freed, must not be pointed to by the buffer pool */ - ulint size) /* in: block size, up to UNIV_PAGE_SIZE */ + ulint size, /* in: block size, up to UNIV_PAGE_SIZE */ + ibool have_page_hash_mutex) __attribute__((nonnull)); /** Statistics of buddy blocks of a given size. */ === modified file 'storage/xtradb/include/buf0buddy.ic' --- a/storage/xtradb/include/buf0buddy.ic 2009-05-04 02:45:47 +0000 +++ b/storage/xtradb/include/buf0buddy.ic 2009-06-25 01:43:25 +0000 @@ -44,10 +44,11 @@ buf_buddy_alloc_low( possibly NULL if lru==NULL */ ulint i, /* in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ - ibool* lru) /* in: pointer to a variable that will be assigned + ibool* lru, /* in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list and buf_pool_mutex was temporarily released, or NULL if the LRU list should not be used */ + ibool have_page_hash_mutex) __attribute__((malloc)); /************************************************************************** @@ -58,8 +59,9 @@ buf_buddy_free_low( /*===============*/ void* buf, /* in: block to be freed, must not be pointed to by the buffer pool */ - ulint i) /* in: index of buf_pool->zip_free[], + ulint i, /* in: index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */ + ibool have_page_hash_mutex) __attribute__((nonnull)); /************************************************************************** @@ -98,14 +100,15 @@ buf_buddy_alloc( /* out: allocated block, possibly NULL if lru == NULL */ ulint size, /* in: block size, up to UNIV_PAGE_SIZE */ - ibool* lru) /* in: pointer to a variable that will be assigned + ibool* lru, /* in: pointer to a variable that will be assigned TRUE if storage was allocated from the LRU list and buf_pool_mutex was temporarily released, or NULL if the LRU list should not be used */ + ibool have_page_hash_mutex) { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); - return(buf_buddy_alloc_low(buf_buddy_get_slot(size), lru)); + return(buf_buddy_alloc_low(buf_buddy_get_slot(size), lru, have_page_hash_mutex)); } /************************************************************************** @@ -116,11 +119,24 @@ buf_buddy_free( /*===========*/ void* buf, /* in: block to be freed, must not be pointed to by the buffer pool */ - ulint size) /* in: block size, up to UNIV_PAGE_SIZE */ + ulint size, /* in: block size, up to UNIV_PAGE_SIZE */ + ibool have_page_hash_mutex) { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); - buf_buddy_free_low(buf, buf_buddy_get_slot(size)); + if (!have_page_hash_mutex) { + mutex_enter(&LRU_list_mutex); + rw_lock_x_lock(&page_hash_latch); + } + + mutex_enter(&zip_free_mutex); + buf_buddy_free_low(buf, buf_buddy_get_slot(size), TRUE); + mutex_exit(&zip_free_mutex); + + if (!have_page_hash_mutex) { + mutex_exit(&LRU_list_mutex); + rw_lock_x_unlock(&page_hash_latch); + } } #ifdef UNIV_MATERIALIZE === modified file 'storage/xtradb/include/buf0buf.h' --- a/storage/xtradb/include/buf0buf.h 2009-05-04 04:32:30 +0000 +++ b/storage/xtradb/include/buf0buf.h 2009-06-25 01:43:25 +0000 @@ -1024,7 +1024,7 @@ struct buf_page_struct{ /* 2. Page flushing fields; protected by buf_pool_mutex */ - UT_LIST_NODE_T(buf_page_t) list; + /* UT_LIST_NODE_T(buf_page_t) list; */ /* based on state, this is a list node in one of the following lists in buf_pool: @@ -1034,6 +1034,10 @@ struct buf_page_struct{ BUF_BLOCK_ZIP_DIRTY: flush_list BUF_BLOCK_ZIP_PAGE: zip_clean BUF_BLOCK_ZIP_FREE: zip_free[] */ + /* resplit for optimistic use */ + UT_LIST_NODE_T(buf_page_t) free; + UT_LIST_NODE_T(buf_page_t) flush_list; + UT_LIST_NODE_T(buf_page_t) zip_list; /* zip_clean or zip_free[] */ #ifdef UNIV_DEBUG ibool in_flush_list; /* TRUE if in buf_pool->flush_list; when buf_pool_mutex is free, the @@ -1104,11 +1108,11 @@ struct buf_block_struct{ a block is in the unzip_LRU list if page.state == BUF_BLOCK_FILE_PAGE and page.zip.data != NULL */ -#ifdef UNIV_DEBUG +//#ifdef UNIV_DEBUG ibool in_unzip_LRU_list;/* TRUE if the page is in the decompressed LRU list; used in debugging */ -#endif /* UNIV_DEBUG */ +//#endif /* UNIV_DEBUG */ byte* frame; /* pointer to buffer frame which is of size UNIV_PAGE_SIZE, and aligned to an address divisible by @@ -1316,6 +1320,12 @@ struct buf_pool_struct{ /* mutex protecting the buffer pool struct and control blocks, except the read-write lock in them */ extern mutex_t buf_pool_mutex; +extern mutex_t LRU_list_mutex; +extern mutex_t flush_list_mutex; +extern rw_lock_t page_hash_latch; +extern mutex_t free_list_mutex; +extern mutex_t zip_free_mutex; +extern mutex_t zip_hash_mutex; /* mutex protecting the control blocks of compressed-only pages (of type buf_page_t, not buf_block_t) */ extern mutex_t buf_pool_zip_mutex; === modified file 'storage/xtradb/include/buf0buf.ic' --- a/storage/xtradb/include/buf0buf.ic 2009-05-04 02:45:47 +0000 +++ b/storage/xtradb/include/buf0buf.ic 2009-06-25 01:43:25 +0000 @@ -100,7 +100,9 @@ buf_pool_get_oldest_modification(void) buf_page_t* bpage; ib_uint64_t lsn; - buf_pool_mutex_enter(); +try_again: + //buf_pool_mutex_enter(); + mutex_enter(&flush_list_mutex); bpage = UT_LIST_GET_LAST(buf_pool->flush_list); @@ -109,9 +111,14 @@ buf_pool_get_oldest_modification(void) } else { ut_ad(bpage->in_flush_list); lsn = bpage->oldest_modification; + if (lsn == 0) { + mutex_exit(&flush_list_mutex); + goto try_again; + } } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(&flush_list_mutex); /* The returned answer may be out of date: the flush_list can change after the mutex has been released. */ @@ -128,7 +135,8 @@ buf_pool_clock_tic(void) /*====================*/ /* out: new clock value */ { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); buf_pool->ulint_clock++; @@ -246,7 +254,7 @@ buf_page_in_file( case BUF_BLOCK_ZIP_FREE: /* This is a free page in buf_pool->zip_free[]. Such pages should only be accessed by the buddy allocator. */ - ut_error; + /* ut_error; */ /* optimistic */ break; case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_DIRTY: @@ -288,7 +296,7 @@ buf_page_get_LRU_position( const buf_page_t* bpage) /* in: control block */ { ut_ad(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); /* This is used in optimistic */ return(bpage->LRU_position); } @@ -305,7 +313,7 @@ buf_page_get_mutex( { switch (buf_page_get_state(bpage)) { case BUF_BLOCK_ZIP_FREE: - ut_error; + /* ut_error; */ /* optimistic */ return(NULL); case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_DIRTY: @@ -410,7 +418,7 @@ buf_page_set_io_fix( buf_page_t* bpage, /* in/out: control block */ enum buf_io_fix io_fix) /* in: io_fix state */ { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(buf_page_get_mutex(bpage))); bpage->io_fix = io_fix; @@ -438,12 +446,13 @@ buf_page_can_relocate( /*==================*/ const buf_page_t* bpage) /* control block being relocated */ { - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(mutex_own(buf_page_get_mutex(bpage))); ut_ad(buf_page_in_file(bpage)); - ut_ad(bpage->in_LRU_list); + /* optimistic */ + //ut_ad(bpage->in_LRU_list); - return(buf_page_get_io_fix(bpage) == BUF_IO_NONE + return(bpage->in_LRU_list && bpage->io_fix == BUF_IO_NONE && bpage->buf_fix_count == 0); } @@ -457,7 +466,7 @@ buf_page_is_old( const buf_page_t* bpage) /* in: control block */ { ut_ad(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); /* This is used in optimistic */ return(bpage->old); } @@ -472,7 +481,8 @@ buf_page_set_old( ibool old) /* in: old */ { ut_a(buf_page_in_file(bpage)); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); + ut_ad(mutex_own(&LRU_list_mutex)); ut_ad(bpage->in_LRU_list); #ifdef UNIV_LRU_DEBUG @@ -728,17 +738,17 @@ buf_block_free( /*===========*/ buf_block_t* block) /* in, own: block to be freed */ { - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); mutex_enter(&block->mutex); ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE); - buf_LRU_block_free_non_file_page(block); + buf_LRU_block_free_non_file_page(block, FALSE); mutex_exit(&block->mutex); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); } /************************************************************************* @@ -783,14 +793,23 @@ buf_page_io_query( buf_page_t* bpage) /* in: buf_pool block, must be bufferfixed */ { ibool io_fixed; + mutex_t* block_mutex = buf_page_get_mutex(bpage); - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); +retry_lock: + mutex_enter(block_mutex); + if (block_mutex != buf_page_get_mutex(bpage)) { + mutex_exit(block_mutex); + block_mutex = buf_page_get_mutex(bpage); + goto retry_lock; + } ut_ad(buf_page_in_file(bpage)); ut_ad(bpage->buf_fix_count > 0); io_fixed = buf_page_get_io_fix(bpage) != BUF_IO_NONE; - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + mutex_exit(block_mutex); return(io_fixed); } @@ -809,7 +828,13 @@ buf_page_get_newest_modification( ib_uint64_t lsn; mutex_t* block_mutex = buf_page_get_mutex(bpage); +retry_lock: mutex_enter(block_mutex); + if (block_mutex != buf_page_get_mutex(bpage)) { + mutex_exit(block_mutex); + block_mutex = buf_page_get_mutex(bpage); + goto retry_lock; + } if (buf_page_in_file(bpage)) { lsn = bpage->newest_modification; @@ -833,7 +858,7 @@ buf_block_modify_clock_inc( buf_block_t* block) /* in: block */ { #ifdef UNIV_SYNC_DEBUG - ut_ad((buf_pool_mutex_own() + ut_ad((mutex_own(&LRU_list_mutex) && (block->page.buf_fix_count == 0)) || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE)); #endif /* UNIV_SYNC_DEBUG */ @@ -917,7 +942,11 @@ buf_page_hash_get( ulint fold; ut_ad(buf_pool); - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); +#ifdef UNIV_SYNC_DEBUG + ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX) + || rw_lock_own(&page_hash_latch, RW_LOCK_SHARED)); +#endif /* Look for the page in the hash table */ @@ -966,11 +995,13 @@ buf_page_peek( { const buf_page_t* bpage; - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); + rw_lock_s_lock(&page_hash_latch); bpage = buf_page_hash_get(space, offset); - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + rw_lock_s_unlock(&page_hash_latch); return(bpage != NULL); } @@ -1032,11 +1063,14 @@ buf_page_release( ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_a(block->page.buf_fix_count > 0); + /* buf_flush_note_modification() should be called before this function. */ +/* if (rw_latch == RW_X_LATCH && mtr->modifications) { buf_pool_mutex_enter(); buf_flush_note_modification(block, mtr); buf_pool_mutex_exit(); } +*/ mutex_enter(&block->mutex); === modified file 'storage/xtradb/include/buf0flu.ic' --- a/storage/xtradb/include/buf0flu.ic 2009-05-04 02:45:47 +0000 +++ b/storage/xtradb/include/buf0flu.ic 2009-06-25 01:43:25 +0000 @@ -53,13 +53,23 @@ buf_flush_note_modification( buf_block_t* block, /* in: block which is modified */ mtr_t* mtr) /* in: mtr */ { + ibool use_LRU_mutex = FALSE; + + if (UT_LIST_GET_LEN(buf_pool->unzip_LRU)) + use_LRU_mutex = TRUE; + + if (use_LRU_mutex) + mutex_enter(&LRU_list_mutex); + + mutex_enter(&block->mutex); + ut_ad(block); ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(block->page.buf_fix_count > 0); #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - ut_ad(buf_pool_mutex_own()); + //ut_ad(buf_pool_mutex_own()); ut_ad(mtr->start_lsn != 0); ut_ad(mtr->modifications); @@ -68,16 +78,23 @@ buf_flush_note_modification( block->page.newest_modification = mtr->end_lsn; if (!block->page.oldest_modification) { + mutex_enter(&flush_list_mutex); block->page.oldest_modification = mtr->start_lsn; ut_ad(block->page.oldest_modification != 0); buf_flush_insert_into_flush_list(block); + mutex_exit(&flush_list_mutex); } else { ut_ad(block->page.oldest_modification <= mtr->start_lsn); } + mutex_exit(&block->mutex); + ++srv_buf_pool_write_requests; + + if (use_LRU_mutex) + mutex_exit(&LRU_list_mutex); } /************************************************************************ @@ -92,6 +109,16 @@ buf_flush_recv_note_modification( ib_uint64_t end_lsn) /* in: end lsn of the last mtr in the set of mtr's */ { + ibool use_LRU_mutex = FALSE; + + if(UT_LIST_GET_LEN(buf_pool->unzip_LRU)) + use_LRU_mutex = TRUE; + + if (use_LRU_mutex) + mutex_enter(&LRU_list_mutex); + + mutex_enter(&(block->mutex)); + ut_ad(block); ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); ut_ad(block->page.buf_fix_count > 0); @@ -99,22 +126,27 @@ buf_flush_recv_note_modification( ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ - buf_pool_mutex_enter(); + //buf_pool_mutex_enter(); ut_ad(block->page.newest_modification <= end_lsn); block->page.newest_modification = end_lsn; if (!block->page.oldest_modification) { + mutex_enter(&flush_list_mutex); block->page.oldest_modification = start_lsn; ut_ad(block->page.oldest_modification != 0); buf_flush_insert_sorted_into_flush_list(block); + mutex_exit(&flush_list_mutex); } else { ut_ad(block->page.oldest_modification <= start_lsn); } - buf_pool_mutex_exit(); + //buf_pool_mutex_exit(); + if (use_LRU_mutex) + mutex_exit(&LRU_list_mutex); + mutex_exit(&(block->mutex)); } === modified file 'storage/xtradb/include/buf0lru.h' --- a/storage/xtradb/include/buf0lru.h 2009-05-04 02:45:47 +0000 +++ b/storage/xtradb/include/buf0lru.h 2009-06-25 01:43:25 +0000 @@ -122,10 +122,11 @@ buf_LRU_free_block( buf_page_t* bpage, /* in: block to be freed */ ibool zip, /* in: TRUE if should remove also the compressed page of an uncompressed page */ - ibool* buf_pool_mutex_released); + ibool* buf_pool_mutex_released, /* in: pointer to a variable that will be assigned TRUE if buf_pool_mutex was temporarily released, or NULL */ + ibool have_LRU_mutex); /********************************************************************** Try to free a replaceable block. */ UNIV_INTERN @@ -169,7 +170,8 @@ UNIV_INTERN void buf_LRU_block_free_non_file_page( /*=============================*/ - buf_block_t* block); /* in: block, must not contain a file page */ + buf_block_t* block, /* in: block, must not contain a file page */ + ibool have_page_hash_mutex); /********************************************************************** Adds a block to the LRU list. */ UNIV_INTERN === modified file 'storage/xtradb/include/dict0dict.h' --- a/storage/xtradb/include/dict0dict.h 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/include/dict0dict.h 2009-06-25 01:43:25 +0000 @@ -1102,6 +1102,12 @@ dict_table_get_index_on_name_and_min_id( /* out: index, NULL if does not exist */ dict_table_t* table, /* in: table */ const char* name); /* in: name of the index to find */ + +UNIV_INTERN +void +dict_table_LRU_trim( +/*================*/ + dict_table_t* self); /* Buffers for storing detailed information about the latest foreign key and unique key errors */ extern FILE* dict_foreign_err_file; === modified file 'storage/xtradb/include/dict0dict.ic' --- a/storage/xtradb/include/dict0dict.ic 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/include/dict0dict.ic 2009-06-25 01:43:25 +0000 @@ -723,6 +723,13 @@ dict_table_check_if_in_cache_low( HASH_SEARCH(name_hash, dict_sys->table_hash, table_fold, dict_table_t*, table, ut_ad(table->cached), !strcmp(table->name, table_name)); + + /* make young in table_LRU */ + if (table) { + UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table); + UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table); + } + return(table); } @@ -776,6 +783,12 @@ dict_table_get_on_id_low( table = dict_load_table_on_id(table_id); } + /* make young in table_LRU */ + if (table) { + UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table); + UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table); + } + ut_ad(!table || table->cached); /* TODO: should get the type information from MySQL */ === modified file 'storage/xtradb/include/log0log.h' --- a/storage/xtradb/include/log0log.h 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/include/log0log.h 2009-06-25 01:43:25 +0000 @@ -186,6 +186,13 @@ void log_buffer_flush_to_disk(void); /*==========================*/ /******************************************************************** +Flushes the log buffer. Forces it to disk depending on the value of +the configuration parameter innodb_flush_log_at_trx_commit. */ +UNIV_INTERN +void +log_buffer_flush_maybe_sync(void); +/*=============================*/ +/******************************************************************** Advances the smallest lsn for which there are unflushed dirty blocks in the buffer pool and also may make a new checkpoint. NOTE: this function may only be called if the calling thread owns no synchronization objects! */ === modified file 'storage/xtradb/include/rem0cmp.h' --- a/storage/xtradb/include/rem0cmp.h 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/include/rem0cmp.h 2009-06-25 01:43:25 +0000 @@ -177,10 +177,11 @@ cmp_rec_rec_with_match( matched fields; when the function returns, contains the value the for current comparison */ - ulint* matched_bytes);/* in/out: number of already matched + ulint* matched_bytes, /* in/out: number of already matched bytes within the first field not completely matched; when the function returns, contains the value for the current comparison */ + ulint stats_method); /***************************************************************** This function is used to compare two physical records. Only the common first fields are compared. */ === modified file 'storage/xtradb/include/rem0cmp.ic' --- a/storage/xtradb/include/rem0cmp.ic 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/include/rem0cmp.ic 2009-06-25 01:43:25 +0000 @@ -88,5 +88,5 @@ cmp_rec_rec( ulint match_b = 0; return(cmp_rec_rec_with_match(rec1, rec2, offsets1, offsets2, index, - &match_f, &match_b)); + &match_f, &match_b, 0)); } === modified file 'storage/xtradb/include/srv0srv.h' --- a/storage/xtradb/include/srv0srv.h 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/include/srv0srv.h 2009-06-25 01:43:25 +0000 @@ -127,6 +127,8 @@ extern ulint srv_buf_pool_curr_size; /* extern ulint srv_mem_pool_size; extern ulint srv_lock_table_size; +extern ibool srv_thread_concurrency_timer_based; + extern ulint srv_n_file_io_threads; extern ulint srv_n_read_io_threads; extern ulint srv_n_write_io_threads; @@ -163,6 +165,11 @@ extern ulint srv_fast_shutdown; /* If t extern ibool srv_innodb_status; extern unsigned long long srv_stats_sample_pages; +extern ulint srv_stats_method; +#define SRV_STATS_METHOD_NULLS_EQUAL 0 +#define SRV_STATS_METHOD_NULLS_NOT_EQUAL 1 +#define SRV_STATS_METHOD_IGNORE_NULLS 2 +extern ulint srv_stats_auto_update; extern ibool srv_use_doublewrite_buf; extern ibool srv_use_checksums; @@ -184,8 +191,10 @@ extern ulint srv_enable_unsafe_group_com extern ulint srv_read_ahead; extern ulint srv_adaptive_checkpoint; -extern ulint srv_extra_rsegments; +extern ulint srv_expand_import; +extern ulint srv_extra_rsegments; +extern ulint srv_dict_size_limit; /*-------------------------------------------*/ extern ulint srv_n_rows_inserted; @@ -552,6 +561,7 @@ struct export_var_struct{ ulint innodb_data_writes; ulint innodb_data_written; ulint innodb_data_reads; + ulint innodb_dict_tables; ulint innodb_buffer_pool_pages_total; ulint innodb_buffer_pool_pages_data; ulint innodb_buffer_pool_pages_dirty; === modified file 'storage/xtradb/include/sync0sync.h' --- a/storage/xtradb/include/sync0sync.h 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/include/sync0sync.h 2009-06-25 01:43:25 +0000 @@ -464,8 +464,14 @@ or row lock! */ SYNC_SEARCH_SYS, as memory allocation can call routines there! Otherwise the level is SYNC_MEM_HASH. */ +#define SYNC_BUF_LRU_LIST 157 +#define SYNC_BUF_PAGE_HASH 156 +#define SYNC_BUF_BLOCK 155 +#define SYNC_BUF_FREE_LIST 153 +#define SYNC_BUF_ZIP_FREE 152 +#define SYNC_BUF_ZIP_HASH 151 #define SYNC_BUF_POOL 150 -#define SYNC_BUF_BLOCK 149 +#define SYNC_BUF_FLUSH_LIST 149 #define SYNC_DOUBLEWRITE 140 #define SYNC_ANY_LATCH 135 #define SYNC_THR_LOCAL 133 === modified file 'storage/xtradb/include/univ.i' --- a/storage/xtradb/include/univ.i 2009-06-18 12:39:21 +0000 +++ b/storage/xtradb/include/univ.i 2009-08-03 20:09:53 +0000 @@ -35,7 +35,7 @@ Created 1/20/1994 Heikki Tuuri #define INNODB_VERSION_MAJOR 1 #define INNODB_VERSION_MINOR 0 #define INNODB_VERSION_BUGFIX 3 -#define PERCONA_INNODB_VERSION 5a +#define PERCONA_INNODB_VERSION 6a /* The following is the InnoDB version as shown in SELECT plugin_version FROM information_schema.plugins; === modified file 'storage/xtradb/include/ut0auxconf.h' --- a/storage/xtradb/include/ut0auxconf.h 2009-04-27 04:54:14 +0000 +++ b/storage/xtradb/include/ut0auxconf.h 2009-06-25 01:43:25 +0000 @@ -12,3 +12,8 @@ If by any chance Makefile.in and ./confi the hack from Makefile.in wiped away then the "real" check from plug.in will take over. */ +/* This is temprary fix for http://bugs.mysql.com/43740 */ +/* force to enable */ +#ifdef HAVE_GCC_ATOMIC_BUILTINS +#define HAVE_ATOMIC_PTHREAD_T +#endif === modified file 'storage/xtradb/log/log0log.c' --- a/storage/xtradb/log/log0log.c 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/log/log0log.c 2009-06-25 01:43:25 +0000 @@ -1526,6 +1526,26 @@ log_buffer_flush_to_disk(void) } /******************************************************************** +Flush the log buffer. Force it to disk depending on the value of +innodb_flush_log_at_trx_commit. */ +UNIV_INTERN +void +log_buffer_flush_maybe_sync(void) +/*=============================*/ +{ + ib_uint64_t lsn; + + mutex_enter(&(log_sys->mutex)); + + lsn = log_sys->lsn; + + mutex_exit(&(log_sys->mutex)); + + /* Force log buffer to disk when innodb_flush_log_at_trx_commit = 1. */ + log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, + srv_flush_log_at_trx_commit == 1 ? TRUE : FALSE); +} +/******************************************************************** Tries to establish a big enough margin of free space in the log buffer, such that a new log entry can be catenated without an immediate need for a flush. */ static === modified file 'storage/xtradb/log/log0recv.c' --- a/storage/xtradb/log/log0recv.c 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/log/log0recv.c 2009-07-06 05:47:15 +0000 @@ -110,7 +110,7 @@ the log and store the scanned log record use these free frames to read in pages when we start applying the log records to the database. */ -UNIV_INTERN ulint recv_n_pool_free_frames = 256; +UNIV_INTERN ulint recv_n_pool_free_frames = 1024; /* The maximum lsn we see for a page during the recovery process. If this is bigger than the lsn we are able to scan up to, that is an indication that @@ -1225,6 +1225,8 @@ recv_recover_page( buf_block_get_page_no(block)); if ((recv_addr == NULL) + /* bugfix: http://bugs.mysql.com/bug.php?id=44140 */ + || (recv_addr->state == RECV_BEING_READ && !just_read_in) || (recv_addr->state == RECV_BEING_PROCESSED) || (recv_addr->state == RECV_PROCESSED)) { === modified file 'storage/xtradb/mtr/mtr0mtr.c' --- a/storage/xtradb/mtr/mtr0mtr.c 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/mtr/mtr0mtr.c 2009-06-25 01:43:25 +0000 @@ -102,6 +102,38 @@ mtr_memo_pop_all( } } +UNIV_INLINE +void +mtr_memo_note_modification_all( +/*===========================*/ + mtr_t* mtr) /* in: mtr */ +{ + mtr_memo_slot_t* slot; + dyn_array_t* memo; + ulint offset; + + ut_ad(mtr); + ut_ad(mtr->magic_n == MTR_MAGIC_N); + ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in + commit */ + ut_ad(mtr->modifications); + + memo = &(mtr->memo); + + offset = dyn_array_get_data_size(memo); + + while (offset > 0) { + offset -= sizeof(mtr_memo_slot_t); + slot = dyn_array_get_element(memo, offset); + + if (UNIV_LIKELY(slot->object != NULL) && + slot->type == MTR_MEMO_PAGE_X_FIX) { + buf_flush_note_modification( + (buf_block_t*)slot->object, mtr); + } + } +} + /**************************************************************** Writes the contents of a mini-transaction log, if any, to the database log. */ static @@ -180,6 +212,8 @@ mtr_commit( if (write_log) { mtr_log_reserve_and_write(mtr); + + mtr_memo_note_modification_all(mtr); } /* We first update the modification info to buffer pages, and only @@ -190,12 +224,13 @@ mtr_commit( required when we insert modified buffer pages in to the flush list which must be sorted on oldest_modification. */ - mtr_memo_pop_all(mtr); - if (write_log) { log_release(); } + /* All unlocking has been moved here, after log_sys mutex release. */ + mtr_memo_pop_all(mtr); + ut_d(mtr->state = MTR_COMMITTED); dyn_array_free(&(mtr->memo)); dyn_array_free(&(mtr->log)); @@ -263,6 +298,12 @@ mtr_memo_release( slot = dyn_array_get_element(memo, offset); if ((object == slot->object) && (type == slot->type)) { + if (mtr->modifications && + UNIV_LIKELY(slot->object != NULL) && + slot->type == MTR_MEMO_PAGE_X_FIX) { + buf_flush_note_modification( + (buf_block_t*)slot->object, mtr); + } mtr_memo_slot_release(mtr, slot); === modified file 'storage/xtradb/os/os0file.c' --- a/storage/xtradb/os/os0file.c 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/os/os0file.c 2009-06-25 01:43:25 +0000 @@ -73,6 +73,28 @@ UNIV_INTERN ibool os_aio_use_native_aio UNIV_INTERN ibool os_aio_print_debug = FALSE; +/* State for the state of an IO request in simulated AIO. + Protocol for simulated aio: + client requests IO: find slot with reserved = FALSE. Add entry with + status = OS_AIO_NOT_ISSUED. + IO thread wakes: find adjacent slots with reserved = TRUE and status = + OS_AIO_NOT_ISSUED. Change status for slots to + OS_AIO_ISSUED. + IO operation completes: set status for slots to OS_AIO_DONE. set status + for the first slot to OS_AIO_CLAIMED and return + result for that slot. + When there are multiple read and write threads, they all compete to execute + the requests in the array (os_aio_array_t). This avoids the need to load + balance requests at the time the request is made at the cost of waking all + threads when a request is available. +*/ +typedef enum { + OS_AIO_NOT_ISSUED, /* Available to be processed by an IO thread. */ + OS_AIO_ISSUED, /* Being processed by an IO thread. */ + OS_AIO_DONE, /* Request processed. */ + OS_AIO_CLAIMED /* Result being returned to client. */ +} os_aio_status; + /* The aio array slot structure */ typedef struct os_aio_slot_struct os_aio_slot_t; @@ -81,6 +103,8 @@ struct os_aio_slot_struct{ ulint pos; /* index of the slot in the aio array */ ibool reserved; /* TRUE if this slot is reserved */ + os_aio_status status; /* Status for current request. Valid when reserved + is TRUE. Used only in simulated aio. */ time_t reservation_time;/* time when reserved */ ulint len; /* length of the block to read or write */ @@ -91,11 +115,11 @@ struct os_aio_slot_struct{ ulint offset_high; /* 32 high bits of file offset */ os_file_t file; /* file where to read or write */ const char* name; /* file name or path */ - ibool io_already_done;/* used only in simulated aio: - TRUE if the physical i/o already - made and only the slot message - needs to be passed to the caller - of os_aio_simulated_handle */ +// ibool io_already_done;/* used only in simulated aio: +// TRUE if the physical i/o already +// made and only the slot message +// needs to be passed to the caller +// of os_aio_simulated_handle */ fil_node_t* message1; /* message which is given by the */ void* message2; /* the requester of an aio operation and which can be used to identify @@ -141,6 +165,13 @@ struct os_aio_array_struct{ /* Array of events used in simulated aio */ static os_event_t* os_aio_segment_wait_events = NULL; +/* Number for the first global segment for reading. */ +const ulint os_aio_first_read_segment = 2; + +/* Number for the first global segment for writing. Set to +2 + os_aio_read_write_threads. */ +ulint os_aio_first_write_segment = 0; + /* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These are NULL when the module has not yet been initialized. */ static os_aio_array_t* os_aio_read_array = NULL; @@ -149,11 +180,17 @@ static os_aio_array_t* os_aio_ibuf_array static os_aio_array_t* os_aio_log_array = NULL; static os_aio_array_t* os_aio_sync_array = NULL; +/* Per thread buffer used for merged IO requests. Used by +os_aio_simulated_handle so that a buffer doesn't have to be allocated +for each request. */ +static char* os_aio_thread_buffer[SRV_MAX_N_IO_THREADS]; +static ulint os_aio_thread_buffer_size[SRV_MAX_N_IO_THREADS]; + static ulint os_aio_n_segments = ULINT_UNDEFINED; /* If the following is TRUE, read i/o handler threads try to wait until a batch of new read requests have been posted */ -static ibool os_aio_recommend_sleep_for_read_threads = FALSE; +static volatile ibool os_aio_recommend_sleep_for_read_threads = FALSE; UNIV_INTERN ulint os_n_file_reads = 0; UNIV_INTERN ulint os_bytes_read_since_printout = 0; @@ -2956,6 +2993,8 @@ os_aio_init( for (i = 0; i < n_segments; i++) { srv_set_io_thread_op_info(i, "not started yet"); + os_aio_thread_buffer[i] = 0; + os_aio_thread_buffer_size[i] = 0; } n_per_seg = n / n_segments; @@ -2964,6 +3003,7 @@ os_aio_init( /* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */ + os_aio_first_write_segment = os_aio_first_read_segment + n_read_threads; os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1); srv_io_thread_function[0] = "insert buffer thread"; @@ -2972,14 +3012,14 @@ os_aio_init( srv_io_thread_function[1] = "log thread"; - os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg, + os_aio_read_array = os_aio_array_create(n_per_seg, n_read_segs); for (i = 2; i < 2 + n_read_segs; i++) { ut_a(i < SRV_MAX_N_IO_THREADS); srv_io_thread_function[i] = "read thread"; } - os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg, + os_aio_write_array = os_aio_array_create(n_per_seg, n_write_segs); for (i = 2 + n_read_segs; i < n_segments; i++) { ut_a(i < SRV_MAX_N_IO_THREADS); @@ -3225,7 +3265,8 @@ loop: slot->buf = buf; slot->offset = offset; slot->offset_high = offset_high; - slot->io_already_done = FALSE; +// slot->io_already_done = FALSE; + slot->status = OS_AIO_NOT_ISSUED; #ifdef WIN_ASYNC_IO control = &(slot->control); @@ -3256,6 +3297,7 @@ os_aio_array_free_slot( ut_ad(slot->reserved); slot->reserved = FALSE; + slot->status = OS_AIO_NOT_ISSUED; array->n_reserved--; @@ -3292,16 +3334,18 @@ os_aio_simulated_wake_handler_thread( segment = os_aio_get_array_and_local_segment(&array, global_segment); - n = array->n_slots / array->n_segments; + n = array->n_slots; /* Look through n slots after the segment * n'th slot */ os_mutex_enter(array->mutex); for (i = 0; i < n; i++) { - slot = os_aio_array_get_nth_slot(array, i + segment * n); + slot = os_aio_array_get_nth_slot(array, i); - if (slot->reserved) { + if (slot->reserved && + (slot->status == OS_AIO_NOT_ISSUED || + slot->status == OS_AIO_DONE)) { /* Found an i/o request */ break; @@ -3311,7 +3355,25 @@ os_aio_simulated_wake_handler_thread( os_mutex_exit(array->mutex); if (i < n) { - os_event_set(os_aio_segment_wait_events[global_segment]); + if (array == os_aio_ibuf_array) { + os_event_set(os_aio_segment_wait_events[0]); + + } else if (array == os_aio_log_array) { + os_event_set(os_aio_segment_wait_events[1]); + + } else if (array == os_aio_read_array) { + ulint x; + for (x = os_aio_first_read_segment; x < os_aio_first_write_segment; x++) + os_event_set(os_aio_segment_wait_events[x]); + + } else if (array == os_aio_write_array) { + ulint x; + for (x = os_aio_first_write_segment; x < os_aio_n_segments; x++) + os_event_set(os_aio_segment_wait_events[x]); + + } else { + ut_a(0); + } } } @@ -3322,8 +3384,6 @@ void os_aio_simulated_wake_handler_threads(void) /*=======================================*/ { - ulint i; - if (os_aio_use_native_aio) { /* We do not use simulated aio: do nothing */ @@ -3332,9 +3392,10 @@ os_aio_simulated_wake_handler_threads(vo os_aio_recommend_sleep_for_read_threads = FALSE; - for (i = 0; i < os_aio_n_segments; i++) { - os_aio_simulated_wake_handler_thread(i); - } + os_aio_simulated_wake_handler_thread(0); + os_aio_simulated_wake_handler_thread(1); + os_aio_simulated_wake_handler_thread(os_aio_first_read_segment); + os_aio_simulated_wake_handler_thread(os_aio_first_write_segment); } /************************************************************************** @@ -3606,7 +3667,7 @@ os_aio_windows_handle( ut_ad(os_aio_validate()); ut_ad(segment < array->n_segments); - n = array->n_slots / array->n_segments; + n = array->n_slots; if (array == os_aio_sync_array) { os_event_wait(os_aio_array_get_nth_slot(array, pos)->event); @@ -3615,12 +3676,12 @@ os_aio_windows_handle( srv_set_io_thread_op_info(orig_seg, "wait Windows aio"); i = os_event_wait_multiple(n, (array->native_events) - + segment * n); + ); } os_mutex_enter(array->mutex); - slot = os_aio_array_get_nth_slot(array, i + segment * n); + slot = os_aio_array_get_nth_slot(array, i); ut_a(slot->reserved); @@ -3685,10 +3746,13 @@ os_aio_simulated_handle( os_aio_slot_t* slot; os_aio_slot_t* slot2; os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE]; + os_aio_slot_t* lowest_request; + os_aio_slot_t* oldest_request; ulint n_consecutive; ulint total_len; ulint offs; ulint lowest_offset; + ulint oldest_offset; ulint biggest_age; ulint age; byte* combined_buf; @@ -3696,6 +3760,7 @@ os_aio_simulated_handle( ibool ret; ulint n; ulint i; + time_t now; segment = os_aio_get_array_and_local_segment(&array, global_segment); @@ -3708,7 +3773,7 @@ restart: ut_ad(os_aio_validate()); ut_ad(segment < array->n_segments); - n = array->n_slots / array->n_segments; + n = array->n_slots; /* Look through n slots after the segment * n'th slot */ @@ -3730,9 +3795,9 @@ restart: done */ for (i = 0; i < n; i++) { - slot = os_aio_array_get_nth_slot(array, i + segment * n); + slot = os_aio_array_get_nth_slot(array, i); - if (slot->reserved && slot->io_already_done) { + if (slot->reserved && slot->status == OS_AIO_DONE) { if (os_aio_print_debug) { fprintf(stderr, @@ -3754,67 +3819,57 @@ restart: then pick the one at the lowest offset. */ biggest_age = 0; - lowest_offset = ULINT_MAX; + now = time(NULL); + oldest_request = lowest_request = NULL; + oldest_offset = lowest_offset = ULINT_MAX; + /* Find the oldest request and the request with the smallest offset */ for (i = 0; i < n; i++) { - slot = os_aio_array_get_nth_slot(array, i + segment * n); + slot = os_aio_array_get_nth_slot(array, i); - if (slot->reserved) { - age = (ulint)difftime(time(NULL), - slot->reservation_time); + if (slot->reserved && slot->status == OS_AIO_NOT_ISSUED) { + age = (ulint)difftime(now, slot->reservation_time); if ((age >= 2 && age > biggest_age) || (age >= 2 && age == biggest_age - && slot->offset < lowest_offset)) { + && slot->offset < oldest_offset)) { /* Found an i/o request */ - consecutive_ios[0] = slot; - - n_consecutive = 1; - biggest_age = age; - lowest_offset = slot->offset; + oldest_request = slot; + oldest_offset = slot->offset; } - } - } - - if (n_consecutive == 0) { - /* There were no old requests. Look for an i/o request at the - lowest offset in the array (we ignore the high 32 bits of the - offset in these heuristics) */ - - lowest_offset = ULINT_MAX; - - for (i = 0; i < n; i++) { - slot = os_aio_array_get_nth_slot(array, - i + segment * n); - - if (slot->reserved && slot->offset < lowest_offset) { + /* Look for an i/o request at the lowest offset in the array + * (we ignore the high 32 bits of the offset) */ + if (slot->offset < lowest_offset) { /* Found an i/o request */ - consecutive_ios[0] = slot; - - n_consecutive = 1; - + lowest_request = slot; lowest_offset = slot->offset; } } } - if (n_consecutive == 0) { + if (!lowest_request && !oldest_request) { /* No i/o requested at the moment */ goto wait_for_io; } - slot = consecutive_ios[0]; + if (oldest_request) { + slot = oldest_request; + } else { + slot = lowest_request; + } + consecutive_ios[0] = slot; + n_consecutive = 1; /* Check if there are several consecutive blocks to read or write */ consecutive_loop: for (i = 0; i < n; i++) { - slot2 = os_aio_array_get_nth_slot(array, i + segment * n); + slot2 = os_aio_array_get_nth_slot(array, i); if (slot2->reserved && slot2 != slot && slot2->offset == slot->offset + slot->len @@ -3822,7 +3877,8 @@ consecutive_loop: && slot->offset + slot->len > slot->offset && slot2->offset_high == slot->offset_high && slot2->type == slot->type - && slot2->file == slot->file) { + && slot2->file == slot->file + && slot2->status == OS_AIO_NOT_ISSUED) { /* Found a consecutive i/o request */ @@ -3851,6 +3907,8 @@ consecutive_loop: for (i = 0; i < n_consecutive; i++) { total_len += consecutive_ios[i]->len; + ut_a(consecutive_ios[i]->status == OS_AIO_NOT_ISSUED); + consecutive_ios[i]->status = OS_AIO_ISSUED; } if (n_consecutive == 1) { @@ -3858,7 +3916,14 @@ consecutive_loop: combined_buf = slot->buf; combined_buf2 = NULL; } else { - combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE); + if ((total_len + UNIV_PAGE_SIZE) > os_aio_thread_buffer_size[global_segment]) { + if (os_aio_thread_buffer[global_segment]) + ut_free(os_aio_thread_buffer[global_segment]); + + os_aio_thread_buffer[global_segment] = ut_malloc(total_len + UNIV_PAGE_SIZE); + os_aio_thread_buffer_size[global_segment] = total_len + UNIV_PAGE_SIZE; + } + combined_buf2 = os_aio_thread_buffer[global_segment]; ut_a(combined_buf2); @@ -3869,6 +3934,9 @@ consecutive_loop: this assumes that there is just one i/o-handler thread serving a single segment of slots! */ + ut_a(slot->reserved); + ut_a(slot->status == OS_AIO_ISSUED); + os_mutex_exit(array->mutex); if (slot->type == OS_FILE_WRITE && n_consecutive > 1) { @@ -3924,16 +3992,13 @@ consecutive_loop: } } - if (combined_buf2) { - ut_free(combined_buf2); - } - os_mutex_enter(array->mutex); /* Mark the i/os done in slots */ for (i = 0; i < n_consecutive; i++) { - consecutive_ios[i]->io_already_done = TRUE; + ut_a(consecutive_ios[i]->status == OS_AIO_ISSUED); + consecutive_ios[i]->status = OS_AIO_DONE; } /* We return the messages for the first slot now, and if there were @@ -3943,6 +4008,8 @@ consecutive_loop: slot_io_done: ut_a(slot->reserved); + ut_a(slot->status == OS_AIO_DONE); + slot->status = OS_AIO_CLAIMED; *message1 = slot->message1; *message2 = slot->message2; === modified file 'storage/xtradb/rem/rem0cmp.c' --- a/storage/xtradb/rem/rem0cmp.c 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/rem/rem0cmp.c 2009-06-25 01:43:25 +0000 @@ -892,10 +892,11 @@ cmp_rec_rec_with_match( matched fields; when the function returns, contains the value the for current comparison */ - ulint* matched_bytes) /* in/out: number of already matched + ulint* matched_bytes, /* in/out: number of already matched bytes within the first field not completely matched; when the function returns, contains the value for the current comparison */ + ulint stats_method) { #ifndef UNIV_HOTBACKUP ulint rec1_n_fields; /* the number of fields in rec */ @@ -989,7 +990,11 @@ cmp_rec_rec_with_match( if (rec1_f_len == rec2_f_len) { - goto next_field; + if (stats_method == SRV_STATS_METHOD_NULLS_EQUAL) { + goto next_field; + } else { + ret = -1; + } } else if (rec2_f_len == UNIV_SQL_NULL) { === modified file 'storage/xtradb/row/row0mysql.c' --- a/storage/xtradb/row/row0mysql.c 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/row/row0mysql.c 2009-06-25 01:43:25 +0000 @@ -854,6 +854,9 @@ row_update_statistics_if_needed( table->stat_modified_counter = counter + 1; + if (!srv_stats_auto_update) + return; + /* Calculate new statistics if 1 / 16 of table has been modified since the last time a statistics batch was run, or if stat_modified_counter > 2 000 000 000 (to avoid wrap-around). === modified file 'storage/xtradb/scripts/install_innodb_plugins.sql' --- a/storage/xtradb/scripts/install_innodb_plugins.sql 2009-01-29 16:54:13 +0000 +++ b/storage/xtradb/scripts/install_innodb_plugins.sql 2009-06-25 01:43:25 +0000 @@ -12,3 +12,5 @@ INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES_BLOB SONAME 'ha_innodb.so'; INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES_INDEX SONAME 'ha_innodb.so'; INSTALL PLUGIN innodb_rseg SONAME 'ha_innodb.so'; +INSTALL PLUGIN innodb_table_stats SONAME 'ha_innodb.so'; +INSTALL PLUGIN innodb_index_stats SONAME 'ha_innodb.so'; === modified file 'storage/xtradb/srv/srv0srv.c' --- a/storage/xtradb/srv/srv0srv.c 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/srv/srv0srv.c 2009-07-06 05:47:15 +0000 @@ -285,6 +285,7 @@ Value 10 should be good if there are les computer. Bigger computers need bigger values. Value 0 will disable the concurrency check. */ +UNIV_INTERN ibool srv_thread_concurrency_timer_based = FALSE; UNIV_INTERN ulong srv_thread_concurrency = 0; UNIV_INTERN ulong srv_commit_concurrency = 0; @@ -336,6 +337,8 @@ UNIV_INTERN ibool srv_innodb_status = FA /* When estimating number of different key values in an index, sample this many index pages */ UNIV_INTERN unsigned long long srv_stats_sample_pages = 8; +UNIV_INTERN ulint srv_stats_method = 0; +UNIV_INTERN ulint srv_stats_auto_update = 1; UNIV_INTERN ibool srv_use_doublewrite_buf = TRUE; UNIV_INTERN ibool srv_use_checksums = TRUE; @@ -361,14 +364,18 @@ UNIV_INTERN ulint srv_flush_neighbor_pag UNIV_INTERN ulint srv_enable_unsafe_group_commit = 0; /* 0:disable 1:enable */ UNIV_INTERN ulint srv_read_ahead = 3; /* 1: random 2: linear 3: Both */ -UNIV_INTERN ulint srv_adaptive_checkpoint = 0; /* 0:disable 1:enable */ +UNIV_INTERN ulint srv_adaptive_checkpoint = 0; /* 0: none 1: reflex 2: estimate */ + +UNIV_INTERN ulint srv_expand_import = 0; /* 0:disable 1:enable */ UNIV_INTERN ulint srv_extra_rsegments = 0; /* extra rseg for users */ +UNIV_INTERN ulint srv_dict_size_limit = 0; /*-------------------------------------------*/ UNIV_INTERN ulong srv_n_spin_wait_rounds = 20; UNIV_INTERN ulong srv_n_free_tickets_to_enter = 500; UNIV_INTERN ulong srv_thread_sleep_delay = 10000; UNIV_INTERN ulint srv_spin_wait_delay = 5; +UNIV_INTERN ulint srv_spins_microsec = 50; UNIV_INTERN ibool srv_priority_boost = TRUE; #ifdef UNIV_DEBUG @@ -657,6 +664,47 @@ are indexed by the type of the thread. * UNIV_INTERN ulint srv_n_threads_active[SRV_MASTER + 1]; UNIV_INTERN ulint srv_n_threads[SRV_MASTER + 1]; +static +void +srv_align_spins_microsec(void) +{ + ulint start_sec, end_sec; + ulint start_usec, end_usec; + ib_uint64_t usecs; + + /* change temporary */ + srv_spins_microsec = 1; + + if (ut_usectime(&start_sec, &start_usec)) { + srv_spins_microsec = 50; + goto end; + } + + ut_delay(100000); + + if (ut_usectime(&end_sec, &end_usec)) { + srv_spins_microsec = 50; + goto end; + } + + usecs = (end_sec - start_sec) * 1000000LL + (end_usec - start_usec); + + if (usecs) { + srv_spins_microsec = 100000 / usecs; + if (srv_spins_microsec == 0) + srv_spins_microsec = 1; + if (srv_spins_microsec > 50) + srv_spins_microsec = 50; + } else { + srv_spins_microsec = 50; + } +end: + if (srv_spins_microsec != 50) + fprintf(stderr, + "InnoDB: unit of spin count at ut_delay() is aligned to %lu\n", + srv_spins_microsec); +} + /************************************************************************* Sets the info describing an i/o thread current state. */ UNIV_INTERN @@ -889,6 +937,8 @@ srv_init(void) dict_table_t* table; ulint i; + srv_align_spins_microsec(); + srv_sys = mem_alloc(sizeof(srv_sys_t)); kernel_mutex_temp = mem_alloc(sizeof(mutex_t)); @@ -1009,6 +1059,75 @@ UNIV_INTERN ulong srv_max_purge_lag = 0 /************************************************************************* Puts an OS thread to wait if there are too many concurrent threads (>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */ + +#ifdef INNODB_RW_LOCKS_USE_ATOMICS +static void +enter_innodb_with_tickets(trx_t* trx) +{ + trx->declared_to_be_inside_innodb = TRUE; + trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER; + return; +} + +static void +srv_conc_enter_innodb_timer_based(trx_t* trx) +{ + lint conc_n_threads; + ibool has_yielded = FALSE; + ulint has_slept = 0; + + if (trx->declared_to_be_inside_innodb) { + ut_print_timestamp(stderr); + fputs( +" InnoDB: Error: trying to declare trx to enter InnoDB, but\n" +"InnoDB: it already is declared.\n", stderr); + trx_print(stderr, trx, 0); + putc('\n', stderr); + } +retry: + if (srv_conc_n_threads < (lint) srv_thread_concurrency) { + conc_n_threads = __sync_add_and_fetch(&srv_conc_n_threads, 1); + if (conc_n_threads <= (lint) srv_thread_concurrency) { + enter_innodb_with_tickets(trx); + return; + } + __sync_add_and_fetch(&srv_conc_n_threads, -1); + } + if (!has_yielded) + { + has_yielded = TRUE; + os_thread_yield(); + goto retry; + } + if (trx->has_search_latch + || NULL != UT_LIST_GET_FIRST(trx->trx_locks)) { + + conc_n_threads = __sync_add_and_fetch(&srv_conc_n_threads, 1); + enter_innodb_with_tickets(trx); + return; + } + if (has_slept < 2) + { + trx->op_info = "sleeping before entering InnoDB"; + os_thread_sleep(10000); + trx->op_info = ""; + has_slept++; + } + conc_n_threads = __sync_add_and_fetch(&srv_conc_n_threads, 1); + enter_innodb_with_tickets(trx); + return; +} + +static void +srv_conc_exit_innodb_timer_based(trx_t* trx) +{ + __sync_add_and_fetch(&srv_conc_n_threads, -1); + trx->declared_to_be_inside_innodb = FALSE; + trx->n_tickets_to_enter_innodb = 0; + return; +} +#endif + UNIV_INTERN void srv_conc_enter_innodb( @@ -1039,6 +1158,13 @@ srv_conc_enter_innodb( return; } +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + if (srv_thread_concurrency_timer_based) { + srv_conc_enter_innodb_timer_based(trx); + return; + } +#endif + os_fast_mutex_lock(&srv_conc_mutex); retry: if (trx->declared_to_be_inside_innodb) { @@ -1182,6 +1308,14 @@ srv_conc_force_enter_innodb( } ut_ad(srv_conc_n_threads >= 0); +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + if (srv_thread_concurrency_timer_based) { + __sync_add_and_fetch(&srv_conc_n_threads, 1); + trx->declared_to_be_inside_innodb = TRUE; + trx->n_tickets_to_enter_innodb = 1; + return; + } +#endif os_fast_mutex_lock(&srv_conc_mutex); @@ -1215,6 +1349,13 @@ srv_conc_force_exit_innodb( return; } +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + if (srv_thread_concurrency_timer_based) { + srv_conc_exit_innodb_timer_based(trx); + return; + } +#endif + os_fast_mutex_lock(&srv_conc_mutex); ut_ad(srv_conc_n_threads > 0); @@ -1934,6 +2075,7 @@ srv_export_innodb_status(void) export_vars.innodb_data_reads = os_n_file_reads; export_vars.innodb_data_writes = os_n_file_writes; export_vars.innodb_data_written = srv_data_written; + export_vars.innodb_dict_tables= (dict_sys ? UT_LIST_GET_LEN(dict_sys->table_LRU) : 0); export_vars.innodb_buffer_pool_read_requests = buf_pool->n_page_gets; export_vars.innodb_buffer_pool_write_requests = srv_buf_pool_write_requests; @@ -2348,6 +2490,8 @@ srv_master_thread( ibool skip_sleep = FALSE; ulint i; + ib_uint64_t lsn_old; + ib_uint64_t oldest_lsn; #ifdef UNIV_DEBUG_THREAD_CREATION @@ -2365,6 +2509,9 @@ srv_master_thread( mutex_exit(&kernel_mutex); + mutex_enter(&(log_sys->mutex)); + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); loop: /*****************************************************************/ /* ---- When there is database activity by users, we cycle in this @@ -2399,6 +2546,19 @@ loop: if (!skip_sleep) { os_thread_sleep(1000000); + + /* + mutex_enter(&(log_sys->mutex)); + oldest_lsn = buf_pool_get_oldest_modification(); + ib_uint64_t lsn = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + + if(oldest_lsn) + fprintf(stderr, + "InnoDB flush: age pct: %lu, lsn progress: %lu\n", + (lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age, + lsn - lsn_old); + */ } skip_sleep = FALSE; @@ -2437,14 +2597,15 @@ loop: + log_sys->n_pending_writes; n_ios = log_sys->n_log_ios + buf_pool->n_pages_read + buf_pool->n_pages_written; - if (n_pend_ios < 3 && (n_ios - n_ios_old < PCT_IO(5))) { + if (n_pend_ios < PCT_IO(3) && (n_ios - n_ios_old < PCT_IO(5))) { srv_main_thread_op_info = "doing insert buffer merge"; ibuf_contract_for_n_pages( TRUE, PCT_IBUF_IO((srv_insert_buffer_batch_size / 4))); srv_main_thread_op_info = "flushing log"; - log_buffer_flush_to_disk(); + /* No fsync when srv_flush_log_at_trx_commit != 1 */ + log_buffer_flush_maybe_sync(); } if (UNIV_UNLIKELY(buf_get_modified_ratio_pct() @@ -2462,13 +2623,16 @@ loop: iteration of this loop. */ skip_sleep = TRUE; - } else if (srv_adaptive_checkpoint) { + mutex_enter(&(log_sys->mutex)); + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + } else if (srv_adaptive_checkpoint == 1) { /* Try to keep modified age not to exceed max_checkpoint_age * 7/8 line */ mutex_enter(&(log_sys->mutex)); - + lsn_old = log_sys->lsn; oldest_lsn = buf_pool_get_oldest_modification(); if (oldest_lsn == 0) { @@ -2504,7 +2668,93 @@ loop: mutex_exit(&(log_sys->mutex)); } } + } else if (srv_adaptive_checkpoint == 2) { + /* Try to keep modified age not to exceed + max_checkpoint_age * 7/8 line */ + + mutex_enter(&(log_sys->mutex)); + + oldest_lsn = buf_pool_get_oldest_modification(); + if (oldest_lsn == 0) { + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + + } else { + if ((log_sys->lsn - oldest_lsn) + > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) { + /* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */ + /* We should not flush from here. */ + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + } else if ((log_sys->lsn - oldest_lsn) + > (log_sys->max_checkpoint_age)/2 ) { + + /* defence line (max_checkpoint_age * 1/2) */ + ib_uint64_t lsn = log_sys->lsn; + + mutex_exit(&(log_sys->mutex)); + + ib_uint64_t level, bpl; + buf_page_t* bpage; + + mutex_enter(&flush_list_mutex); + + level = 0; + bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); + + while (bpage != NULL) { + ib_uint64_t oldest_modification = bpage->oldest_modification; + if (oldest_modification != 0) { + level += log_sys->max_checkpoint_age + - (lsn - oldest_modification); + } + bpage = UT_LIST_GET_NEXT(flush_list, bpage); + } + + if (level) { + bpl = ((ib_uint64_t) UT_LIST_GET_LEN(buf_pool->flush_list) + * UT_LIST_GET_LEN(buf_pool->flush_list) + * (lsn - lsn_old)) / level; + } else { + bpl = 0; + } + + mutex_exit(&flush_list_mutex); + + if (!srv_use_doublewrite_buf) { + /* flush is faster than when doublewrite */ + bpl = (bpl * 3) / 4; + } + + if (bpl) { +retry_flush_batch: + n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, + bpl, + oldest_lsn + (lsn - lsn_old)); + if (n_pages_flushed == ULINT_UNDEFINED) { + os_thread_sleep(5000); + goto retry_flush_batch; + } + } + + lsn_old = lsn; + /* + fprintf(stderr, + "InnoDB flush: age pct: %lu, lsn progress: %lu, blocks to flush:%llu\n", + (lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age, + lsn - lsn_old, bpl); + */ + } else { + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); + } + } + + } else { + mutex_enter(&(log_sys->mutex)); + lsn_old = log_sys->lsn; + mutex_exit(&(log_sys->mutex)); } if (srv_activity_count == old_activity_count) { @@ -2537,7 +2787,8 @@ loop: buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), IB_ULONGLONG_MAX); srv_main_thread_op_info = "flushing log"; - log_buffer_flush_to_disk(); + /* No fsync when srv_flush_log_at_trx_commit != 1 */ + log_buffer_flush_maybe_sync(); } /* We run a batch of insert buffer merge every 10 seconds, @@ -2547,7 +2798,8 @@ loop: ibuf_contract_for_n_pages(TRUE, PCT_IBUF_IO((srv_insert_buffer_batch_size / 4))); srv_main_thread_op_info = "flushing log"; - log_buffer_flush_to_disk(); + /* No fsync when srv_flush_log_at_trx_commit != 1 */ + log_buffer_flush_maybe_sync(); /* We run a full purge every 10 seconds, even if the server were active */ @@ -2718,7 +2970,14 @@ flush_loop: srv_main_thread_op_info = "flushing log"; - log_buffer_flush_to_disk(); + current_time = time(NULL); + if (difftime(current_time, last_flush_time) > 1) { + log_buffer_flush_to_disk(); + last_flush_time = current_time; + } else { + /* No fsync when srv_flush_log_at_trx_commit != 1 */ + log_buffer_flush_maybe_sync(); + } srv_main_thread_op_info = "making checkpoint"; === modified file 'storage/xtradb/srv/srv0start.c' --- a/storage/xtradb/srv/srv0start.c 2009-06-09 15:08:46 +0000 +++ b/storage/xtradb/srv/srv0start.c 2009-08-03 20:09:53 +0000 @@ -1269,7 +1269,7 @@ innobase_start_or_create_for_mysql(void) os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD * srv_n_file_io_threads, srv_n_read_io_threads, srv_n_write_io_threads, - SRV_MAX_N_PENDING_SYNC_IOS * 8); + SRV_MAX_N_PENDING_SYNC_IOS); } else { os_aio_init(SRV_N_PENDING_IOS_PER_THREAD * srv_n_file_io_threads, === modified file 'storage/xtradb/sync/sync0sync.c' --- a/storage/xtradb/sync/sync0sync.c 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/sync/sync0sync.c 2009-06-25 01:43:25 +0000 @@ -1081,6 +1081,12 @@ sync_thread_add_level( case SYNC_TRX_SYS_HEADER: case SYNC_FILE_FORMAT_TAG: case SYNC_DOUBLEWRITE: + case SYNC_BUF_LRU_LIST: + case SYNC_BUF_FLUSH_LIST: + case SYNC_BUF_PAGE_HASH: + case SYNC_BUF_FREE_LIST: + case SYNC_BUF_ZIP_FREE: + case SYNC_BUF_ZIP_HASH: case SYNC_BUF_POOL: case SYNC_SEARCH_SYS: case SYNC_SEARCH_SYS_CONF: @@ -1107,7 +1113,7 @@ sync_thread_add_level( /* Either the thread must own the buffer pool mutex (buf_pool_mutex), or it is allowed to latch only ONE buffer block (block->mutex or buf_pool_zip_mutex). */ - ut_a((sync_thread_levels_contain(array, SYNC_BUF_POOL) + ut_a((sync_thread_levels_contain(array, SYNC_BUF_LRU_LIST) && sync_thread_levels_g(array, SYNC_BUF_BLOCK - 1)) || sync_thread_levels_g(array, SYNC_BUF_BLOCK)); break; === modified file 'storage/xtradb/ut/ut0ut.c' --- a/storage/xtradb/ut/ut0ut.c 2009-03-26 06:11:11 +0000 +++ b/storage/xtradb/ut/ut0ut.c 2009-06-25 01:43:25 +0000 @@ -372,6 +372,8 @@ ut_get_year_month_day( /***************************************************************** Runs an idle loop on CPU. The argument gives the desired delay in microseconds on 100 MHz Pentium + Visual C++. */ +extern ulint srv_spins_microsec; + UNIV_INTERN ulint ut_delay( @@ -383,7 +385,11 @@ ut_delay( j = 0; - for (i = 0; i < delay * 50; i++) { + for (i = 0; i < delay * srv_spins_microsec; i++) { +#if (defined (__i386__) || defined (__x86_64__)) && defined (__GNUC__) + /* it is equal to the instruction 'pause' */ + __asm__ __volatile__ ("rep; nop"); +#endif j += i; }
participants (1)
-
knielsen@knielsen-hq.org