Hi Marko, I looked a bit more at the idea that we have discussed a couple of times, of storing the binlog in an InnoDB tablespace to avoid the need for two-phase commit between binlog and InnoDB and save one (or potentially both) fsync()s during a commit. I managed to get some InnoDB code written that is able to create a tablespace and write to it; the patch is below, or on GitHub: https://github.com/MariaDB/server/commits/knielsen_binlog_in_engine https://github.com/MariaDB/server/commit/95958c3842ebf0f7e358d6c3f51b887bd9948845 This is far from a complete patch, just an exercise for me on how InnoDB tablespaces and mini-transactions work in detail. But it turned out to be useful for me to get started and find a few questions to ask. My general approach to writing is to use fsp_page_create() for the first write to a page, and then buf_page_get_gen() for subsequent writes. But maybe this should be refined for the actual implementation. I'm wondering whether fsp_page_create() perhaps does too much; you mentioned earlier that some parts of the page header could be simplified/omitted. And maybe there is a way to pin the current page in the buffer pool so buf_page_get_gen() is not needed for every write? I'm currently passing RW_SX_LATCH to buf_page_get_gen() (otherwise I got an assertion failure when writing). I'm not sure though how these latches work, or if binlog writing would need such latches; maybe it makes more sense to have a simple mutex protecting page access? Another assertion failure was fixed by doing mtr.set_named_space() before writing. Again, I'm not sure what this does exactly, or whether it's appropriate. I tried in this patch to reserve 2 "special" tablespace IDs for the binlog tablespaces. The idea would be to cycle between them, keeping at most the last two tablespaces active. But do the tablespace IDs appear in the redo log, and are they used for recovery? In that case, I assume that all binlog tablespaces written since the last InnoDB checkpoint will need a unique tablespace ID? 
So maybe 2 is too few. I was thinking maybe the binlog could allocate new tablespace IDs as necessary, but re-use them after each InnoDB checkpoint. This would avoid wasting ids and eventually hitting the 2**32 limit. - Kristian. commit 95958c3842ebf0f7e358d6c3f51b887bd9948845 (HEAD -> binlog_in_inno, origin/knielsen_binlog_in_engine) Author: Kristian Nielsen <knielsen@knielsen-hq.org> Date: Sun Feb 25 17:41:50 2024 +0100 Binlog in Engine: Very first sketch, able to create and write an InnoDB tablespace Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org> diff --git a/mysql-test/suite/binlog/t/binlog_in_engine.test b/mysql-test/suite/binlog/t/binlog_in_engine.test new file mode 100644 index 00000000000..947139c9bcc --- /dev/null +++ b/mysql-test/suite/binlog/t/binlog_in_engine.test @@ -0,0 +1,11 @@ +--source include/have_innodb.inc +--source include/have_binlog_format_mixed.inc + +CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB; +INSERT INTO t1 VALUES (1); +BEGIN; +INSERT INTO t1 VALUES (2); +INSERT INTO t1 VALUES (3); +COMMIT; +SELECT * FROM t1 ORDER BY a; +DROP TABLE t1; diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index bd43429eb5d..09d8fdaff39 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1626,7 +1626,8 @@ static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) { const auto space_id= space->id; - ut_ad(space_id <= SRV_SPACE_ID_UPPER_BOUND); + ut_ad(space_id <= SRV_SPACE_ID_UPPER_BOUND || + space_id == SRV_SPACE_ID_BINLOG0 || space_id == SRV_SPACE_ID_BINLOG1); bool may_have_skipped= false; ulint max_n_flush= srv_io_capacity; diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 0ce54df6574..e83225a4883 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -184,7 +184,7 @@ it is an absolute path. 
*/ const char* fil_path_to_mysql_datadir; /** Common InnoDB file extensions */ -const char* dot_ext[] = { "", ".ibd", ".isl", ".cfg" }; +const char* dot_ext[] = { "", ".ibd", ".isl", ".cfg", ".ibb" }; /** Number of pending tablespace flushes */ Atomic_counter<ulint> fil_n_pending_tablespace_flushes; @@ -1044,6 +1044,9 @@ fil_space_t *fil_space_t::create(uint32_t id, uint32_t flags, if (UNIV_LIKELY(id <= fil_system.max_assigned_id)) { break; } + if (id == SRV_SPACE_ID_BINLOG0 || id == SRV_SPACE_ID_BINLOG1) { + break; + } if (UNIV_UNLIKELY(srv_operation == SRV_OPERATION_BACKUP)) { break; } @@ -1603,9 +1606,10 @@ inline void mtr_t::log_file_op(mfile_type_t type, uint32_t space_id, ut_ad(!(byte(type) & 15)); /* fil_name_parse() requires that there be at least one path - separator and that the file path end with ".ibd". */ + separator and that the file path end with ".ibd" or "ibb". */ ut_ad(strchr(path, '/')); - ut_ad(!strcmp(&path[strlen(path) - strlen(DOT_IBD)], DOT_IBD)); + ut_ad(!strcmp(&path[strlen(path) - strlen(DOT_IBD)], DOT_IBD) || + !strcmp(&path[strlen(path) - strlen(DOT_IBB)], DOT_IBB)); m_modifications= true; if (!is_logged()) diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc index 787bda53895..f1bb42b3a15 100644 --- a/storage/innobase/fsp/fsp0fsp.cc +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -3763,3 +3763,129 @@ void fsp_shrink_temp_space() mtr.commit(); sql_print_information("InnoDB: Temporary tablespace truncated successfully"); } + + + +fil_space_t* binlog_space; +buf_block_t *binlog_cur_block; +uint32_t binlog_cur_page_no; +uint32_t binlog_cur_page_offset; + +/** Create a binlog tablespace file +@param[in] name file name +@return DB_SUCCESS or error code */ +dberr_t fsp_binlog_tablespace_create(const char* name) +{ + pfs_os_file_t fh; + bool ret; + + uint32_t size= (1<<20) >> srv_page_size_shift /* ToDo --max-binlog-size */; + if(srv_read_only_mode) + return DB_ERROR; + + os_file_create_subdirs_if_needed(name); + + /* ToDo: 
Do we need here an mtr.log_file_op(FILE_CREATE) like in fil_ibd_create(()? */ + fh = os_file_create( + innodb_data_file_key, + name, + OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT, + OS_FILE_AIO, OS_DATA_FILE, srv_read_only_mode, &ret); + + if (!ret) { + os_file_close(fh); + return DB_ERROR; + } + + /* ToDo: Enryption? */ + fil_encryption_t mode= FIL_ENCRYPTION_OFF; + fil_space_crypt_t* crypt_data= nullptr; + + /* We created the binlog file and now write it full of zeros */ + if (!os_file_set_size(name, fh, + os_offset_t{size} << srv_page_size_shift)) { + ib::error() << "Unable to allocate " << name; + os_file_close(fh); + os_file_delete(innodb_data_file_key, name); + return DB_ERROR; + } + + mysql_mutex_lock(&fil_system.mutex); + uint32_t space_id= SRV_SPACE_ID_BINLOG0; + if (!(binlog_space= fil_space_t::create(space_id, + ( FSP_FLAGS_FCRC32_MASK_MARKER | + FSP_FLAGS_FCRC32_PAGE_SSIZE()), + FIL_TYPE_TABLESPACE, crypt_data, + mode, true))) { + mysql_mutex_unlock(&fil_system.mutex); + return DB_ERROR; + } + + fil_node_t* node = binlog_space->add(name, fh, size, false, true); + IF_WIN(node->find_metadata(), node->find_metadata(fh, true)); + mysql_mutex_unlock(&fil_system.mutex); + + binlog_cur_page_no= 0; + binlog_cur_page_offset= FIL_PAGE_DATA; + return DB_SUCCESS; +} + +void fsp_binlog_write_start(uint32_t page_no, + const uchar *data, uint32_t len, mtr_t *mtr) +{ + buf_block_t *block= fsp_page_create(binlog_space, page_no, mtr); + mtr->memcpy<mtr_t::MAYBE_NOP>(*block, FIL_PAGE_DATA + block->page.frame, + data, len); + binlog_cur_block= block; +} + +void fsp_binlog_write_offset(uint32_t page_no, uint32_t offset, + const uchar *data, uint32_t len, mtr_t *mtr) +{ + dberr_t err; + /* ToDo: Is RW_SX_LATCH appropriate here? 
*/ + buf_block_t *block= buf_page_get_gen(page_id_t{binlog_space->id, page_no}, + 0, RW_SX_LATCH, binlog_cur_block, + BUF_GET, mtr, &err); + ut_a(err == DB_SUCCESS); + mtr->memcpy<mtr_t::MAYBE_NOP>(*block, + offset + block->page.frame, + data, len); +} + +void fsp_binlog_append(const uchar *data, uint32_t len, mtr_t *mtr) +{ + ut_ad(binlog_cur_page_offset <= srv_page_size - FIL_PAGE_DATA_END); + uint32_t remain= ((uint32_t)srv_page_size - FIL_PAGE_DATA_END) - + binlog_cur_page_offset; + // ToDo: Some kind of mutex to protect binlog access. + while (len > 0) { + if (remain < 4) { + binlog_cur_page_offset= FIL_PAGE_DATA; + remain= ((uint32_t)srv_page_size - FIL_PAGE_DATA_END) - + binlog_cur_page_offset; + ++binlog_cur_page_no; + } + uint32_t this_len= std::min<uint32_t>(len, remain); + if (binlog_cur_page_offset == FIL_PAGE_DATA) + fsp_binlog_write_start(binlog_cur_page_no, data, this_len, mtr); + else + fsp_binlog_write_offset(binlog_cur_page_no, binlog_cur_page_offset, + data, this_len, mtr); + len-= this_len; + data+= this_len; + binlog_cur_page_offset+= this_len; + } +} + + +void fsp_binlog_test(const uchar *data, uint32_t len) +{ + mtr_t mtr; + mtr.start(); + if (!binlog_space) + fsp_binlog_tablespace_create("./binlog-000000.ibb"); + mtr.set_named_space(binlog_space); + fsp_binlog_append(data, len, &mtr); + mtr.commit(); +} diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 93127bb1c3a..df2bd07d2dc 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -4481,6 +4481,10 @@ innobase_commit( if (commit_trx || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { + /* ToDo: This is just a random very initial test of writing + something into a binlog tablespace. */ + if (!opt_bootstrap) + fsp_binlog_test((const uchar *)"Hulubulu!!?!", 12); /* Run the fast part of commit if we did not already. 
*/ if (!trx->active_commit_ordered) { innobase_commit_ordered_2(trx, thd); diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index f3660eff7c6..17b35f2f892 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -1129,10 +1129,12 @@ enum ib_extention { NO_EXT = 0, IBD = 1, ISL = 2, - CFG = 3 + CFG = 3, + IBB = 4 }; extern const char* dot_ext[]; #define DOT_IBD dot_ext[IBD] +#define DOT_IBB dot_ext[IBB] #define DOT_ISL dot_ext[ISL] #define DOT_CFG dot_ext[CFG] diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h index ddc45e53fe6..26a45518ba2 100644 --- a/storage/innobase/include/fsp0fsp.h +++ b/storage/innobase/include/fsp0fsp.h @@ -579,6 +579,8 @@ void fsp_system_tablespace_truncate(); /** Truncate the temporary tablespace */ void fsp_shrink_temp_space(); +extern void fsp_binlog_test(const uchar *data, uint32_t len); + #ifndef UNIV_DEBUG # define fsp_init_file_page(space, block, mtr) fsp_init_file_page(block, mtr) #endif diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h index 757ead55d03..e3d45796190 100644 --- a/storage/innobase/include/fsp0types.h +++ b/storage/innobase/include/fsp0types.h @@ -27,8 +27,12 @@ Created May 26, 2009 Vasil Dimov #pragma once #include "ut0byte.h" -/** All persistent tablespaces have a smaller fil_space_t::id than this. */ +/** All persistent tablespaces (except binlog tablespaces) have a smaller +fil_space_t::id than this. */ constexpr uint32_t SRV_SPACE_ID_UPPER_BOUND= 0xFFFFFFF0U; +/** Binlog tablespaces. */ +constexpr uint32_t SRV_SPACE_ID_BINLOG0 = SRV_SPACE_ID_UPPER_BOUND + 1; +constexpr uint32_t SRV_SPACE_ID_BINLOG1 = SRV_SPACE_ID_UPPER_BOUND + 2; /** The fil_space_t::id of the innodb_temporary tablespace. 
*/ constexpr uint32_t SRV_TMP_SPACE_ID= 0xFFFFFFFEU; diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h index e2419309764..86f6e3794f6 100644 --- a/storage/innobase/include/mtr0log.h +++ b/storage/innobase/include/mtr0log.h @@ -25,7 +25,7 @@ Mini-transaction log record encoding and decoding #include "mtr0mtr.h" /** The smallest invalid page identifier for persistent tablespaces */ -constexpr page_id_t end_page_id{SRV_SPACE_ID_UPPER_BOUND, 0}; +constexpr page_id_t end_page_id{SRV_SPACE_ID_BINLOG1, 0}; /** The minimum 2-byte integer (0b10xxxxxx xxxxxxxx) */ constexpr uint32_t MIN_2BYTE= 1 << 7; diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index ef31a4d00c1..310acb73071 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -2973,7 +2973,8 @@ recv_sys_t::parse_mtr_result recv_sys_t::parse(source &l, bool if_exists) if (is_predefined_tablespace(space_id)) goto file_rec_error; - if (fnend - fn < 4 || memcmp(fnend - 4, DOT_IBD, 4)) + if (fnend - fn < 4 || + (memcmp(fnend - 4, DOT_IBD, 4) && memcmp(fnend - 4, DOT_IBB, 4))) goto file_rec_error; if (UNIV_UNLIKELY(!recv_needed_recovery && srv_read_only_mode))