
At file:///home/psergey/dev2/maria-5.3-dsmrr-cpk-r4/ ------------------------------------------------------------ revno: 2808 revision-id: psergey@askmonty.org-20100808071354-v1d4objmg07ii10i parent: psergey@askmonty.org-20100717210544-rb0u05miqoxnsug3 committer: Sergey Petrunya <psergey@askmonty.org> branch nick: maria-5.3-dsmrr-cpk-r4 timestamp: Sun 2010-08-08 11:13:54 +0400 message: DS-MRR, key-ordered retrievals: commit for buildbot === modified file 'sql/multi_range_read.cc' --- a/sql/multi_range_read.cc 2010-07-17 21:05:44 +0000 +++ b/sql/multi_range_read.cc 2010-08-08 07:13:54 +0000 @@ -287,6 +287,84 @@ * DS-MRR implementation ***************************************************************************/ +void SimpleBuffer::write(const uchar *data, size_t bytes) +{ + DBUG_ASSERT(have_space_for(bytes)); + + if (direction == -1) + write_pos -= bytes; + + memcpy(write_pos, data, bytes); + + if (direction == 1) + write_pos += bytes; +} + +bool SimpleBuffer::have_space_for(size_t bytes) +{ + if (direction == 1) + return (write_pos + bytes < end); + else + return (write_pos - bytes >= start); +} + +size_t SimpleBuffer::used_size() +{ + return (direction == 1)? write_pos - read_pos : read_pos - write_pos; +} + +uchar *SimpleBuffer::read(size_t bytes) +{ + DBUG_ASSERT(have_data(bytes)); + uchar *res; + if (direction == 1) + { + res= read_pos; + read_pos += bytes; + return res; + } + else + { + read_pos= read_pos - bytes; + return read_pos; + } +} + +bool SimpleBuffer::have_data(size_t bytes) +{ + return (direction == 1)? (write_pos - read_pos >= (ptrdiff_t)bytes) : + (read_pos - write_pos >= (ptrdiff_t)bytes); +} + +void SimpleBuffer::reset_for_writing() +{ + if (direction == 1) + write_pos= read_pos= start; + else + write_pos= read_pos= end; +} + +void SimpleBuffer::reset_for_reading() +{ +/* +Do we need this at all? + if (direction == 1) + pos= start; + else + pos= end; +//end? +*/ +} + +uchar *SimpleBuffer::end_of_space() +{ + if (direction == 1) + return start; + else + return end; +//TODO: check this. +} + /** DS-MRR: Initialize and start MRR scan @@ -309,7 +387,6 @@ void *seq_init_param, uint n_ranges, uint mode, HANDLER_BUFFER *buf) { - uint elem_size; Item *pushed_cond= NULL; handler *new_h2= 0; THD *thd= current_thd; @@ -330,69 +407,69 @@ } use_default_impl= FALSE; is_mrr_assoc= !test(mode & HA_MRR_NO_ASSOCIATION); - - // psergey2: split the buffer: + /* - psergey2-note: we can't split the buffer here because we don't know how key - length. we'll only be able to do it when we've got the first range. + Figure out what steps we'll need to do */ + do_sort_keys= FALSE; if ((mode & HA_MRR_SINGLE_POINT) && optimizer_flag(thd, OPTIMIZER_SWITCH_MRR_SORT_KEYS)) { - //do_sort_keys= TRUE; // will use key buffer to sort keys; + do_sort_keys= TRUE; use_key_pointers= test(mode & HA_MRR_MATERIALIZED_KEYS); } - /* + do_rowid_fetch= FALSE; - if (!doing_cpk_scan && !index_only_read) - { - do_rowid_fetch= TRUE; //will use rowid buffer to store/sort rowids, etc - } - - - if (do_sort_keys && do_rowid_fetch) - { - split buffer space proportionally - } - else - { - // give all space to one buffer - if (do_sort_keys) - { - //sort_buffer_start= ...; - } - else - { - DBUG_ASSERT(do_rowid_fetch); - //rowid_buffer_start= ...; - } - } + doing_cpk_scan= check_cpk_scan(h->active_index, mode); + if (!doing_cpk_scan /* && !index_only_read */) + { + /* Will use rowid buffer to store/sort rowids, etc */ + do_rowid_fetch= TRUE; + } + DBUG_ASSERT(do_sort_keys || do_rowid_fetch); + + full_buf= buf->buffer; + full_buf_end= buf->buffer_end; + + /* + At start, alloc all of the buffer for rowids. Key sorting code will grab a + piece if necessary. */ - mrr_buf= buf->buffer; - mrr_buf_end= buf->buffer_end; + rowid_buffer.set_buffer_space(full_buf, full_buf_end, 1); if (is_mrr_assoc) status_var_increment(table->in_use->status_var.ha_multi_range_read_init_count); - - if ((doing_cpk_scan= check_cpk_scan(h->active_index, mode))) + + /* + psergey2-todo: for CPK scans: + - use MRR irrespectively of @@mrr_sort_keys setting, + - dont do rowid retrieval. + */ + if (do_sort_keys) { /* It's a DS-MRR/CPK scan */ - cpk_tuple_length= 0; /* dummy value telling it needs to be inited */ - cpk_have_range= FALSE; + key_tuple_length= 0; /* dummy value telling it needs to be inited */ + key_buff_elem_size= 0; + in_index_range= FALSE; h->mrr_iter= seq_funcs->init(seq_init_param, n_ranges, mode); h->mrr_funcs= *seq_funcs; + keyno= h->active_index != MAX_KEY? h->active_index : h2->active_index; dsmrr_fill_key_buffer(); - if (dsmrr_eof) - buf->end_of_used_area= mrr_buf_last; - DBUG_RETURN(0); /* nothing could go wrong while filling the buffer */ - } - - /* In regular DS-MRR, buffer stores {rowid, range_id} pairs */ - elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*); - mrr_buf_last= mrr_buf + ((mrr_buf_end - mrr_buf)/ elem_size)* elem_size; - mrr_buf_end= mrr_buf_last; - + if (dsmrr_eof && !do_rowid_fetch) + buf->end_of_used_area= key_buffer.end_of_space(); + } + + if (!do_rowid_fetch) + { + /* + We have the keys and won't need to fetch rowids, as key lookup will be + the last operation, done in multi_range_read_next(). + */ + DBUG_RETURN(0); + } + + rowid_buff_elem_size= h->ref_length + (is_mrr_assoc? sizeof(char*) : 0); /* psergey2: this is only needed when - doing a rowid-to-row scan @@ -416,7 +493,7 @@ if (check_stack_overrun(thd, 5*STACK_MIN_SIZE, (uchar*) &new_h2)) DBUG_RETURN(1); DBUG_ASSERT(h->active_index != MAX_KEY); - uint mrr_keyno= h->active_index; + keyno= h->active_index; /* Create a separate handler object to do rnd_pos() calls. */ if (!(new_h2= h->clone(thd->mem_root)) || @@ -426,7 +503,7 @@ DBUG_RETURN(1); } - if (mrr_keyno == h->pushed_idx_cond_keyno) + if (keyno == h->pushed_idx_cond_keyno) pushed_cond= h->pushed_idx_cond; /* @@ -443,12 +520,14 @@ h2= new_h2; /* Ok, now can put it into h2 */ table->prepare_for_position(); h2->extra(HA_EXTRA_KEYREAD); - - if (h2->ha_index_init(mrr_keyno, FALSE)) + h2->mrr_funcs= *seq_funcs; //psergey3-todo: sort out where to store + h2->mrr_iter= h->mrr_iter; + + if (h2->ha_index_init(keyno, FALSE)) goto error; if (pushed_cond) - h2->idx_cond_push(mrr_keyno, pushed_cond); + h2->idx_cond_push(keyno, pushed_cond); } else { @@ -467,10 +546,15 @@ if (res) goto error; } + + if (!do_sort_keys && + h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges, + mode, buf)) + { + goto error; + } - if (h2->handler::multi_range_read_init(seq_funcs, seq_init_param, n_ranges, - mode, buf) || - dsmrr_fill_rowid_buffer()) + if (dsmrr_fill_rowid_buffer()) { goto error; } @@ -479,7 +563,7 @@ adjust *buf to indicate that the remaining buffer space will not be used. */ if (dsmrr_eof) - buf->end_of_used_area= mrr_buf_last; + buf->end_of_used_area= rowid_buffer.end_of_space(); /* h->inited == INDEX may occur when 'range checked for each record' is @@ -526,21 +610,24 @@ /** - DS-MRR: Fill the buffer with rowids and sort it by rowid + DS-MRR: Fill and sort the rowid buffer {This is an internal function of DiskSweep MRR implementation} + Scan the MRR ranges and collect ROWIDs (or {ROWID, range_id} pairs) into buffer. When the buffer is full or scan is completed, sort the buffer by rowid and return. The function assumes that rowids buffer is empty when it is invoked. + New2: + we will need to scan either + - the source sequence getting records + - use dsmrr_next_from_index.. + dsmrr_eof is set to indicate whether we've exhausted the list of ranges we're scanning. - psergey2: this func will 'fill the rowid buffer'. If filling the rowid buffer - requires that key buffer is filled/sorted first, will do that, too. - @param h Table handler @retval 0 OK, the next portion of rowids is in the buffer, @@ -554,40 +641,32 @@ int res; DBUG_ENTER("DsMrr_impl::dsmrr_fill_rowid_buffer"); - mrr_buf_cur= mrr_buf; - mrr_buf_next_identical= mrr_buf_cur; - /* - psergey2-todo: - - call here fill/sort key buffer, if needed. - - psergey2-todo: then, get keys either from - - multi_range_read_next() - - sorted key buffer + rowid_buffer.reset_for_writing(); + identical_rowid_ptr= NULL; + + while (rowid_buffer.have_space_for(rowid_buff_elem_size)) + { + if (do_sort_keys) + res= dsmrr_next_from_index(&range_info); + else + res= h2->handler::multi_range_read_next(&range_info); + + if (res) + break; - psergey2-todo: if we're traversing an ordered key sequence, - check if next keys are the same as previous. - (note that it's easy as ordered sequence allows forward/backward - navigation so we don't need to buffer things) - */ - while ((mrr_buf_cur < mrr_buf_end) && - !(res= h2->handler::multi_range_read_next(&range_info))) - { + KEY_MULTI_RANGE *curr_range= &h2->handler::mrr_cur_range; - if (h2->mrr_funcs.skip_index_tuple && + if (!do_sort_keys && /* If keys are sorted then this check is already done */ + h2->mrr_funcs.skip_index_tuple && h2->mrr_funcs.skip_index_tuple(h2->mrr_iter, curr_range->ptr)) continue; - /* Put rowid, or {rowid, range_id} pair into the buffer */ h2->position(table->record[0]); - memcpy(mrr_buf_cur, h2->ref, h2->ref_length); - mrr_buf_cur += h2->ref_length; + rowid_buffer.write(h2->ref, h2->ref_length); if (is_mrr_assoc) - { - memcpy(mrr_buf_cur, &range_info, sizeof(void*)); - mrr_buf_cur += sizeof(void*); - } + rowid_buffer.write((uchar*)&range_info, sizeof(void*)); } if (res && res != HA_ERR_END_OF_FILE) @@ -596,12 +675,11 @@ /* Sort the buffer contents by rowid */ uint elem_size= h->ref_length + (int)is_mrr_assoc * sizeof(void*); - uint n_rowids= (mrr_buf_cur - mrr_buf) / elem_size; + uint n_rowids= rowid_buffer.used_size() / elem_size; - my_qsort2(mrr_buf, n_rowids, elem_size, (qsort2_cmp)rowid_cmp, - (void*)h); - mrr_buf_last= mrr_buf_cur; - mrr_buf_cur= mrr_buf; + my_qsort2(rowid_buffer.used_area(), n_rowids, elem_size, + (qsort2_cmp)rowid_cmp, (void*)h); + DBUG_RETURN(0); } @@ -616,8 +694,8 @@ { DsMrr_impl *dsmrr= (DsMrr_impl*)arg; TABLE *table= dsmrr->h->table; - - KEY_PART_INFO *part= table->key_info[table->s->primary_key].key_part; + int res; + KEY_PART_INFO *part= table->key_info[dsmrr->keyno].key_part; if (dsmrr->use_key_pointers) { @@ -626,15 +704,31 @@ key2= *((uchar**)key2); // todo is this alignment-safe? } - uchar *key1_end= key1 + dsmrr->cpk_tuple_length; + uchar *key1_end= key1 + dsmrr->key_tuple_length; while (key1 < key1_end) { Field* f = part->field; int len = part->store_length; - int res = f->cmp(key1, key2); - if (res) + if (part->null_bit) + { + if (*key1) // key1 == NULL + { + if (!*key2) // key1(NULL) < key2(notNULL) + return -1; + goto equals; + } + else if (*key2) // key1(notNULL) > key2 (NULL) + return 1; + // Step over NULL byte for f->cmp(). + key1++; + key2++; + len--; + } + + if ((res= f->key_cmp(key1, key2))) return res; +equals: key1 += len; key2 += len; part++; @@ -644,75 +738,123 @@ /* + Setup key/rowid buffer sizes based on sample_key + + DESCRIPTION + Setup key/rowid buffer sizes based on sample_key and its length. + + This function must be called when all buffer space is empty. +*/ + +void DsMrr_impl::setup_buffer_sizes(key_range *sample_key) +{ + key_tuple_length= sample_key->length; + key_tuple_map= sample_key->keypart_map; + key_size_in_keybuf= use_key_pointers ? sizeof(char*) : + key_tuple_length; + key_buff_elem_size= key_size_in_keybuf + + (int)is_mrr_assoc * sizeof(void*); + + uint rowid_buf_elem_size= h->ref_length + + (int)is_mrr_assoc * sizeof(char*); + + KEY *key_info= &h->table->key_info[keyno]; + /* + Use rec_per_key statistics as a basis to find out how many rowids + we'll get for each key value. + TODO: are we guaranteed to get r_p_c==1 for unique keys? + TODO: what should be the default value to use when there is no + statistics? + */ + uint parts= my_count_bits(key_tuple_map); + ulong rpc; + if ((rpc= key_info->rec_per_key[parts - 1])) + { + rowid_buf_elem_size *= rpc; + } + + double fraction_for_rowids= + ((double) rowid_buf_elem_size / + ((double)rowid_buf_elem_size + key_buff_elem_size)); + + uint bytes_for_rowids= + round(fraction_for_rowids * (full_buf_end - full_buf)); + + uint bytes_for_keys= (full_buf_end - full_buf) - bytes_for_rowids; + + if (bytes_for_keys < key_buff_elem_size + 1) + { + uint add= key_buff_elem_size + 1 - bytes_for_keys; + bytes_for_rowids -= add; + DBUG_ASSERT(bytes_for_rowids >= + (h->ref_length + (int)is_mrr_assoc * sizeof(char*) + 1)); + } + + rowid_buffer.set_buffer_space(full_buf, full_buf + bytes_for_rowids, 1); + key_buffer.set_buffer_space(full_buf + bytes_for_rowids, full_buf_end, 1); + + index_ranges_unique= test(key_info->flags & HA_NOSAME && + key_info->key_parts == + my_count_bits(sample_key->keypart_map)); +} + + +/* DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort SYNOPSIS DsMrr_impl::dsmrr_fill_key_buffer() DESCRIPTION - DS-MRR/CPK: Fill the buffer with (lookup_tuple, range_id) pairs and sort + DS-MRR/CPK: Enumerate the input range (=key) sequence, fill the key buffer + (lookup_key, range_id) pairs and sort. dsmrr_eof is set to indicate whether we've exhausted the list of ranges we're scanning. - - psergey2-q: can this be used for filling/sorting key buffer in general case? - a: yes. - qq: can we push sequence iteration init down into here? */ void DsMrr_impl::dsmrr_fill_key_buffer() { - //psergey2: here, no identicals detection is necessary since we always scan - // the unordered sequence. int res; KEY_MULTI_RANGE cur_range; DBUG_ENTER("DsMrr_impl::dsmrr_fill_key_buffer"); - mrr_buf_cur= mrr_buf; - while ((mrr_buf_cur < mrr_buf_end) && + // reset the buffer for writing. + key_buffer.reset_for_writing(); + + while ((key_buffer.have_space_for(key_buff_elem_size)) && !(res= h->mrr_funcs.next(h->mrr_iter, &cur_range))) { DBUG_ASSERT(cur_range.range_flag & EQ_RANGE); - if (!cpk_tuple_length) + if (!key_tuple_length) { - cpk_tuple_length= cur_range.start_key.length; - key_buf_element_size= use_key_pointers ? sizeof(char*) : - cpk_tuple_length; - - cpk_is_unique_scan= test(table->key_info[h->active_index].key_parts == - my_count_bits(cur_range.start_key.keypart_map)); - uint elem_size= key_buf_element_size + (int)is_mrr_assoc * sizeof(void*); - mrr_buf_last= mrr_buf + ((mrr_buf_end - mrr_buf)/elem_size) * elem_size; - mrr_buf_end= mrr_buf_last; + /* This only happens when we've just started filling the buffer */ + DBUG_ASSERT(key_buffer.used_size() == 0); + setup_buffer_sizes(&cur_range.start_key); } - //psergey2: if keys are materialized, store pointers, not copy keys - /* Put key, or {key, range_id} pair into the buffer */ if (use_key_pointers) - memcpy(mrr_buf_cur, &cur_range.start_key.key, sizeof(char*)); + key_buffer.write((uchar*)&cur_range.start_key.key, sizeof(char*)); else - memcpy(mrr_buf_cur, cur_range.start_key.key, cpk_tuple_length); - - mrr_buf_cur += key_buf_element_size; - + key_buffer.write(cur_range.start_key.key, key_tuple_length); + if (is_mrr_assoc) - { - memcpy(mrr_buf_cur, &cur_range.ptr, sizeof(void*)); - mrr_buf_cur += sizeof(void*); - } + key_buffer.write((uchar*)&cur_range.ptr, sizeof(void*)); } dsmrr_eof= test(res); /* Sort the buffer contents by rowid */ - uint elem_size= key_buf_element_size + (int)is_mrr_assoc * sizeof(void*); - uint n_rowids= (mrr_buf_cur - mrr_buf) / elem_size; + uint key_elem_size= key_size_in_keybuf + (int)is_mrr_assoc * sizeof(void*); + uint n_keys= key_buffer.used_size() / key_elem_size; - my_qsort2(mrr_buf, n_rowids, elem_size, + my_qsort2(key_buffer.used_area(), n_keys, key_elem_size, (qsort2_cmp)DsMrr_impl::key_tuple_cmp, (void*)this); - mrr_buf_last= mrr_buf_cur; - mrr_buf_cur= mrr_buf; + + last_identical_key_ptr= NULL; + in_identical_keys_range= FALSE; + DBUG_VOID_RETURN; } @@ -721,60 +863,105 @@ DS-MRR/CPK: multi_range_read_next() function DESCRIPTION - DsMrr_impl::dsmrr_next_cpk() + DsMrr_impl::dsmrr_next_from_index() range_info OUT identifier of range that the returned record belongs to DESCRIPTION - DS-MRR/CPK: multi_range_read_next() function. - This is similar to DsMrr_impl::dsmrr_next(), the differences are that - - we get records with index_read(), not with rnd_pos() - - we may get multiple records for one key (=element of the buffer) - - unlike dsmrr_fill_rowid_buffer(), dsmrr_fill_key_buffer() never fails. - + + This function walks over key buffer and does index reads, i.e. it produces + {current_record, range_id} pairs. + + The function has the same call contract like multi_range_read_next()'s. + + We actually iterate nested sequences: + + - a disjoint sequence of index ranges + - each range has multiple records + - each record goes into multiple identical ranges. + RETURN 0 OK, next record was successfully read HA_ERR_END_OF_FILE End of records Other Some other error - - psergey2-todo: this should detect identical keys. */ -int DsMrr_impl::dsmrr_next_cpk(char **range_info) +int DsMrr_impl::dsmrr_next_from_index(char **range_info_arg) { int res; - - while (cpk_have_range) - { - - if (h->mrr_funcs.skip_record && - h->mrr_funcs.skip_record(h->mrr_iter, cpk_saved_range_info, NULL)) - { - cpk_have_range= FALSE; + uchar *key_in_buf; + handler *file= do_rowid_fetch? h2: h; + + while (in_identical_keys_range) + { +//read_and_check: + /* Read record/key pointer from the buffer */ + key_in_buf= identical_key_it.get_next(key_size_in_keybuf); + if (is_mrr_assoc) + cur_range_info= (char*)identical_key_it.get_next(sizeof(void*)); + + if (key_in_buf == last_identical_key_ptr) + { + /* We're looking at the last of the identical keys */ + in_identical_keys_range= FALSE; + } +check_record: + if ((h->mrr_funcs.skip_index_tuple && + h->mrr_funcs.skip_index_tuple(h->mrr_iter, *(char**)cur_range_info)) || + (h->mrr_funcs.skip_record && + h->mrr_funcs.skip_record(h->mrr_iter, *(char**)cur_range_info, NULL))) + { + continue; + } + memcpy(range_info_arg, cur_range_info, sizeof(void*)); + + return 0; + } + + /* Try returrning next record from the current range */ + while (in_index_range) + { + res= file->ha_index_next_same(table->record[0], cur_index_tuple, + key_tuple_length); + + if (res) + { + if (res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND) + return res; /* Fatal error */ + + in_index_range= FALSE; /* no more records here */ break; } - - uchar *lookup_tuple= use_key_pointers? (*((uchar**)mrr_buf_cur)) : mrr_buf_cur; - res= h->index_next_same(table->record[0], lookup_tuple, cpk_tuple_length); - - if (h->mrr_funcs.skip_index_tuple && - h->mrr_funcs.skip_index_tuple(h->mrr_iter, cpk_saved_range_info)) - continue; - - if (res != HA_ERR_END_OF_FILE) - { + + if (last_identical_key_ptr) + { + in_identical_keys_range= TRUE; + identical_key_it.init(&key_buffer); + cur_range_info= first_identical_range_info; + } + + goto check_record; + // goto read_and_check; + } + + while(1) + { + DBUG_ASSERT(!in_identical_keys_range && !in_index_range); + + /* Jump over the keys that were handled by identical key processing */ + if (last_identical_key_ptr) + { + while (key_buffer.read(key_size_in_keybuf) != last_identical_key_ptr) + { + if (is_mrr_assoc) + key_buffer.read(sizeof(void*)); + } if (is_mrr_assoc) - memcpy(range_info, &cpk_saved_range_info, sizeof(void*)); - return res; + key_buffer.read(sizeof(void*)); + last_identical_key_ptr= NULL; } - /* No more records in this range. Exit this loop and go get another range */ - cpk_have_range= FALSE; - } - - do - { /* First, make sure we have a range at start of the buffer */ - if (mrr_buf_cur == mrr_buf_last) + if (!key_buffer.have_data(key_buff_elem_size)) { if (dsmrr_eof) { @@ -782,58 +969,56 @@ goto end; } dsmrr_fill_key_buffer(); - } - if (mrr_buf_cur == mrr_buf_last) - { - res= HA_ERR_END_OF_FILE; - goto end; + if (!key_buffer.have_data(key_buff_elem_size)) + { + res= HA_ERR_END_OF_FILE; + goto end; + } } - /* Ok, got the range. Try making a lookup. */ - uchar *lookup_tuple= use_key_pointers? (*((uchar**)mrr_buf_cur)) : mrr_buf_cur; - mrr_buf_cur += key_buf_element_size; + /* Get the next range to scan*/ + cur_index_tuple= key_in_buf= key_buffer.read(key_size_in_keybuf); + if (use_key_pointers) + cur_index_tuple= *((uchar**)cur_index_tuple); + if (is_mrr_assoc) - { - memcpy(&cpk_saved_range_info, mrr_buf_cur, sizeof(void*)); - mrr_buf_cur += sizeof(void*) * test(is_mrr_assoc); - } + cur_range_info= (char*)key_buffer.read(sizeof(void*)); - if (h->mrr_funcs.skip_record && - h->mrr_funcs.skip_record(h->mrr_iter, cpk_saved_range_info, NULL)) - continue; - - res= h->index_read(table->record[0], lookup_tuple, cpk_tuple_length, - HA_READ_KEY_EXACT); - - /* - Check pushed index condition. Performance-wise, it does not make any - sense to put this call here (the above call has already accessed the full - record). That's the best I could do, though, because: - - ha_innobase doesn't support IndexConditionPushdown on clustered PK - - MRR interface doesn't allow the storage engine to refuse a pushed index - condition. - Having this call here is not fully harmless: EXPLAIN shows "pushed index - condition", which is technically true but doesn't bring the benefits that - one might expect. - */ - if (h->mrr_funcs.skip_index_tuple && - h->mrr_funcs.skip_index_tuple(h->mrr_iter, cpk_saved_range_info)) - continue; - - if (res && res != HA_ERR_END_OF_FILE) - goto end; - - if (!res) - { - memcpy(range_info, &cpk_saved_range_info, sizeof(void*)); - /* - Attempt reading more rows from this range only if there actually can - be multiple matches: - */ - cpk_have_range= !cpk_is_unique_scan; - break; - } - } while (true); + /* Do index lookup */ + if ((res= file->ha_index_read_map(table->record[0], cur_index_tuple, + key_tuple_map, HA_READ_KEY_EXACT))) + { + if (res != HA_ERR_END_OF_FILE && res != HA_ERR_KEY_NOT_FOUND) + return res; + continue; /* to next key and make another lookup */ + } + + /* Check if subsequent keys in the key buffer are the same as this one */ + { + uchar *ptr; + identical_key_it.init(&key_buffer); + last_identical_key_ptr= NULL; + while ((ptr= identical_key_it.get_next(key_size_in_keybuf))) + { + if (is_mrr_assoc) + identical_key_it.get_next(sizeof(void*)); + + if (key_tuple_cmp(this, key_in_buf, ptr)) + break; + + last_identical_key_ptr= ptr; + } + if (last_identical_key_ptr) + { + in_identical_keys_range= TRUE; + identical_key_it.init(&key_buffer); + first_identical_range_info= cur_range_info; + } + } + + in_index_range= !index_ranges_unique; + goto check_record; + } end: return res; @@ -842,9 +1027,6 @@ /** DS-MRR implementation: multi_range_read_next() function - - psergey2-todo: put identical rowid detection code here - it should always work because rowid sequences are always sorted */ int DsMrr_impl::dsmrr_next(char **range_info) @@ -852,89 +1034,108 @@ int res; uchar *cur_range_info= 0; uchar *rowid; + uchar *range_id; if (use_default_impl) return h->handler::multi_range_read_next(range_info); - if (doing_cpk_scan) - return dsmrr_next_cpk(range_info); + if (!do_rowid_fetch) + return dsmrr_next_from_index(range_info); - if (mrr_buf_next_identical != mrr_buf_cur) + while (identical_rowid_ptr) { /* - There are multiple rowids. Return the record again, now with different - range_id + Current record (the one we've returned in previous call) was obtained + from a rowid that matched multiple range_ids. Return this record again, + with next matching range_id. */ - do - { - if (is_mrr_assoc) - memcpy(range_info, mrr_buf_next_identical + h->ref_length, sizeof(uchar*)); - } while (!h2->mrr_funcs.skip_record || - !h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) range_info, rowid)); - - mrr_buf_next_identical += h->ref_length + sizeof(void*) * test(is_mrr_assoc); - return 0; + rowid= rowid_buffer.read(h->ref_length); + if (is_mrr_assoc) + { + uchar *range_ptr= rowid_buffer.read(sizeof(uchar*)); + memcpy(range_info, range_ptr, sizeof(uchar*)); + } + + if (rowid == identical_rowid_ptr) + { + identical_rowid_ptr= NULL; /* reached the last of identical rowids */ + } + + if (!h2->mrr_funcs.skip_record || + !h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) *range_info, rowid)) + { + return 0; + } } - do + while (1) { - if (mrr_buf_cur == mrr_buf_last) + if (!rowid_buffer.have_data(1)) { if (dsmrr_eof) - { - res= HA_ERR_END_OF_FILE; - goto end; - } - res= dsmrr_fill_rowid_buffer(); - if (res) - goto end; + return HA_ERR_END_OF_FILE; + + if (do_sort_keys && key_buffer.used_size() == 0) + dsmrr_fill_key_buffer(); + + if ((res= dsmrr_fill_rowid_buffer())) + return res; } - /* return eof if there are no rowids in the buffer after re-fill attempt */ - if (mrr_buf_cur == mrr_buf_last) + /* Return eof if there are no rowids in the buffer after re-fill attempt */ + if (!rowid_buffer.have_data(1)) + return HA_ERR_END_OF_FILE; + + rowid= rowid_buffer.read(h->ref_length); + identical_rowid_ptr= NULL; + + if (is_mrr_assoc) { - res= HA_ERR_END_OF_FILE; - goto end; + range_id= rowid_buffer.read(sizeof(uchar*)); + memcpy(&cur_range_info, range_id, sizeof(uchar*)); + memcpy(range_info, range_id, sizeof(uchar*)); } - rowid= mrr_buf_cur; - - if (is_mrr_assoc) - memcpy(&cur_range_info, mrr_buf_cur + h->ref_length, sizeof(uchar**)); - size_t element_size= h->ref_length + sizeof(void*) * test(is_mrr_assoc); - mrr_buf_cur += element_size; - mrr_buf_next_identical= mrr_buf_cur; - + //psergey2-note: the below isn't right- we won't want to skip over this + // rowid because this (rowid, range_id) pair has nothing.. the next + // identical rowids might have something.. (but we set identicals later, + // dont we?) if (h2->mrr_funcs.skip_record && h2->mrr_funcs.skip_record(h2->mrr_iter, (char *) cur_range_info, rowid)) continue; + res= h->ha_rnd_pos(table->record[0], rowid); if (res == HA_ERR_RECORD_DELETED) continue; - if (0)//(!res) + /* + Check if subsequent buffer elements have the same rowid value as this + one. If yes, remember this fact so that we don't make any more rnd_pos() + calls with this value. + */ + if (!res) { /* Note: this implies that SQL layer doesn't touch table->record[0] between calls. */ - uchar *current_el= mrr_buf_cur - element_size; - while (mrr_buf_cur != mrr_buf_last && - !h2->cmp_ref(current_el, mrr_buf_cur)) + uchar *ptr; + SimpleBuffer::PeekIterator identical_rowid_it; + identical_rowid_it.init(&rowid_buffer); + while ((ptr= identical_rowid_it.get_next(h->ref_length))) { - mrr_buf_cur += element_size; + if (is_mrr_assoc) + identical_rowid_it.get_next(sizeof(void*)); + + if (h2->cmp_ref(rowid, ptr)) + break; + identical_rowid_ptr= ptr; } } - break; - - } while (true); - - if (is_mrr_assoc) - { - memcpy(range_info, rowid + h->ref_length, sizeof(void*)); + return 0; } -end: + return res; } === modified file 'sql/multi_range_read.h' --- a/sql/multi_range_read.h 2010-07-17 21:05:44 +0000 +++ b/sql/multi_range_read.h 2010-08-08 07:13:54 +0000 @@ -38,6 +38,96 @@ /* + A simple memory buffer for reading and writing. + + when writing, there is no user-visible "current" position, although + internally 'pos' points to just after the end of used area (or at the + start of it for reverse buffer). + + When reading, there is current position pointing at start (for reverse + buffer, end) of the element that will be read next. + ^^ why end for reverse? it's more logical to point at start + + One can peek at what's behind that element by using advance_ptr function. + + TODO: will the reverse buffer store {tuple; rowid} or {rowid; tuple} pairs? + (why does it matter??? Read and write in the same order and then it + shouldn't matter.) +*/ + +class SimpleBuffer +{ + uchar *start; + uchar *end; + uchar *read_pos; + uchar *write_pos; + + /* + 1 <=> buffer grows/is filled/is read from start to end + -1 <=> everthing is done from end to start instead. + */ + int direction; +public: + /* Write-mode functions */ + void reset_for_writing(); + void write(const uchar *data, size_t bytes); + bool have_space_for(size_t bytes); + + uchar *used_area() { return (direction == 1)? read_pos : write_pos; } + size_t used_size(); + + /* Read-mode functions */ + void reset_for_reading(); + + uchar *read(size_t bytes); + bool have_data(size_t bytes); + uchar *end_of_space(); + + /* Control functions */ + void set_buffer_space(uchar *start_arg, uchar *end_arg, int direction_arg) + { + start= start_arg; + end= end_arg; + direction= direction_arg; + reset_for_writing(); + } + + friend class PeekIterator; + class PeekIterator + { + // if direction==1 : pointer to what to return next + // if direction==-1: pointer to the end of what is to be returned next + uchar *pos; + SimpleBuffer *sb; + public: + void init(SimpleBuffer *sb_arg) + { + sb= sb_arg; + pos= sb->read_pos; + } + /* Return pointer to next chunk of nbytes bytes and avance over it */ + uchar *get_next(size_t nbytes) + { + if (sb->direction == 1) + { + if (pos + nbytes > sb->write_pos) + return NULL; + uchar *res= pos; + pos += nbytes; + return res; + } + else + { + if (pos - nbytes <= sb->write_pos) + return NULL; + pos -= nbytes; + return pos; + } + } + }; +}; + +/* DS-MRR implementation for one table. Create/use one object of this class for each ha_{myisam/innobase/etc} object. That object will be further referred to as "the handler" @@ -73,6 +163,8 @@ scanning. */ + + class DsMrr_impl { public: @@ -108,14 +200,27 @@ /* Secondary handler object. It is used for scanning the index */ handler *h2; + uchar *full_buf; + uchar *full_buf_end; + /* Buffer to store rowids, or (rowid, range_id) pairs */ - uchar *mrr_buf; - uchar *mrr_buf_cur; /* Current position when reading/writing */ - uchar *mrr_buf_last; /* When reading: end of used buffer space */ - uchar *mrr_buf_end; /* End of the buffer */ - - uchar *mrr_buf_next_identical; + SimpleBuffer rowid_buffer; + + uchar *identical_rowid_ptr; + + /* Identical keys */ + bool in_identical_keys_range; + uchar *last_identical_key_ptr; + SimpleBuffer::PeekIterator identical_key_it; + + SimpleBuffer key_buffer; + + uint keyno; + + /* Execution control */ + bool do_sort_keys; bool use_key_pointers; + bool do_rowid_fetch; bool dsmrr_eof; /* TRUE <=> We have reached EOF when reading index tuples */ @@ -129,18 +234,33 @@ /** DS-MRR/CPK variables start */ /* Length of lookup tuple being used, in bytes */ - uint cpk_tuple_length; + uint key_tuple_length; + key_part_map key_tuple_map; + /* + This is + = key_tuple_length if we copy keys to buffer + = sizeof(void*) if we're using pointers to materialized keys. + */ + uint key_size_in_keybuf; + + /* = key_size_in_keybuf [ + sizeof(range_assoc_info) ] */ + uint key_buff_elem_size; + + /* = h->ref_length [ + sizeof(range_assoc_info) ] */ + uint rowid_buff_elem_size; - uint key_buf_element_size; /* TRUE <=> We're scanning on a full primary key (and not on prefix), and so can get max. one match for each key */ - bool cpk_is_unique_scan; + bool index_ranges_unique; /* TRUE<=> we're in a middle of enumerating records from a range */ - bool cpk_have_range; - /* Valid if cpk_have_range==TRUE: range_id of the range we're enumerating */ - char *cpk_saved_range_info; + bool in_index_range; + uchar *cur_index_tuple; + /* if in_index_range==TRUE: range_id of the range we're enumerating */ + char *cur_range_info; + + char *first_identical_range_info; bool choose_mrr_impl(uint keyno, ha_rows rows, uint *flags, uint *bufsz, COST_VECT *cost); @@ -150,6 +270,11 @@ static int key_tuple_cmp(void* arg, uchar* key1, uchar* key2); int dsmrr_fill_rowid_buffer(); void dsmrr_fill_key_buffer(); - int dsmrr_next_cpk(char **range_info); + int dsmrr_next_from_index(char **range_info); + + void setup_buffer_sizes(key_range *sample_key); + + static range_seq_t key_buf_seq_init(void *init_param, uint n_ranges, uint flags); + static uint key_buf_seq_next(range_seq_t rseq, KEY_MULTI_RANGE *range); };