Re: [Maria-developers] f3dda8e: MDEV-15357 Wrong result with join_cache_level=4, BNLH join

15 Jun 2018

Hi, Igor!

On Apr 30, Igor Babaev wrote:
...
revision-id: f3dda8e2002dc2ad630c07bda0d8c00be7269907 (mariadb-10.2.14-72-gf3dda8e)
parent(s): b4c5e4a717e3ce2c2d434106cc74417fe9a1d3dc
author: Igor Babaev
committer: Igor Babaev
timestamp: 2018-04-30 21:26:38 -0700
message:
MDEV-15357 Wrong result with join_cache_level=4, BNLH join
This bug was introduced by the architectural changes of the patch
for MDEV-11640. The patch moved initialization of join caches after
the call of make_join_readinfo(). As any failure to initialize
a join cache caused denial of its usage the execution code for
the query had to be revised. This revision required rolling back many
actions taken by make_join_readinfo(). It was not done in the patch.
As a result if a denial of join cache happened some join conditions
could be lost. This was exactly the cause of wrong results in the
bug's reported test case.
Thus the introduced architectural change is not valid and it would be
better to roll it back. At the same time two new methods
adjust_read_set_for_vcol_keyread() and get_covering_index_for_scan()
were added to the class JOIN_TAB to resolve the problems of MDEV-11640.

diff --git a/sql/sql_join_cache.cc b/sql/sql_join_cache.cc
index 3612cb6..242cb91 100644
--- a/sql/sql_join_cache.cc
+++ b/sql/sql_join_cache.cc
@@ -226,6 +226,16 @@ void JOIN_CACHE::calc_record_fields()
     flag_fields+= MY_TEST(tab->table->maybe_null);
     fields+= tab->used_fields;
     blobs+= tab->used_blobs;
+    if (tab->type == JT_ALL || tab->type == JT_HASH)
+    {
+      uint idx= tab->get_covering_index_for_scan();
as you moved JOIN_CACHE initialization back, I suspect that
this calc_record_fields() now happens too early for
get_covering_index_for_scan() to be useful.

And why only JT_ALL and JT_HASH?
...
+      if (idx != MAX_KEY)
+      {
+        fields-= bitmap_bits_set(tab->table->read_set);
+        tab->adjust_read_set_for_vcol_keyread(idx);
+        fields+= bitmap_bits_set(tab->table->read_set);
+      }
+    }
   }
   if ((with_match_flag= join_tab->use_match_flag()))
     flag_fields++;
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index 30ef26f..e5b75ec 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -11404,6 +11375,61 @@ void JOIN_TAB::remove_redundant_bnl_scan_conds()
 }
+uint JOIN_TAB::get_covering_index_for_scan()
+{
+  int idx= MAX_KEY;
+  if (table->no_keyread)
+    return idx;
+  if (select && select->quick &&
+      select->quick->index != MAX_KEY && //not index_merge
+      table->covering_keys.is_set(select->quick->index))
+    idx= select->quick->index;
+  else if (!table->covering_keys.is_clear_all() &&
+          !(select && select->quick))
+  {                                    // Only read index tree
+    if (loosescan_match_tab)
+      idx= loosescan_key;
+    else
+    {
+#ifdef BAD_OPTIMIZATION
+      /*
+        It has turned out that the below change, while speeding things
+        up for disk-bound loads, slows them down for cases when the data
+        is in disk cache (see BUG#35850):
+        See bug #26447: "Using the clustered index for a table scan
+        is always faster than using a secondary index".
+      */
+      if (table->s->primary_key != MAX_KEY &&
+          table->file->primary_key_is_clustered())
+        idx= table->s->primary_key;
+      else
+#endif
+        idx= find_shortest_key(table, & table->covering_keys);
+    }
+  }
+  return idx;
+}
+
+
+void JOIN_TAB::adjust_read_set_for_vcol_keyread(uint keyread_idx)
+{
+  if (!table->vcol_set || bitmap_is_clear_all(table->vcol_set))
+    return;
+
+  MY_BITMAP *keyread_set= &table->cond_set;    // is free for use at call
normally one uses tmp_set for this, not hijacks cond_set
...
+  bitmap_clear_all(keyread_set);
+  table->mark_columns_used_by_index(keyread_idx, keyread_set);
+  bitmap_intersect(keyread_set, table->read_set);
+  bitmap_intersect(keyread_set, table->vcol_set);
this looks wrong, you probably should do

  keyread_set INTERSECT (read_set UNION vcol_set)
...
+  if (!bitmap_is_clear_all(keyread_set))
+  {
+    bitmap_clear_all(keyread_set);
+    table->mark_columns_used_by_index(keyread_idx, keyread_set);
+    bitmap_intersect(table->read_set, keyread_set);
+  }
I don't understand that.
1. What if bitmap_is_clear_all(keyread_set) - what does it mean?
2. Why do you recalculate keyread_set, why not simply copy keyread_set
   into read_set ?
...
+}
+
+
 /*
   Plan refinement stage: do various setup things for the executor
@@ -11526,7 +11552,10 @@ make_join_readinfo(JOIN *join, ulonglong options, uint no_jbuf_after)
       tab->read_first_record= tab->type == JT_SYSTEM ? join_read_system
                                                      : join_read_const;
       if (table->covering_keys.is_set(tab->ref.key) && !table->no_keyread)
+      {
+        tab->adjust_read_set_for_vcol_keyread(tab->ref.key);
         table->file->ha_start_keyread(tab->ref.key);
better combine both into

    inline void JOIN_TAB::start_keyread()
    {
      adjust_read_set_for_vcol_keyread(ref.key);
      tab->table->file->ha_start_keyread(ref.key);
    }

or even JOIN_TAB::start_keyread_if_needed() putting the if() inside too
...
+      }
       else if ((!jcl || jcl > 4) && !tab->ref.is_access_triggered())
         push_index_cond(tab, tab->ref.key);
       break;
@@ -11606,35 +11641,19 @@ make_join_readinfo(JOIN *join, ulonglong options, uint no_jbuf_after)
            }
          }
        }
-       if (!table->no_keyread)
+       uint idx= tab->get_covering_index_for_scan();
+       if (idx != MAX_KEY)
        {
          if (tab->select && tab->select->quick &&
               tab->select->quick->index != MAX_KEY && //not index_merge
              table->covering_keys.is_set(tab->select->quick->index))
-            table->file->ha_start_keyread(tab->select->quick->index);
+            table->file->ha_start_keyread(idx);
why do you need an if() here - you already checked that in
get_covering_index_for_scan().

I generally don't like this code at all. In JOIN_CACHE you simply
use tab->get_covering_index_for_scan(), but here you use it
*plus* a bunch of if()s afterwards. It surely creates an impression
that JOIN_CACHE and make_join_readinfo may have a different idea
about whether keyread is used.

Ideally here should be just

      if (idx != MAX_KEY)
        tab->start_keyread(idx);

and even then it would be kinda questionable (because preconditions
might change between two get_covering_index_for_scan() calls).
...
else if (!table->covering_keys.is_clear_all() &&
                   !(tab->select && tab->select->quick))
-         {                                     // Only read index tree
-            if (tab->loosescan_match_tab)
-              tab->index= tab->loosescan_key;
-            else 
-            {
-#ifdef BAD_OPTIMIZATION
-              /*
-                It has turned out that the below change, while speeding things
-                up for disk-bound loads, slows them down for cases when the data
-                is in disk cache (see BUG#35850):
-                See bug #26447: "Using the clustered index for a table scan
-                is always faster than using a secondary index".
-              */
-              if (table->s->primary_key != MAX_KEY &&
-                  table->file->primary_key_is_clustered())
-                tab->index= table->s->primary_key;
-              else
-#endif
-                tab->index=find_shortest_key(table, & table->covering_keys);
-            }
+         {                                     // Only read index tree
+            tab->index= idx;
            tab->read_first_record= join_read_first;
+            table->file->ha_start_keyread(tab->index);
             /* Read with index_first / index_next */
            tab->type= tab->type == JT_ALL ? JT_NEXT : JT_HASH_NEXT;            
          }
Regards,
Sergei
Chief Architect MariaDB
and security@mariadb.org

    

Re: [Maria-developers] f3dda8e: MDEV-15357 Wrong result with join_cache_level=4, BNLH join

Sergei Golubchik