Hi serg,

I have implemented your suggestions. In the test case I created a table with 25 rows. Explain for 'select * from t1 where a > 19' gave 'ALL' while explain for 'select * from t1 where a > 20' gives range.
I have also written public methods ha_scan_time() and ha_read_time(). I should replace every occurrence of handler::scan_time() with ha_scan_time(), right? How do I make sure that I don't miss any place?
Also regarding making everything in the class Cost_factors static vs creating an object and using it everywhere: cann't we use a namespace Cost_factors? How is it done usually? I don't see much usage of namespaces in the code.
I'll send the updated patch as soon as these things are sorted out.

Regards
Anshu


On Mon, Jun 9, 2014 at 11:25 PM, Sergei Golubchik <serg@mariadb.org> wrote:
Hi, Anshu!

On Jun 08, Anshu Avinash wrote:
> Hi all,
>
> As per serg comments on the previous commit, I have modified the code to
> add an init function which will initialize all the cost factors at the
> start of the server. The latest code is at
> https://github.com/igniting/server/commits/selfTuningOptimizer. I have also
> attached the diff here.

Much better! Good work!
See my comments below.

> diff --git a/mysql-test/t/costmodel.test b/mysql-test/t/costmodel.test
> --- /dev/null
> +++ b/mysql-test/t/costmodel.test
> @@ -0,0 +1,37 @@
> +--disable_warnings
> +DROP DATABASE IF EXISTS world;
> +--enable_warnings
> +
> +CREATE DATABASE world;
> +use world;
> +
> +--source include/world_schema.inc
> +--disable_query_log
> +--disable_result_log
> +--disable_warnings
> +--source include/world.inc
> +--enable_query_log
> +--enable_result_log
> +--enable_warnings
> +
> +EXPLAIN
> +  SELECT c.name, ci.name FROM Country c, City ci
> +    WHERE c.capital=ci.id;
> +
> +use mysql;
> +
> +UPDATE optimizer_cost_factors
> +SET const_value=1000.0
> +WHERE const_name='READ_TIME_FACTOR';
> +
> +--enable_reconnect
> +--exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
> +--source include/wait_until_connected_again.inc
> +
> +use world;
> +
> +EXPLAIN
> +  SELECT c.name, ci.name FROM Country c, City ci
> +    WHERE c.capital=ci.id;
> +
> +DROP DATABASE world;

This is a very correctly written test file. Good!

But just to test whether READ_TIME_FACTOR and SCAN_TIME_FACTOR work,
I'd suggest to use a simpler test, not a world database, not a join,
but as I described in another email - one table, minimal structure,
one simple SELECT.

> diff --git a/scripts/mysql_system_tables.sql b/scripts/mysql_system_tables.sql
> --- a/scripts/mysql_system_tables.sql
> +++ b/scripts/mysql_system_tables.sql
> @@ -229,3 +229,10 @@ CREATE TABLE IF NOT EXISTS index_stats (db_name varchar(64) NOT NULL, table_name
>  -- we avoid mixed-engine transactions.
>  set storage_engine=@orig_storage_engine;
>  CREATE TABLE IF NOT EXISTS gtid_slave_pos (domain_id INT UNSIGNED NOT NULL, sub_id BIGINT UNSIGNED NOT NULL, server_id INT UNSIGNED NOT NULL, seq_no BIGINT UNSIGNED NOT NULL, PRIMARY KEY (domain_id, sub_id)) comment='Replication slave GTID position';
> +
> +-- Tables for Self Tuning Cost Optimizer
> +
> +CREATE TABLE IF NOT EXISTS optimizer_cost_factors (const_name varchar(64) NOT NULL, const_value double NOT NULL, PRIMARY KEY (const_name) ) ENGINE=MyISAM CHARACTER SET utf8 COLLATE utf8_bin comment='Constants for optimizer';

please, make it latin1, not utf8. We don't need to support utf8 names for cost factors :)

> +
> +-- Remember for later if all_constants table already existed
> +set @had_optimizer_cost_factors_table= @@warning_count != 0;
> diff --git a/scripts/mysql_system_tables_data.sql b/scripts/mysql_system_tables_data.sql
> --- a/scripts/mysql_system_tables_data.sql
> +++ b/scripts/mysql_system_tables_data.sql
> @@ -53,3 +53,9 @@ INSERT INTO tmp_proxies_priv VALUES ('localhost', 'root', '', '', TRUE, '', now(
>  REPLACE INTO tmp_proxies_priv SELECT @current_hostname, 'root', '', '', TRUE, '', now() FROM DUAL WHERE @current_hostname != 'localhost';
>  INSERT INTO  proxies_priv SELECT * FROM tmp_proxies_priv WHERE @had_proxies_priv_table=0;
>  DROP TABLE tmp_proxies_priv;
> +
> +CREATE TEMPORARY TABLE tmp_optimizer_cost_factors LIKE optimizer_cost_factors;
> +INSERT INTO tmp_optimizer_cost_factors VALUES ('READ_TIME_FACTOR', 1.0);
> +INSERT INTO tmp_optimizer_cost_factors VALUES ('SCAN_TIME_FACTOR', 1.0);
> +INSERT INTO optimizer_cost_factors SELECT * FROM tmp_optimizer_cost_factors WHERE @had_optimizer_cost_factors_table=0;
> +DROP TABLE tmp_optimizer_cost_factors;

Eh. I'd suggest to not use a temporary table and not use WHERE @had_optimizer_cost_factors_table
but simply

  INSERT INTO optimizer_cost_factors VALUES ('READ_TIME_FACTOR', 1.0);
  INSERT INTO optimizer_cost_factors VALUES ('SCAN_TIME_FACTOR', 1.0);

the difference is - if optimizer_cost_factors table already exists, your
code will insert *nothing at all*. But the one I suggest will insert
missing constants while keeping existing values intact. For example,
if the table optimizer_cost_factors exists, and has only one row
{ READ_TIME_FACTOR, 10.0 }. Then after your script it will still have
this one row, while after my suggested solution it'll be

  | READ_TIME_FACTOR | 10.0 |
  | SCAN_TIME_FACTOR | 1.0  |

which, I think, is better. If you'd like you can even use INSERT IGNORE to
ignore duplicate key errors. But they'll be ignored by the client anyway, so
it doesn't matter much in this case.

> diff --git a/sql/handler.h b/sql/handler.h
> --- a/sql/handler.h
> +++ b/sql/handler.h
> @@ -2741,7 +2742,8 @@ public:
>      reset_statistics();
>    }
>    virtual double scan_time()
> -  { return ulonglong2double(stats.data_file_length) / IO_SIZE + 2; }
> +  { return Cost_factors::get_scan_time_factor() *
> +            (ulonglong2double(stats.data_file_length) / IO_SIZE + 2); }

No, this is wrong. You use the scan factor *in the virtual method*,
which means that every storage engine that overwrite handler::scan_time()
will not use Cost_factors::get_scan_time_factor().

The correct solution - see ha_index_init, ha_rnd_next, etc. You make
handler::scan_time() a *protected* method. And create a public non-virtual
method handler::ha_scan_time(). Like this

double ha_scan_time()
{ return Cost_factors::scan_factor() * scan_time(); }

(above I've also renamed the get_scan_time_factor() to be a bit shorter)

>    /**
>       The cost of reading a set of ranges from the table using an index
> @@ -2755,7 +2757,7 @@ public:
>       using an index by calling it using read_time(index, 1, table_size).
>    */
>    virtual double read_time(uint index, uint ranges, ha_rows rows)
> -  { return rows2double(ranges+rows); }
> +  { return Cost_factors::get_read_time_factor() * rows2double(ranges+rows); }

same here. While you have your cost factors in the virtual method,
they won't have any effect on the optimizer, you should be able to
see it in the test case.

>
>    /**
>      Calculate cost of 'keyread' scan for given index and number of records.
> diff --git a/sql/opt_costmodel.h b/sql/opt_costmodel.h
> --- /dev/null
> +++ b/sql/opt_costmodel.h
> @@ -0,0 +1,25 @@
> +/* Interface to get constants */
> +
> +#ifndef SQL_OPT_COSTMODEL_INCLUDED
> +#define SQL_OPT_COSTMODEL_INCLUDED
> +
> +class Cost_factors
> +{
> +private:
> +  static bool isInitialized;

please, try to stick to naming conventions that we have in MariaDB.
as you might've noticed, we don't use camelCase.

> +  static double read_time_factor;
> +  static double scan_time_factor;
> +
> +public:
> +  static void init();
> +  static inline double get_read_time_factor()
> +  {
> +    return read_time_factor;
> +  }
> +  static inline double get_scan_time_factor()
> +  {
> +    return scan_time_factor;
> +  }
> +};

Ok, so you've made everything static and never instantiated a single instance
of the Cost_factors class. I see. What benefits does this have as compared to
making nothing inside the class static, but creating one static Cost_factors
object?

> +
> +#endif /* SQL_OPT_COSTMODEL_INCLUDED */
> diff --git a/sql/opt_costmodel.cc b/sql/opt_costmodel.cc
> --- /dev/null
> +++ b/sql/opt_costmodel.cc
> @@ -0,0 +1,103 @@
> +#include "sql_base.h"
> +#include "key.h"
> +#include "records.h"
> +#include "opt_costmodel.h"
> +
> +/* Name of database to which the optimizer_cost_factors table belongs */
> +static const LEX_STRING db_name= { C_STRING_WITH_LEN("mysql") };
> +
> +/* Name of all_constants table */
> +static const LEX_STRING table_name = { C_STRING_WITH_LEN("optimizer_cost_factors") };
> +
> +/* Columns in the optimizer_cost_factors_table */
> +enum cost_factors_col
> +{
> +  COST_FACTORS_CONST_NAME,
> +  COST_FACTORS_CONST_VALUE
> +};
> +
> +/* Name of the constants present in table */
> +static const LEX_STRING read_time_factor_name = { C_STRING_WITH_LEN("READ_TIME_FACTOR") };
> +static const LEX_STRING scan_time_factor_name = { C_STRING_WITH_LEN("SCAN_TIME_FACTOR") };
> +
> +/* Helper functions for Cost_factors::init() */
> +
> +static
> +inline int open_table(THD *thd, TABLE_LIST *table,
> +                      Open_tables_backup *backup,
> +                      bool for_write)
> +{
> +  enum thr_lock_type lock_type_arg= for_write? TL_WRITE: TL_READ;
> +  table->init_one_table(db_name.str, db_name.length, table_name.str,
> +                        table_name.length, table_name.str, lock_type_arg);
> +  return open_system_tables_for_read(thd, table, backup);
> +}
> +
> +static inline void clean_up(THD *thd)
> +{
> +  close_mysql_tables(thd);
> +  delete thd;
> +}
> +
> +/* Initialize static class members */
> +bool Cost_factors::isInitialized= false;
> +double Cost_factors::read_time_factor= 1.0;
> +double Cost_factors::scan_time_factor= 1.0;
> +
> +/* Interface functions */
> +
> +void Cost_factors::init()
> +{
> +  TABLE_LIST table_list;
> +  Open_tables_backup open_tables_backup;
> +  READ_RECORD read_record_info;
> +  TABLE *table;
> +  MEM_ROOT mem;
> +  init_sql_alloc(&mem, 1024, 0, MYF(0));
> +  THD *new_thd = new THD;
> +
> +  if(!new_thd)
> +  {
> +    free_root(&mem, MYF(0));
> +    DBUG_VOID_RETURN;
> +  }
> +
> +  new_thd->thread_stack= (char *) &new_thd;
> +  new_thd->store_globals();
> +  new_thd->set_db(db_name.str, db_name.length);
> +
> +  if(open_table(new_thd, &table_list, &open_tables_backup, FALSE))
> +  {
> +    clean_up(new_thd);
> +    DBUG_VOID_RETURN;

it's ok to do goto err: here and put the err: label at the end
with the cleanup code - this pattern is used very often in MariaDB

> +  }
> +
> +  table= table_list.table;
> +  if(init_read_record(&read_record_info, new_thd, table, NULL, 1, 0, FALSE))
> +  {
> +    clean_up(new_thd);
> +    DBUG_VOID_RETURN;
> +  }
> +
> +  table->use_all_columns();
> +  while (!read_record_info.read_record(&read_record_info))
> +  {
> +    LEX_STRING const_name;
> +    const_name.str= get_field(&mem, table->field[COST_FACTORS_CONST_NAME]);
> +    const_name.length= strlen(const_name.str);

you don't use const_name.length anywhere, you don't need LEX_STRING here.

> +
> +    double const_value;
> +    const_value= table->field[COST_FACTORS_CONST_VALUE]->val_real();
> +    if(!strcmp(const_name.str, read_time_factor_name.str))
> +    {
> +      Cost_factors::read_time_factor= const_value;
> +    }
> +    else if(!strcmp(const_name.str, scan_time_factor_name.str))
> +    {
> +      Cost_factors::scan_time_factor= const_value;
> +    }

To avoid many if/else if you can do like this:

  struct st_factor {
     const char *name;
     double *value;
  } factors = {
  { "READ_TIME_FACTOR", & Cost_factors::read_time_factor },
  { "SCAN_TIME_FACTOR", & Cost_factors::scan_time_factor },
  { 0, 0 }};

or even

  #define FACTOR(X)   { #X, &Cost_factors::X }
  struct st_factor {
     const char *name;
     double *value;
  } factors = {
      FACTOR(read_time_factor),
      FACTOR(scan_time_factor),
      { 0, 0 }};

either way, instead of if/elseif you do

  for (st_factor *f= factors; f->name; f++)
  {
    if (strcasecmp(f->name, const_name) == 0)
    {
      *(f->value)= const_value;
      break;
    }
  }
  if (f->name == 0)
    sql_print_warning("Invalid row in the optimizer_cost_factors table: %s", const_name);

> +  }
> +
> +  clean_up(new_thd);
> +  DBUG_VOID_RETURN;
> +}
Regards,
Sergei