Hi, Sergey,

The handler is quite OK. I only have a few, mostly cosmetic, comments.
=== modified file 'cmake/build_configurations/mysql_release.cmake'
--- cmake/build_configurations/mysql_release.cmake	2012-06-06 12:15:29 +0000
+++ cmake/build_configurations/mysql_release.cmake	2012-11-12 15:11:23 +0000
@@ -46,6 +46,8 @@
 SET(FEATURE_SET_large 5)
 SET(FEATURE_SET_xlarge 6)
 SET(FEATURE_SET_community 7)
+SET(WITH_CASSANDRA_STORAGE_ENGINE ON)
+
This probably should be reverted before pushing into main, right?
 IF(FEATURE_SET)
   STRING(TOLOWER ${FEATURE_SET} feature_set)
   SET(num ${FEATURE_SET_${feature_set}})
=== modified file 'sql/sql_join_cache.cc'
--- sql/sql_join_cache.cc	2012-03-24 17:21:22 +0000
+++ sql/sql_join_cache.cc	2012-11-12 15:11:23 +0000
@@ -4543,7 +4546,7 @@ bool JOIN_CACHE_BKAH::prepare_look_for_m
 {
   last_matching_rec_ref_ptr= next_matching_rec_ref_ptr= 0;
   if (no_association &&
-      (curr_matching_chain= get_matching_chain_by_join_key()))
+      !(curr_matching_chain= get_matching_chain_by_join_key())) //psergey: added '!'
Is that something that should be pushed into the main branch, or is it a temporary hack for Cassandra that should be reverted?
     return 1;
   last_matching_rec_ref_ptr= get_next_rec_ref(curr_matching_chain);
   return 0;
=== added file 'storage/cassandra/CMakeLists.txt'
--- storage/cassandra/CMakeLists.txt	1970-01-01 00:00:00 +0000
+++ storage/cassandra/CMakeLists.txt	2012-11-12 15:11:23 +0000
@@ -0,0 +1,25 @@
+
+SET(cassandra_sources
+     ha_cassandra.cc
+     ha_cassandra.h
+     cassandra_se.h
+     cassandra_se.cc
+     gen-cpp/Cassandra.cpp
+     gen-cpp/cassandra_types.h
+     gen-cpp/cassandra_types.cpp
+     gen-cpp/cassandra_constants.h
+     gen-cpp/cassandra_constants.cpp
+     gen-cpp/Cassandra.h)
+
+#INCLUDE_DIRECTORIES(BEFORE ${Boost_INCLUDE_DIRS})
+
+#INCLUDE_DIRECTORIES(AFTER /usr/local/include/thrift)
+INCLUDE_DIRECTORIES(AFTER /home/buildbot/build/thrift-inst/include/thrift/)
This needs to be fixed before pushing into the main tree, I suppose; you'll need to detect here whether Thrift is available and disable the engine otherwise.
+
+#
+STRING(REPLACE "-fno-exceptions" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+STRING(REPLACE "-fno-implicit-templates" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+#LINK_DIRECTORIES(/home/psergey/cassandra/thrift/lib)
+
+MYSQL_ADD_PLUGIN(cassandra ${cassandra_sources} STORAGE_ENGINE LINK_LIBRARIES thrift)
+# was: STORAGE_ENGINE MANDATORY
=== added file 'storage/cassandra/cassandra_se.cc' --- storage/cassandra/cassandra_se.cc 1970-01-01 00:00:00 +0000 +++ storage/cassandra/cassandra_se.cc 2012-11-12 15:11:23 +0000 @@ -0,0 +1,809 @@ + +// Cassandra includes: +#include <inttypes.h> +#include <netinet/in.h> +#include <sys/time.h> +#include <stdio.h> +#include <stdarg.h> + +#include "Thrift.h" +#include "transport/TSocket.h" +#include "transport/TTransport.h" +#include "transport/TBufferTransports.h" +#include "protocol/TProtocol.h" +#include "protocol/TBinaryProtocol.h" +#include "gen-cpp/Cassandra.h" +// cassandra includes end + +#include "cassandra_se.h" + +struct st_mysql_lex_string +{ + char *str; + size_t length; +}; + +using namespace std; +using namespace apache::thrift; +using namespace apache::thrift::transport; +using namespace apache::thrift::protocol; +using namespace org::apache::cassandra; + + +void Cassandra_se_interface::print_error(const char *format, ...) +{ + va_list ap; + va_start(ap, format); + // it's not a problem if output was truncated + vsnprintf(err_buffer, sizeof(err_buffer), format, ap);
my_vsnprintf, please.
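I.e., just a drop-in replacement, something like this (my_vsnprintf takes the same arguments as vsnprintf):

  void Cassandra_se_interface::print_error(const char *format, ...)
  {
    va_list ap;
    va_start(ap, format);
    /* truncation is still not a problem, my_vsnprintf zero-terminates within the buffer */
    my_vsnprintf(err_buffer, sizeof(err_buffer), format, ap);
    va_end(ap);
  }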
+ va_end(ap); +} + ... === added file 'storage/cassandra/ha_cassandra.h' --- storage/cassandra/ha_cassandra.h 1970-01-01 00:00:00 +0000 +++ storage/cassandra/ha_cassandra.h 2012-11-12 15:11:23 +0000 @@ -0,0 +1,328 @@ +/* + Copyright (c) 2012, Monty Program Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#ifdef USE_PRAGMA_INTERFACE +#pragma interface /* gcc class implementation */ +#endif + + +#include "my_global.h" /* ulonglong */ +#include "thr_lock.h" /* THR_LOCK, THR_LOCK_DATA */ +#include "handler.h" /* handler */ +#include "my_base.h" /* ha_rows */ + +#include "cassandra_se.h" + +/** @brief + CASSANDRA_SHARE is a structure that will be shared among all open handlers. + This example implements the minimum of what you will probably need. +*/ +typedef struct st_cassandra_share { + char *table_name; + uint table_name_length,use_count; + mysql_mutex_t mutex; + THR_LOCK lock; +} CASSANDRA_SHARE; + +class ColumnDataConverter; + +struct ha_table_option_struct; + + +struct st_dynamic_column_value; + +typedef bool (* CAS2DYN_CONVERTER)(const char *cass_data, + int cass_data_len, + struct st_dynamic_column_value *value); +typedef bool (* DYN2CAS_CONVERTER)(struct st_dynamic_column_value *value, + char **cass_data, + int *cass_data_len, + void *buf, void **freemem); +struct cassandra_type_def +{ + const char *name; + CAS2DYN_CONVERTER cassandra_to_dynamic; + DYN2CAS_CONVERTER dynamic_to_cassandra; +}; + +typedef struct cassandra_type_def CASSANDRA_TYPE_DEF; + +enum cassandtra_type_enum {CT_BIGINT, CT_INT, CT_COUNTER, CT_FLOAT, CT_DOUBLE, + CT_BLOB, CT_ASCII, CT_TEXT, CT_TIMESTAMP, CT_UUID, CT_BOOLEAN, CT_VARINT, + CT_DECIMAL}; + +typedef enum cassandtra_type_enum CASSANDRA_TYPE; + + + +/** @brief + Class definition for the storage engine +*/ +class ha_cassandra: public handler +{ + friend class Column_name_enumerator_impl; + THR_LOCK_DATA lock; ///< MySQL lock + CASSANDRA_SHARE *share; ///< Shared lock info + + Cassandra_se_interface *se; + + /* description of static part of the table definition */ + ColumnDataConverter **field_converters; + uint n_field_converters; + + CASSANDRA_TYPE_DEF *default_type_def; + /* description of dynamic columns part */ + CASSANDRA_TYPE_DEF *special_type_field_converters; + LEX_STRING *special_type_field_names; + uint n_special_type_fields; + DYNAMIC_ARRAY dynamic_values, dynamic_names; + DYNAMIC_STRING dynamic_rec; + + ColumnDataConverter *rowkey_converter; + + bool setup_field_converters(Field **field, uint n_fields); + void free_field_converters(); + + int read_cassandra_columns(bool unpack_pk); + int check_table_options(struct ha_table_option_struct* options); + + bool doing_insert_batch; + ha_rows insert_rows_batched; + + uint dyncol_field; + bool dyncol_set; + + /* Used to produce 'wrong column %s at row %lu' warnings */ + ha_rows insert_lineno; + void print_conversion_error(const char *field_name, + char *cass_value, int cass_value_len); + int 
connect_and_check_options(TABLE *table_arg); +public: + ha_cassandra(handlerton *hton, TABLE_SHARE *table_arg); + ~ha_cassandra() + { + free_field_converters(); + delete se; + } + + /** @brief + The name that will be used for display purposes. + */ + const char *table_type() const { return "CASSANDRA"; } + + /** @brief + The name of the index type that will be used for display. + Don't implement this method unless you really have indexes. + */ + const char *index_type(uint inx) { return "HASH"; } + + /** @brief + The file extensions. + */ + const char **bas_ext() const; + + /** @brief + This is a list of flags that indicate what functionality the storage engine + implements. The current table flags are documented in handler.h + */ + ulonglong table_flags() const + { + /* + HA_BINLOG_STMT_CAPABLE + We are saying that this engine is just statement capable to have + an engine that can only handle statement-based logging. This is + used in testing. + HA_REC_NOT_IN_SEQ + If we don't set it, filesort crashes, because it assumes rowids are + 1..8 byte numbers + */ + return HA_BINLOG_STMT_CAPABLE | + HA_REC_NOT_IN_SEQ;
HA_NO_TRANSACTIONS is unset. Is this engine transactional?
HA_PARTIAL_COLUMN_READ is unset. Do you always work with all columns, ignoring read_set and write_set?
HA_TABLE_SCAN_ON_INDEX is unset. Do you store the data MyISAM-style, index and data in separate files?
HA_FAST_KEY_READ is unset. Is reading keys in order faster in Cassandra than random reads?
HA_REQUIRE_PRIMARY_KEY is unset, but a primary key is required.
HA_PRIMARY_KEY_IN_READ_INDEX is unset, but as far as I can see, primary key columns are always returned (and are needed for position()).
HA_PRIMARY_KEY_REQUIRED_FOR_POSITION is unset, but a primary key is required for position().
HA_NO_AUTO_INCREMENT is unset. Is auto-increment supported?
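Just a sketch of what I mean (I am only guessing at the answers here, use whatever flags actually apply):

  ulonglong table_flags() const
  {
    /* guessing: not transactional, PK is mandatory, always read,
       and is the rowid; no auto-increment */
    return HA_BINLOG_STMT_CAPABLE |
           HA_REC_NOT_IN_SEQ |
           HA_NO_TRANSACTIONS |
           HA_REQUIRE_PRIMARY_KEY |
           HA_PRIMARY_KEY_IN_READ_INDEX |
           HA_PRIMARY_KEY_REQUIRED_FOR_POSITION |
           HA_NO_AUTO_INCREMENT;
  }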
+ + } + + /** @brief + This is a bitmap of flags that indicates how the storage engine + implements indexes. The current index flags are documented in + handler.h. If you do not implement indexes, just return zero here. + + @details + part is the key part to check. First key part is 0. + If all_parts is set, MySQL wants to know the flags for the combined + index, up to and including 'part'. + */ + ulong index_flags(uint inx, uint part, bool all_parts) const + { + return 0; + } + + /** @brief + unireg.cc will call max_supported_record_length(), max_supported_keys(), + max_supported_key_parts(), uint max_supported_key_length() + to make sure that the storage engine can handle the data it is about to + send. Return *real* limits of your storage engine here; MySQL will do + min(your_limits, MySQL_limits) automatically. + */ + uint max_supported_record_length() const { return HA_MAX_REC_LENGTH; } + + /* Support only one Primary Key, for now */ + uint max_supported_keys() const { return 1; } + uint max_supported_key_parts() const { return 1; } + + /** @brief + unireg.cc will call this to make sure that the storage engine can handle + the data it is about to send. Return *real* limits of your storage engine + here; MySQL will do min(your_limits, MySQL_limits) automatically. + + @details + There is no need to implement ..._key_... methods if your engine doesn't + support indexes. + */ + uint max_supported_key_length() const { return 16*1024; /* just to return something*/ } + + int index_init(uint idx, bool sorted); + + int index_read_map(uchar * buf, const uchar * key, + key_part_map keypart_map, + enum ha_rkey_function find_flag); + + /** @brief + Called in test_quick_select to determine if indexes should be used. + */ + virtual double scan_time() { return (double) (stats.records+stats.deleted) / 20.0+10; }
Does that make any sense? (I understand that it's copy-paste.) stats.deleted, for example, is never set.
+ + /** @brief + This method will never be called if you do not implement indexes. + */ + virtual double read_time(uint, uint, ha_rows rows) + { return (double) rows / 20.0+1; }
same question
+ + virtual void start_bulk_insert(ha_rows rows); + virtual int end_bulk_insert(); + + virtual int reset(); + + + int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param, + uint n_ranges, uint mode, HANDLER_BUFFER *buf); + int multi_range_read_next(range_id_t *range_info); + ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, + void *seq_init_param, + uint n_ranges, uint *bufsz, + uint *flags, COST_VECT *cost); + ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys, + uint key_parts, uint *bufsz, + uint *flags, COST_VECT *cost); + int multi_range_read_explain_info(uint mrr_mode, char *str, size_t size); + +private: + bool source_exhausted; + bool mrr_start_read(); + int check_field_options(Field **fields); + int read_dyncol(DYNAMIC_ARRAY *vals, DYNAMIC_ARRAY *names, + String *valcol, char **freenames); + int write_dynamic_row(DYNAMIC_ARRAY *names, DYNAMIC_ARRAY *vals); + void static free_dynamic_row(DYNAMIC_ARRAY *vals, DYNAMIC_ARRAY *names, + char *free_names); + CASSANDRA_TYPE_DEF * get_cassandra_field_def(char *cass_name, + int cass_name_length); +public: + + /* + Everything below are methods that we implement in ha_example.cc.
Maybe it'd be better to remove the references to ha_example.cc (and to the example engine in general) before pushing this into the main tree?
+ + Most of these methods are not obligatory, skip them and + MySQL will treat them as not implemented + */ + /** @brief + We implement this in ha_example.cc; it's a required method. + */ + int open(const char *name, int mode, uint test_if_locked); // required + + /** @brief + We implement this in ha_example.cc; it's a required method. + */ + int close(void); // required + + /** @brief + We implement this in ha_example.cc. It's not an obligatory method; + skip it and and MySQL will treat it as not implemented. + */ + int write_row(uchar *buf); + + /** @brief + We implement this in ha_example.cc. It's not an obligatory method; + skip it and and MySQL will treat it as not implemented. + */ + int update_row(const uchar *old_data, uchar *new_data); + + /** @brief + We implement this in ha_example.cc. It's not an obligatory method; + skip it and and MySQL will treat it as not implemented. + */ + int delete_row(const uchar *buf); + + /** @brief + We implement this in ha_example.cc. It's not an obligatory method; + skip it and and MySQL will treat it as not implemented. + */ + int index_next(uchar *buf); + + /** @brief + We implement this in ha_example.cc. It's not an obligatory method; + skip it and and MySQL will treat it as not implemented. + */ + int index_prev(uchar *buf); + + /** @brief + We implement this in ha_example.cc. It's not an obligatory method; + skip it and and MySQL will treat it as not implemented. + */ + int index_first(uchar *buf);
Why have unimplemented methods here at all? I'd rather remove them.
+ + /** @brief + We implement this in ha_example.cc. It's not an obligatory method; + skip it and and MySQL will treat it as not implemented. + */ + int index_last(uchar *buf); + + /** @brief + Unlike index_init(), rnd_init() can be called two consecutive times + without rnd_end() in between (it only makes sense if scan=1). In this + case, the second call should prepare for the new table scan (e.g if + rnd_init() allocates the cursor, the second call should position the + cursor to the start of the table; no need to deallocate and allocate + it again. This is a required method. + */ + int rnd_init(bool scan); //required + int rnd_end(); + int rnd_next(uchar *buf); ///< required + int rnd_pos(uchar *buf, uchar *pos); ///< required + void position(const uchar *record); ///< required + int info(uint); ///< required + int extra(enum ha_extra_function operation); + int external_lock(THD *thd, int lock_type); ///< required + int delete_all_rows(void); + ha_rows records_in_range(uint inx, key_range *min_key, + key_range *max_key); + int delete_table(const char *from); + int create(const char *name, TABLE *form, + HA_CREATE_INFO *create_info); ///< required + bool check_if_incompatible_data(HA_CREATE_INFO *info, + uint table_changes); + + THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to, + enum thr_lock_type lock_type); ///< required +};
=== added file 'storage/cassandra/ha_cassandra.cc' --- storage/cassandra/ha_cassandra.cc 1970-01-01 00:00:00 +0000 +++ storage/cassandra/ha_cassandra.cc 2012-11-12 15:11:23 +0000 @@ -0,0 +1,2674 @@ +/* + Copyright (c) 2012, Monty Program Ab + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifdef USE_PRAGMA_IMPLEMENTATION +#pragma implementation // gcc: Class implementation +#endif + +#include <mysql/plugin.h> +#include "ha_cassandra.h" +#include "sql_class.h" + +#define DYNCOL_USUAL 20 +#define DYNCOL_DELTA 100 +#define DYNCOL_USUAL_REC 1024 +#define DYNCOL_DELTA_REC 1024 + +static handler *cassandra_create_handler(handlerton *hton, + TABLE_SHARE *table, + MEM_ROOT *mem_root); + +extern int dynamic_column_error_message(enum_dyncol_func_result rc); + +handlerton *cassandra_hton; + + +/* + Hash used to track the number of open tables; variable for example share + methods +*/ +static HASH cassandra_open_tables; + +/* The mutex used to init the hash; variable for example share methods */ +mysql_mutex_t cassandra_mutex; + + +/** + Structure for CREATE TABLE options (table options). + It needs to be called ha_table_option_struct. + + The option values can be specified in the CREATE TABLE at the end: + CREATE TABLE ( ... ) *here* +*/ + +struct ha_table_option_struct +{ + const char *thrift_host; + int thrift_port; + const char *keyspace; + const char *column_family; +}; + + +ha_create_table_option cassandra_table_option_list[]= +{ + /* + one option that takes an arbitrary string + */ + HA_TOPTION_STRING("thrift_host", thrift_host), + HA_TOPTION_NUMBER("thrift_port", thrift_port, 9160, 1, 65535, 0), + HA_TOPTION_STRING("keyspace", keyspace), + HA_TOPTION_STRING("column_family", column_family), + HA_TOPTION_END +}; + +/** + Structure for CREATE TABLE options (field options). +*/ + +struct ha_field_option_struct +{ + bool dyncol_field; +}; + +ha_create_table_option cassandra_field_option_list[]= +{ + /* + Collect all other columns as dynamic here, + the valid values are YES/NO, ON/OFF, 1/0. + The default is 0, that is true, yes, on.
Really? Looks like a typo: 0 is false, 1 is true.
+ */ + HA_FOPTION_BOOL("DYNAMIC_COLUMN_STORAGE", dyncol_field, 0), + HA_FOPTION_END +}; + +static MYSQL_THDVAR_ULONG(insert_batch_size, PLUGIN_VAR_RQCMDARG, + "Number of rows in an INSERT batch", + NULL, NULL, /*default*/ 100, /*min*/ 1, /*max*/ 1024*1024*1024, 0); + +static MYSQL_THDVAR_ULONG(multiget_batch_size, PLUGIN_VAR_RQCMDARG, + "Number of rows in a multiget(MRR) batch", + NULL, NULL, /*default*/ 100, /*min*/ 1, /*max*/ 1024*1024*1024, 0); + +static MYSQL_THDVAR_ULONG(rnd_batch_size, PLUGIN_VAR_RQCMDARG, + "Number of rows in an rnd_read (full scan) batch", + NULL, NULL, /*default*/ 10*1000, /*min*/ 1, /*max*/ 1024*1024*1024, 0); + +static MYSQL_THDVAR_ULONG(failure_retries, PLUGIN_VAR_RQCMDARG, + "Number of times to retry Cassandra calls that failed due to timeouts or " + "network communication problems. The default, 0, means not to retry.", + NULL, NULL, /*default*/ 0, /*min*/ 0, /*max*/ 1024*1024*1024, 0); + +/* These match values in enum_cassandra_consistency_level */ +const char *cassandra_consistency_level[] = +{ + "ONE", + "QUORUM", + "LOCAL_QUORUM", + "EACH_QUORUM", + "ALL", + "ANY", + "TWO", + "THREE", + NullS +}; + +TYPELIB cassandra_consistency_level_typelib= { + array_elements(cassandra_consistency_level) - 1, "", + cassandra_consistency_level, NULL +}; + + +static MYSQL_THDVAR_ENUM(write_consistency, PLUGIN_VAR_RQCMDARG, + "Cassandra consistency level to use for write operations", NULL, NULL, + ONE, &cassandra_consistency_level_typelib); + +static MYSQL_THDVAR_ENUM(read_consistency, PLUGIN_VAR_RQCMDARG, + "Cassandra consistency level to use for read operations", NULL, NULL, + ONE, &cassandra_consistency_level_typelib); + + +mysql_mutex_t cassandra_default_host_lock; +static char* cassandra_default_thrift_host = NULL; +static char cassandra_default_host_buf[256]=""; + +static void +cassandra_default_thrift_host_update(THD *thd, + struct st_mysql_sys_var* var, + void* var_ptr, /*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + const char *new_host= *((char**)save); + const size_t max_len= sizeof(cassandra_default_host_buf); + + mysql_mutex_lock(&cassandra_default_host_lock); + + if (new_host) + { + strncpy(cassandra_default_host_buf, new_host, max_len-1); + cassandra_default_host_buf[max_len-1]= 0; + cassandra_default_thrift_host= cassandra_default_host_buf; + } + else + { + cassandra_default_host_buf[0]= 0; + cassandra_default_thrift_host= NULL; + } + + *((const char**)var_ptr)= cassandra_default_thrift_host; + + mysql_mutex_unlock(&cassandra_default_host_lock); +} + + +static MYSQL_SYSVAR_STR(default_thrift_host, cassandra_default_thrift_host, + PLUGIN_VAR_RQCMDARG, + "Default host for Cassandra thrift connections", + /*check*/NULL, + cassandra_default_thrift_host_update, + /*default*/NULL); + +static struct st_mysql_sys_var* cassandra_system_variables[]= { + MYSQL_SYSVAR(insert_batch_size), + MYSQL_SYSVAR(multiget_batch_size), + MYSQL_SYSVAR(rnd_batch_size), + + MYSQL_SYSVAR(default_thrift_host), + MYSQL_SYSVAR(write_consistency), + MYSQL_SYSVAR(read_consistency), + MYSQL_SYSVAR(failure_retries), + NULL +}; + + +static SHOW_VAR cassandra_status_variables[]= { + {"row_inserts", + (char*) &cassandra_counters.row_inserts, SHOW_LONG}, + {"row_insert_batches", + (char*) &cassandra_counters.row_insert_batches, SHOW_LONG}, + + {"multiget_reads", + (char*) &cassandra_counters.multiget_reads, SHOW_LONG}, + {"multiget_keys_scanned", + (char*) &cassandra_counters.multiget_keys_scanned, SHOW_LONG}, + 
{"multiget_rows_read", + (char*) &cassandra_counters.multiget_rows_read, SHOW_LONG}, + + {"timeout_exceptions", + (char*) &cassandra_counters.timeout_exceptions, SHOW_LONG}, + {"unavailable_exceptions", + (char*) &cassandra_counters.unavailable_exceptions, SHOW_LONG}, + {NullS, NullS, SHOW_LONG} +}; + + +Cassandra_status_vars cassandra_counters; +Cassandra_status_vars cassandra_counters_copy; + + +/** + @brief + Function we use in the creation of our hash to get key. +*/ + +static uchar* cassandra_get_key(CASSANDRA_SHARE *share, size_t *length, + my_bool not_used __attribute__((unused))) +{ + *length=share->table_name_length; + return (uchar*) share->table_name; +} + +#ifdef HAVE_PSI_INTERFACE +static PSI_mutex_key ex_key_mutex_example, ex_key_mutex_CASSANDRA_SHARE_mutex; + +static PSI_mutex_info all_cassandra_mutexes[]= +{ + { &ex_key_mutex_example, "cassandra", PSI_FLAG_GLOBAL}, + { &ex_key_mutex_CASSANDRA_SHARE_mutex, "CASSANDRA_SHARE::mutex", 0} +}; + +static void init_cassandra_psi_keys() +{ + const char* category= "cassandra"; + int count; + + if (PSI_server == NULL) + return; + + count= array_elements(all_cassandra_mutexes); + PSI_server->register_mutex(category, all_cassandra_mutexes, count); +} +#endif + +static int cassandra_init_func(void *p) +{ + DBUG_ENTER("cassandra_init_func"); + +#ifdef HAVE_PSI_INTERFACE + init_cassandra_psi_keys(); +#endif + + cassandra_hton= (handlerton *)p; + mysql_mutex_init(ex_key_mutex_example, &cassandra_mutex, MY_MUTEX_INIT_FAST); + (void) my_hash_init(&cassandra_open_tables,system_charset_info,32,0,0, + (my_hash_get_key) cassandra_get_key,0,0); + + cassandra_hton->state= SHOW_OPTION_YES; + cassandra_hton->create= cassandra_create_handler; + /* + Don't specify HTON_CAN_RECREATE in flags. re-create is used by TRUNCATE + TABLE to create an *empty* table from scratch. Cassandra table won't be + emptied if re-created. + */
HTON_ALTER_NOT_SUPPORTED is not set. ALTER works?
HTON_TEMPORARY_NOT_SUPPORTED is not set. CREATE TEMPORARY TABLE works?
HTON_NO_PARTITION is not set. Partitioning works?
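Again, only a sketch, assuming the answer to all three is "it doesn't work":

  cassandra_hton->flags= HTON_ALTER_NOT_SUPPORTED |
                         HTON_TEMPORARY_NOT_SUPPORTED |
                         HTON_NO_PARTITION;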
+ cassandra_hton->flags= 0; + cassandra_hton->table_options= cassandra_table_option_list; + cassandra_hton->field_options= cassandra_field_option_list; + + mysql_mutex_init(0 /* no instrumentation */,
Why no instrumentation?
+ &cassandra_default_host_lock, MY_MUTEX_INIT_FAST); + + DBUG_RETURN(0); +} + + +static int cassandra_done_func(void *p) +{ + int error= 0; + DBUG_ENTER("cassandra_done_func"); + if (cassandra_open_tables.records) + error= 1; + my_hash_free(&cassandra_open_tables); + mysql_mutex_destroy(&cassandra_mutex); + mysql_mutex_destroy(&cassandra_default_host_lock); + DBUG_RETURN(error); +} + + +/** + @brief + Example of simple lock controls. The "share" it creates is a + structure we will pass to each cassandra handler. Do you have to have + one of these? Well, you have pieces that are used for locking, and + they are needed to function. +*/ + +static CASSANDRA_SHARE *get_share(const char *table_name, TABLE *table) +{ + CASSANDRA_SHARE *share; + uint length; + char *tmp_name; + + mysql_mutex_lock(&cassandra_mutex); + length=(uint) strlen(table_name); + + if (!(share=(CASSANDRA_SHARE*) my_hash_search(&cassandra_open_tables, + (uchar*) table_name, + length))) + { + if (!(share=(CASSANDRA_SHARE *) + my_multi_malloc(MYF(MY_WME | MY_ZEROFILL), + &share, sizeof(*share), + &tmp_name, length+1, + NullS))) + { + mysql_mutex_unlock(&cassandra_mutex); + return NULL; + } + + share->use_count=0; + share->table_name_length=length; + share->table_name=tmp_name; + strmov(share->table_name,table_name); + if (my_hash_insert(&cassandra_open_tables, (uchar*) share)) + goto error; + thr_lock_init(&share->lock); + mysql_mutex_init(ex_key_mutex_CASSANDRA_SHARE_mutex, + &share->mutex, MY_MUTEX_INIT_FAST); + } + share->use_count++; + mysql_mutex_unlock(&cassandra_mutex); + + return share; + +error: + mysql_mutex_destroy(&share->mutex); + my_free(share); + + return NULL; +} + + +/** + @brief + Free lock controls. We call this whenever we close a table. If the table had + the last reference to the share, then we free memory associated with it. +*/ + +static int free_share(CASSANDRA_SHARE *share) +{ + mysql_mutex_lock(&cassandra_mutex); + if (!--share->use_count) + { + my_hash_delete(&cassandra_open_tables, (uchar*) share); + thr_lock_delete(&share->lock); + mysql_mutex_destroy(&share->mutex); + my_free(share); + } + mysql_mutex_unlock(&cassandra_mutex); + + return 0; +} + + +static handler* cassandra_create_handler(handlerton *hton, + TABLE_SHARE *table, + MEM_ROOT *mem_root) +{ + return new (mem_root) ha_cassandra(hton, table); +} + + +ha_cassandra::ha_cassandra(handlerton *hton, TABLE_SHARE *table_arg) + :handler(hton, table_arg), + se(NULL), field_converters(NULL), + special_type_field_converters(NULL), + special_type_field_names(NULL), n_special_type_fields(0), + rowkey_converter(NULL), + dyncol_field(0), dyncol_set(0) +{} + + +static const char *ha_cassandra_exts[] = { + NullS +}; + +const char **ha_cassandra::bas_ext() const +{ + return ha_cassandra_exts; +} + + +int ha_cassandra::connect_and_check_options(TABLE *table_arg) +{ + ha_table_option_struct *options= table_arg->s->option_struct; + int res; + DBUG_ENTER("ha_cassandra::connect_and_check_options"); + + if ((res= check_field_options(table_arg->s->field)) || + (res= check_table_options(options))) + DBUG_RETURN(res); + + se= create_cassandra_se(); + se->set_column_family(options->column_family); + const char *thrift_host= options->thrift_host? 
options->thrift_host: + cassandra_default_thrift_host; + if (se->connect(thrift_host, options->thrift_port, options->keyspace)) + { + my_error(ER_CONNECT_TO_FOREIGN_DATA_SOURCE, MYF(0), se->error_str()); + DBUG_RETURN(HA_ERR_NO_CONNECTION); + } + + if (setup_field_converters(table_arg->field, table_arg->s->fields)) + { + DBUG_RETURN(HA_ERR_NO_CONNECTION); + } + + DBUG_RETURN(0); +} + + +int ha_cassandra::check_field_options(Field **fields) +{ + Field **field; + uint i; + DBUG_ENTER("ha_cassandra::check_field_options"); + for (field= fields, i= 0; *field; field++, i++) + { + ha_field_option_struct *field_options= (*field)->option_struct; + if (field_options && field_options->dyncol_field) + { + if (dyncol_set || (*field)->type() != MYSQL_TYPE_BLOB) + { + my_error(ER_WRONG_FIELD_SPEC, MYF(0), (*field)->field_name); + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + } + dyncol_set= 1; + dyncol_field= i; + bzero(&dynamic_values, sizeof(dynamic_values)); + bzero(&dynamic_names, sizeof(dynamic_names)); + bzero(&dynamic_rec, sizeof(dynamic_rec)); + } + } + DBUG_RETURN(0); +} + + +int ha_cassandra::open(const char *name, int mode, uint test_if_locked) +{ + DBUG_ENTER("ha_cassandra::open"); + + if (!(share = get_share(name, table))) + DBUG_RETURN(1); + thr_lock_data_init(&share->lock,&lock,NULL); + + DBUG_ASSERT(!se); + /* + Don't do the following on open: it prevents SHOW CREATE TABLE when the server + has gone away. + */ + /* + int res; + if ((res= connect_and_check_options(table))) + { + DBUG_RETURN(res); + } + */ + + info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST); + insert_lineno= 0; + + DBUG_RETURN(0); +} + + +int ha_cassandra::close(void) +{ + DBUG_ENTER("ha_cassandra::close"); + delete se; + se= NULL; + free_field_converters(); + DBUG_RETURN(free_share(share)); +} + + +int ha_cassandra::check_table_options(ha_table_option_struct *options) +{ + if (!options->thrift_host && (!cassandra_default_thrift_host || + !cassandra_default_thrift_host[0])) + { + my_error(ER_CONNECT_TO_FOREIGN_DATA_SOURCE, MYF(0), + "thrift_host table option must be specified, or " + "@@cassandra_default_thrift_host must be set"); + return HA_WRONG_CREATE_OPTION; + } + + if (!options->keyspace || !options->column_family) + { + my_error(ER_CONNECT_TO_FOREIGN_DATA_SOURCE, MYF(0), + "keyspace and column_family table options must be specified"); + return HA_WRONG_CREATE_OPTION; + } + return 0; +} + + +/** + @brief + create() is called to create a table. The variable name will have the name + of the table. + + @details + When create() is called you do not need to worry about + opening the table. Also, the .frm file will have already been + created so adjusting create_info is not necessary. You can overwrite + the .frm file at this point if you wish to change the table + definition, but there are no methods currently provided for doing + so. + + Called from handle.cc by ha_create_table(). + + @see + ha_create_table() in handle.cc +*/ + +int ha_cassandra::create(const char *name, TABLE *table_arg, + HA_CREATE_INFO *create_info) +{ + int res; + DBUG_ENTER("ha_cassandra::create"); + + Field **pfield= table_arg->s->field; + if (!((*pfield)->flags & NOT_NULL_FLAG)) + { + my_error(ER_WRONG_COLUMN_NAME, MYF(0), "First column must be NOT NULL"); + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + }
This is unnecessary: the second check already guarantees that the first column is NOT NULL, so this if() is redundant.
+ + if (table_arg->s->keys != 1 || table_arg->s->primary_key !=0 || + table_arg->key_info[0].key_parts != 1 || + table_arg->key_info[0].key_part[0].fieldnr != 1) + { + my_error(ER_WRONG_COLUMN_NAME, MYF(0), + "Table must have PRIMARY KEY defined over the first column"); + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + } ... +CASSANDRA_TYPE get_cassandra_type(const char *validator) +{ + CASSANDRA_TYPE rc; + switch(validator[32]) + { + case 'L': + rc= CT_BIGINT; + break; + case 'I': + rc= (validator[35] == '3' ? CT_INT : CT_VARINT); + rc= CT_INT; + break; + case 'C': + rc= CT_COUNTER; + break; + case 'F': + rc= CT_FLOAT; + break; + case 'D': + switch (validator[33]) + { + case 'o': + rc= CT_DOUBLE; + break; + case 'a': + rc= CT_TIMESTAMP; + break; + case 'e': + rc= CT_DECIMAL; + break; + default: + rc= CT_BLOB; + break; + } + break; + case 'B': + rc= (validator[33] == 'o' ? CT_BOOLEAN : CT_BLOB); + break; + case 'A': + rc= CT_ASCII; + break; + case 'U': + rc= (validator[33] == 'T' ? CT_TEXT : CT_UUID); + break; + default: + rc= CT_BLOB; + } + DBUG_ASSERT(strcmp(cassandra_types[rc].name, validator) == 0); + return rc; +} + +ColumnDataConverter *map_field_to_validator(Field *field, const char *validator_name) +{ + ColumnDataConverter *res= NULL; + + switch(field->type()) { + case MYSQL_TYPE_TINY: + if (!strcmp(validator_name, validator_boolean))
Why do you strcmp here, while you only check selected characters in get_cassandra_type()? You could've called get_cassandra_type() on validator_name instead, and compared enum values instead of strings.
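Roughly like this (only a sketch, assuming get_cassandra_type() is safe to call on any validator_name; fall-through cases omitted for brevity):

  ColumnDataConverter *map_field_to_validator(Field *field, const char *validator_name)
  {
    ColumnDataConverter *res= NULL;
    CASSANDRA_TYPE vtype= get_cassandra_type(validator_name);

    switch (field->type()) {
    case MYSQL_TYPE_TINY:
      if (vtype == CT_BOOLEAN)
        res= new TinyintDataConverter;
      break;
    case MYSQL_TYPE_FLOAT:
      if (vtype == CT_FLOAT)
        res= new FloatDataConverter;
      break;
    /* ... and so on for the other types ... */
    default:;
    }
    return res;
  }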
+ { + res= new TinyintDataConverter; + break; + } + /* fall through: */ + case MYSQL_TYPE_SHORT: + case MYSQL_TYPE_LONGLONG: + { + bool is_counter= false; + if (!strcmp(validator_name, validator_bigint) || + !strcmp(validator_name, validator_timestamp) || + (is_counter= !strcmp(validator_name, validator_counter))) + res= new BigintDataConverter(!is_counter); + break; + } + case MYSQL_TYPE_FLOAT: + if (!strcmp(validator_name, validator_float)) + res= new FloatDataConverter; + break; + + case MYSQL_TYPE_DOUBLE: + if (!strcmp(validator_name, validator_double)) + res= new DoubleDataConverter; + break; + + case MYSQL_TYPE_TIMESTAMP: + if (!strcmp(validator_name, validator_timestamp)) + res= new TimestampDataConverter; + break; + + case MYSQL_TYPE_STRING: // these are space padded CHAR(n) strings. + if (!strcmp(validator_name, validator_uuid) && + field->real_type() == MYSQL_TYPE_STRING && + field->field_length == 36) + { + // UUID maps to CHAR(36), its text representation + res= new UuidDataConverter; + break; + } + /* fall through: */ + case MYSQL_TYPE_VAR_STRING: + case MYSQL_TYPE_VARCHAR: + { + /* + Cassandra's "varint" type is a binary-encoded arbitary-length + big-endian number. + - It can be mapped to VARBINARY(N), with sufficiently big N. + - If the value does not fit into N bytes, it is an error. We should not + truncate it, because that is just as good as returning garbage. + - varint should not be mapped to BINARY(N), because BINARY(N) values + are zero-padded, which will work as multiplying the value by + 2^k for some value of k. + */ + if (field->type() == MYSQL_TYPE_VARCHAR && + field->binary() && + (!strcmp(validator_name, validator_varint) || + !strcmp(validator_name, validator_decimal))) + { + res= new StringCopyConverter(field->field_length); + break; + } + + if (!strcmp(validator_name, validator_blob) || + !strcmp(validator_name, validator_ascii) || + !strcmp(validator_name, validator_text)) + { + res= new StringCopyConverter((size_t)-1); + } + break; + } + case MYSQL_TYPE_LONG: + if (!strcmp(validator_name, validator_int)) + res= new Int32DataConverter; + break; + + default:; + } + return res; +} + + +bool ha_cassandra::setup_field_converters(Field **field_arg, uint n_fields) +{ + char *col_name; + int col_name_len; + char *col_type; + int col_type_len; + size_t ddl_fields= se->get_ddl_size(); + const char *default_type= se->get_default_validator(); + uint max_non_default_fields; + DBUG_ENTER("ha_cassandra::setup_field_converters"); + DBUG_ASSERT(default_type); + + DBUG_ASSERT(!field_converters); + DBUG_ASSERT(dyncol_set == 0 || dyncol_set == 1); + + /* + We always should take into account that in case of using dynamic columns + sql description contain one field which does not described in + Cassandra DDL also key field is described separately. So that + is why we use "n_fields - dyncol_set - 1" or "ddl_fields + 2". + */ + max_non_default_fields= ddl_fields + 2 - n_fields; + if (ddl_fields < (n_fields - dyncol_set - 1)) + { + se->print_error("Some of SQL fields were not mapped to Cassandra's fields"); + my_error(ER_INTERNAL_ERROR, MYF(0), se->error_str()); + DBUG_RETURN(true); + } + + /* allocate memory in one chunk */ + size_t memsize= sizeof(ColumnDataConverter*) * n_fields + + (sizeof(LEX_STRING) + sizeof(CASSANDRA_TYPE_DEF))* + (dyncol_set ? 
max_non_default_fields : 0); + if (!(field_converters= (ColumnDataConverter**)my_malloc(memsize, MYF(0)))) + DBUG_RETURN(true); + bzero(field_converters, memsize); + n_field_converters= n_fields; + + if (dyncol_set) + { + special_type_field_converters= + (CASSANDRA_TYPE_DEF *)(field_converters + n_fields); + special_type_field_names= + ((LEX_STRING*)(special_type_field_converters + max_non_default_fields)); + } + + if (dyncol_set) + { + if (init_dynamic_array(&dynamic_values, + sizeof(DYNAMIC_COLUMN_VALUE), + DYNCOL_USUAL, DYNCOL_DELTA)) + DBUG_RETURN(true); + else + if (init_dynamic_array(&dynamic_names, + sizeof(LEX_STRING), + DYNCOL_USUAL, DYNCOL_DELTA)) + { + delete_dynamic(&dynamic_values); + DBUG_RETURN(true); + } + else + if (init_dynamic_string(&dynamic_rec, NULL, + DYNCOL_USUAL_REC, DYNCOL_DELTA_REC)) + { + delete_dynamic(&dynamic_values); + delete_dynamic(&dynamic_names); + DBUG_RETURN(true); + } + + /* Dynamic column field has special processing */ + field_converters[dyncol_field]= NULL; + + default_type_def= cassandra_types + get_cassandra_type(default_type); + } + + se->first_ddl_column(); + uint n_mapped= 0; + while (!se->next_ddl_column(&col_name, &col_name_len, &col_type, + &col_type_len)) + { + Field **field; + uint i; + /* Mapping for the 1st field is already known */ + for (field= field_arg + 1, i= 1; *field; field++, i++) + { + if ((!dyncol_set || dyncol_field != i) && + !strcmp((*field)->field_name, col_name)) + { + n_mapped++; + ColumnDataConverter **conv= field_converters + (*field)->field_index; + if (!(*conv= map_field_to_validator(*field, col_type))) + { + se->print_error("Failed to map column %s to datatype %s", + (*field)->field_name, col_type); + my_error(ER_INTERNAL_ERROR, MYF(0), se->error_str()); + DBUG_RETURN(true); + } + (*conv)->field= *field; + } + } + if (dyncol_set && !(*field)) // is needed and not found + { + DBUG_PRINT("info",("Field not found: %s", col_name)); + if (strcmp(col_type, default_type)) + { + DBUG_PRINT("info",("Field '%s' non-default type: '%s'", + col_name, col_type)); + special_type_field_names[n_special_type_fields].length= col_name_len; + special_type_field_names[n_special_type_fields].str= col_name; + special_type_field_converters[n_special_type_fields]= + cassandra_types[get_cassandra_type(col_type)]; + n_special_type_fields++; + } + } + } + + if (n_mapped != n_fields - 1 - dyncol_set) + { + Field *first_unmapped= NULL; + /* Find the first field */ + for (uint i= 1; i < n_fields;i++) + { + if (!field_converters[i]) + { + first_unmapped= field_arg[i]; + break; + } + } + DBUG_ASSERT(first_unmapped); + + se->print_error("Field `%s` could not be mapped to any field in Cassandra",
Generally (here and everywhere), use %`s instead of `%s`.
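That is, for example (assuming print_error() switches to my_vsnprintf as suggested above, which understands %`s and adds the quoting itself):

  se->print_error("Field %`s could not be mapped to any field in Cassandra",
                  first_unmapped->field_name);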
+ first_unmapped->field_name); + my_error(ER_INTERNAL_ERROR, MYF(0), se->error_str()); + DBUG_RETURN(true); + } + + /* + Setup type conversion for row_key. + */ + se->get_rowkey_type(&col_name, &col_type); + if (col_name && strcmp(col_name, (*field_arg)->field_name)) + { + se->print_error("PRIMARY KEY column must match Cassandra's name '%s'", + col_name); + my_error(ER_INTERNAL_ERROR, MYF(0), se->error_str()); + DBUG_RETURN(true); + } + if (!col_name && strcmp("rowkey", (*field_arg)->field_name)) + { + se->print_error("target column family has no key_alias defined, " + "PRIMARY KEY column must be named 'rowkey'"); + my_error(ER_INTERNAL_ERROR, MYF(0), se->error_str()); + DBUG_RETURN(true); + } + + if (col_type != NULL) + { + if (!(rowkey_converter= map_field_to_validator(*field_arg, col_type))) + { + se->print_error("Failed to map PRIMARY KEY to datatype %s", col_type); + my_error(ER_INTERNAL_ERROR, MYF(0), se->error_str()); + DBUG_RETURN(true); + } + rowkey_converter->field= *field_arg; + } + else + { + se->print_error("Cassandra's rowkey has no defined datatype (todo: support this)"); + my_error(ER_INTERNAL_ERROR, MYF(0), se->error_str()); + DBUG_RETURN(true); + } + + DBUG_RETURN(false); +} + + +void ha_cassandra::free_field_converters() +{ + delete rowkey_converter; + rowkey_converter= NULL; + + if (dyncol_set) + { + delete_dynamic(&dynamic_values); + delete_dynamic(&dynamic_names); + dynstr_free(&dynamic_rec); + } + if (field_converters) + { + for (uint i=0; i < n_field_converters; i++) + if (field_converters[i]) + { + DBUG_ASSERT(!dyncol_set || i == dyncol_field); + delete field_converters[i]; + } + my_free(field_converters); + field_converters= NULL; + } +} + + +int ha_cassandra::index_init(uint idx, bool sorted) +{ + int ires; + if (!se && (ires= connect_and_check_options(table))) + return ires; + return 0; +} + +void store_key_image_to_rec(Field *field, uchar *ptr, uint len); + +int ha_cassandra::index_read_map(uchar *buf, const uchar *key, + key_part_map keypart_map, + enum ha_rkey_function find_flag) +{ + int rc= 0; + DBUG_ENTER("ha_cassandra::index_read_map"); + + if (find_flag != HA_READ_KEY_EXACT) + DBUG_RETURN(HA_ERR_WRONG_COMMAND);
Did you verify that the server expects HA_ERR_WRONG_COMMAND from this method and handles it correctly? Or, perhaps, is find_flag always HA_READ_KEY_EXACT here, because of your index_flags()?
+ + uint key_len= calculate_key_len(table, active_index, key, keypart_map); + store_key_image_to_rec(table->field[0], (uchar*)key, key_len); + + char *cass_key; + int cass_key_len; + my_bitmap_map *old_map; + + old_map= dbug_tmp_use_all_columns(table, table->read_set); + + if (rowkey_converter->mariadb_to_cassandra(&cass_key, &cass_key_len)) + { + /* We get here when making lookups like uuid_column='not-an-uuid' */ + dbug_tmp_restore_column_map(table->read_set, old_map); + DBUG_RETURN(HA_ERR_KEY_NOT_FOUND); + } + + dbug_tmp_restore_column_map(table->read_set, old_map); + + bool found; + if (se->get_slice(cass_key, cass_key_len, &found)) + { + my_error(ER_INTERNAL_ERROR, MYF(0), se->error_str());
Generally it's better not to use my_error() directly, but to only return an error code, and report your se->error_str() from ha_cassandra::get_error_message().
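A sketch of what I mean (get_error_message() returns whether the error is temporary, so FALSE here):

  bool ha_cassandra::get_error_message(int error, String *buf)
  {
    if (error == HA_ERR_INTERNAL_ERROR && se)
      buf->copy(se->error_str(), (uint32) strlen(se->error_str()),
                system_charset_info);
    return FALSE;
  }

Then this place would simply do rc= HA_ERR_INTERNAL_ERROR; and the server picks the message up itself.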
+ rc= HA_ERR_INTERNAL_ERROR; + } + + /* TODO: what if we're not reading all columns?? */ + if (!found) + rc= HA_ERR_KEY_NOT_FOUND; + else + rc= read_cassandra_columns(false); + + DBUG_RETURN(rc); +} + + +void ha_cassandra::print_conversion_error(const char *field_name, + char *cass_value, + int cass_value_len) +{ + char buf[32]; + char *p= cass_value; + size_t i= 0; + for (; (i < (int)sizeof(buf)-1) && (p < cass_value + cass_value_len); p++)
Why do you cast here?
+ { + buf[i++]= map2number[(*p >> 4) & 0xF]; + buf[i++]= map2number[*p & 0xF]; + } + buf[i]=0; + + se->print_error("Unable to convert value for field `%s` from Cassandra's data" + " format. Source data is %d bytes, 0x%s%s", + field_name, cass_value_len, buf, + (i == sizeof(buf) - 1)? "..." : ""); + my_error(ER_INTERNAL_ERROR, MYF(0), se->error_str()); +} + + +void free_strings(DYNAMIC_COLUMN_VALUE *vals, uint num) +{ + for (uint i= 0; i < num; i++) + if (vals[i].type == DYN_COL_STRING && + !vals[i].x.string.nonfreeable) + my_free(vals[i].x.string.value.str); +} + + +CASSANDRA_TYPE_DEF * ha_cassandra::get_cassandra_field_def(char *cass_name, + int cass_name_len) +{ + CASSANDRA_TYPE_DEF *type= default_type_def; + for(uint i= 0; i < n_special_type_fields; i++) + { + if (cass_name_len == (int)special_type_field_names[i].length && + memcmp(cass_name, special_type_field_names[i].str, + cass_name_len) == 0) + { + type= special_type_field_converters + i; + break; + } + } + return type; +} + +int ha_cassandra::read_cassandra_columns(bool unpack_pk) +{ + char *cass_name; + char *cass_value; + int cass_value_len, cass_name_len; + Field **field; + int res= 0; + ulong total_name_len= 0; + + /* + cassandra_to_mariadb() calls will use field->store(...) methods, which + require that the column is in the table->write_set + */ + my_bitmap_map *old_map; + old_map= dbug_tmp_use_all_columns(table, table->write_set); + + /* Start with all fields being NULL */ + for (field= table->field + 1; *field; field++) + (*field)->set_null(); + + while (!se->get_next_read_column(&cass_name, &cass_name_len, + &cass_value, &cass_value_len)) + { + // map to our column. todo: use hash or something.. + bool found= 0; + for (field= table->field + 1; *field; field++) + { + uint fieldnr= (*field)->field_index; + if ((!dyncol_set || dyncol_field != fieldnr) && + !strcmp((*field)->field_name, cass_name)) + { + found= 1; + (*field)->set_notnull(); + if (field_converters[fieldnr]->cassandra_to_mariadb(cass_value, + cass_value_len)) + { + print_conversion_error((*field)->field_name, cass_value, + cass_value_len); + res=1; + goto err; + } + break; + } + } + if (dyncol_set && !found) + { + DYNAMIC_COLUMN_VALUE val; + LEX_STRING nm; + CASSANDRA_TYPE_DEF *type= get_cassandra_field_def(cass_name, + cass_name_len); + nm.str= cass_name; + nm.length= cass_name_len; + if (nm.length > MAX_NAME_LENGTH) + { + se->print_error("Unable to convert value for field `%s`" + " from Cassandra's data format. Name" + " length exceed limit of %u: '%s'", + table->field[dyncol_field]->field_name, + (uint)MAX_NAME_LENGTH, cass_name); + my_error(ER_INTERNAL_ERROR, MYF(0), se->error_str()); + res=1; + goto err; + } + total_name_len+= cass_name_len; + if (nm.length > MAX_TOTAL_NAME_LENGTH) + { + se->print_error("Unable to convert value for field `%s`" + " from Cassandra's data format. 
Sum of all names" + " length exceed limit of %lu", + table->field[dyncol_field]->field_name, + cass_name, (uint)MAX_TOTAL_NAME_LENGTH); + my_error(ER_INTERNAL_ERROR, MYF(0), se->error_str()); + res=1; + goto err; + } + + if ((res= (*(type->cassandra_to_dynamic))(cass_value, + cass_value_len, &val)) || + insert_dynamic(&dynamic_names, (uchar *) &nm) || + insert_dynamic(&dynamic_values, (uchar *) &val)) + { + if (res) + { + print_conversion_error(cass_name, cass_value, cass_value_len); + } + free_strings((DYNAMIC_COLUMN_VALUE *)dynamic_values.buffer, + dynamic_values.elements); + // EOM shouldm be already reported if happened + res=1; + goto err; + } + } + } + + dynamic_rec.length= 0; + if (dyncol_set) + { + if (dynamic_column_create_many_internal_fmt(&dynamic_rec, + dynamic_names.elements, + dynamic_names.buffer, + (DYNAMIC_COLUMN_VALUE *) + dynamic_values.buffer, + FALSE, + TRUE) < 0) + dynamic_rec.length= 0; + + free_strings((DYNAMIC_COLUMN_VALUE *)dynamic_values.buffer, + dynamic_values.elements); + dynamic_values.elements= dynamic_names.elements= 0; + } + if (dyncol_set)
Why a separate if()?
+ { + if (dynamic_rec.length == 0) + table->field[dyncol_field]->set_null(); + else + { + Field_blob *blob= (Field_blob *)table->field[dyncol_field]; + blob->set_notnull(); + blob->store_length(dynamic_rec.length); + *((char **)(((char *)blob->ptr) + blob->pack_length_no_ptr()))= + dynamic_rec.str; + } + } + + if (unpack_pk) + { + /* Unpack rowkey to primary key */ + field= table->field; + (*field)->set_notnull(); + se->get_read_rowkey(&cass_value, &cass_value_len); + if (rowkey_converter->cassandra_to_mariadb(cass_value, cass_value_len)) + { + print_conversion_error((*field)->field_name, cass_value, cass_value_len); + res=1; + goto err; + } + } + +err: + dbug_tmp_restore_column_map(table->write_set, old_map); + return res; +} ... +int ha_cassandra::rnd_pos(uchar *buf, uchar *pos) +{ + int rc; + DBUG_ENTER("ha_cassandra::rnd_pos"); + + int save_active_index= active_index; + active_index= 0; /* The primary key */ + rc= index_read_map(buf, pos, key_part_map(1), HA_READ_KEY_EXACT); + + active_index= save_active_index;
It would be a bit nicer to implement index_read_idx_map() and call it both from here and from index_read_map().
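Something like this (just a sketch; the lookup code that is now in index_read_map() would move into index_read_idx_map()):

  int ha_cassandra::index_read_idx_map(uchar *buf, uint idx, const uchar *key,
                                       key_part_map keypart_map,
                                       enum ha_rkey_function find_flag)
  {
    /* ... the current body of index_read_map(), using 'idx' ... */
  }

  int ha_cassandra::index_read_map(uchar *buf, const uchar *key,
                                   key_part_map keypart_map,
                                   enum ha_rkey_function find_flag)
  {
    return index_read_idx_map(buf, active_index, key, keypart_map, find_flag);
  }

  int ha_cassandra::rnd_pos(uchar *buf, uchar *pos)
  {
    DBUG_ENTER("ha_cassandra::rnd_pos");
    /* the primary key is index 0 */
    DBUG_RETURN(index_read_idx_map(buf, 0, pos, key_part_map(1),
                                   HA_READ_KEY_EXACT));
  }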
+ + DBUG_RETURN(rc); +} + ... +bool ha_cassandra::mrr_start_read() +{ + uint key_len; + + my_bitmap_map *old_map; + old_map= dbug_tmp_use_all_columns(table, table->read_set); + + se->new_lookup_keys(); + + while (!(source_exhausted= mrr_funcs.next(mrr_iter, &mrr_cur_range))) + { + char *cass_key; + int cass_key_len; + + DBUG_ASSERT(mrr_cur_range.range_flag & EQ_RANGE); + + uchar *key= (uchar*)mrr_cur_range.start_key.key; + key_len= mrr_cur_range.start_key.length; + //key_len= calculate_key_len(table, active_index, key, keypart_map); // NEED THIS?? + store_key_image_to_rec(table->field[0], (uchar*)key, key_len); + + rowkey_converter->mariadb_to_cassandra(&cass_key, &cass_key_len); + + // Primitive buffer control + if (se->add_lookup_key(cass_key, cass_key_len) > + THDVAR(table->in_use, multiget_batch_size)) + break;
I'd suggest adding a status variable to count how many times the buffer was refilled.
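For example (the counter name below is just a suggestion, any name will do):

  /* in Cassandra_status_vars: add an 'ulong multiget_batches;' member */

  /* here, each time the buffer is (re)filled: */
  cassandra_counters.multiget_batches++;

  /* and in cassandra_status_variables[]: */
  {"multiget_batches",
   (char*) &cassandra_counters.multiget_batches, SHOW_LONG},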
+ } + + dbug_tmp_restore_column_map(table->read_set, old_map); + + return se->multiget_slice(); +} ... +///////////////////////////////////////////////////////////////////////////// +// Dummy implementations start +///////////////////////////////////////////////////////////////////////////// + + +int ha_cassandra::index_next(uchar *buf)
Why did you do all these dummy methods?
+{ + int rc; + DBUG_ENTER("ha_cassandra::index_next"); + rc= HA_ERR_WRONG_COMMAND; + DBUG_RETURN(rc); +} + + +int ha_cassandra::index_prev(uchar *buf) +{ + int rc; + DBUG_ENTER("ha_cassandra::index_prev"); + rc= HA_ERR_WRONG_COMMAND; + DBUG_RETURN(rc); +} + + +int ha_cassandra::index_first(uchar *buf) +{ + int rc; + DBUG_ENTER("ha_cassandra::index_first"); + rc= HA_ERR_WRONG_COMMAND; + DBUG_RETURN(rc); +} + +int ha_cassandra::index_last(uchar *buf) +{ + int rc; + DBUG_ENTER("ha_cassandra::index_last"); + rc= HA_ERR_WRONG_COMMAND; + DBUG_RETURN(rc); +} + + +ha_rows ha_cassandra::records_in_range(uint inx, key_range *min_key, + key_range *max_key) +{ + DBUG_ENTER("ha_cassandra::records_in_range"); + //DBUG_RETURN(10); // low number to force index usage + DBUG_RETURN(HA_POS_ERROR); +} + + +class Column_name_enumerator_impl : public Column_name_enumerator +{ + ha_cassandra *obj; + uint idx; +public: + Column_name_enumerator_impl(ha_cassandra *obj_arg) : obj(obj_arg), idx(1) {} + const char* get_next_name() + { + if (idx == obj->table->s->fields) + return NULL; + else + return obj->table->field[idx++]->field_name; + } +}; + + +int ha_cassandra::update_row(const uchar *old_data, uchar *new_data) +{ + DYNAMIC_ARRAY oldvals, oldnames, vals, names; + String oldvalcol, valcol; + char *oldfree_names= NULL, *free_names= NULL; + my_bitmap_map *old_map; + int res; + DBUG_ENTER("ha_cassandra::update_row"); + /* Currently, it is guaranteed that new_data == table->record[0] */ + DBUG_ASSERT(new_data == table->record[0]); + /* For now, just rewrite the full record */ + se->clear_insert_buffer(); + + old_map= dbug_tmp_use_all_columns(table, table->read_set); + + char *old_key; + int old_key_len; + se->get_read_rowkey(&old_key, &old_key_len); + + /* Get the key we're going to write */ + char *new_key; + int new_key_len; + if (rowkey_converter->mariadb_to_cassandra(&new_key, &new_key_len)) + { + my_error(ER_WARN_DATA_OUT_OF_RANGE, MYF(0), + rowkey_converter->field->field_name, insert_lineno); + dbug_tmp_restore_column_map(table->read_set, old_map); + DBUG_RETURN(HA_ERR_AUTOINC_ERANGE); + } + + /* + Compare it to the key we've read. For all types that Cassandra supports, + binary byte-wise comparison can be used + */ + bool new_primary_key; + if (new_key_len != old_key_len || memcmp(old_key, new_key, new_key_len)) + new_primary_key= true; + else + new_primary_key= false; + + if (dyncol_set) + { + Field *field= table->field[dyncol_field]; + /* move to get old_data */ + my_ptrdiff_t diff; + diff= (my_ptrdiff_t) (old_data - new_data); + field->move_field_offset(diff); // Points now at old_data + if ((res= read_dyncol(&oldvals, &oldnames, &oldvalcol, &oldfree_names))) + DBUG_RETURN(res); + field->move_field_offset(-diff); // back to new_data + if ((res= read_dyncol(&vals, &names, &valcol, &free_names))) + { + free_dynamic_row(&oldnames, &oldvals, oldfree_names); + DBUG_RETURN(res); + } + } + + if (new_primary_key) + { + /* + Primary key value changed. This is essentially a DELETE + INSERT. + Add a DELETE operation into the batch + */ + Column_name_enumerator_impl name_enumerator(this); + se->add_row_deletion(old_key, old_key_len, &name_enumerator, + (LEX_STRING *)oldnames.buffer, + (dyncol_set ? oldnames.elements : 0)); + oldnames.elements= oldvals.elements= 0; // they will be deleted + } + + se->start_row_insert(new_key, new_key_len); + + /* Convert other fields */ + for (uint i= 1; i < table->s->fields; i++)
I probably would've simply done delete+insert in all cases :) But as you've gone to the trouble of updating column by column, it makes little sense to rewrite columns that didn't change. Could you check table->write_set here and memcmp with the old column value before updating?
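A rough sketch of what I have in mind (assumes old_data/new_data are the usual record[1]/record[0] buffers; blob values would need a proper compare):

  for (uint i= 1; i < table->s->fields; i++)
  {
    Field *field= table->field[i];
    if (!bitmap_is_set(table->write_set, field->field_index))
      continue;                                   /* column was not assigned */
    if (!field->cmp_binary_offset((uint) (old_data - new_data)))
      continue;                                   /* value did not change */
    /* ... convert and add to the insert batch as before ... */
  }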
+ { + char *cass_data; + int cass_data_len; + if (dyncol_set && dyncol_field == i) + { + DBUG_ASSERT(field_converters[i] == NULL); + if ((res= write_dynamic_row(&vals, &names))) + goto err; + } + else + { + if (field_converters[i]->mariadb_to_cassandra(&cass_data, &cass_data_len)) + { + my_error(ER_WARN_DATA_OUT_OF_RANGE, MYF(0), + field_converters[i]->field->field_name, insert_lineno); + dbug_tmp_restore_column_map(table->read_set, old_map); + DBUG_RETURN(HA_ERR_AUTOINC_ERANGE); + } + se->add_insert_column(field_converters[i]->field->field_name, 0, + cass_data, cass_data_len); + } + } + if (dyncol_set) + { + /* find removed fields */ + uint i= 0, j= 0; + LEX_STRING *onames= (LEX_STRING *)oldnames.buffer; + LEX_STRING *nnames= (LEX_STRING *)names.buffer; + /* both array are sorted */ + for(; i < oldnames.elements; i++) + { + int scmp= 0; + while (j < names.elements && + (nnames[j].length < onames[i].length || + (nnames[j].length == onames[i].length && + (scmp= memcmp(nnames[j].str, onames[i].str, + onames[i].length)) < 0))) + j++; + if (j < names.elements && + nnames[j].length == onames[i].length && + scmp == 0) + j++; + else + se->add_insert_delete_column(onames[i].str, onames[i].length); + } + } + + dbug_tmp_restore_column_map(table->read_set, old_map); + + res= se->do_insert(); + + if (res) + my_error(ER_INTERNAL_ERROR, MYF(0), se->error_str()); + +err: + if (dyncol_set) + { + free_dynamic_row(&oldnames, &oldvals, oldfree_names); + free_dynamic_row(&names, &vals, free_names); + } + + DBUG_RETURN(res? HA_ERR_INTERNAL_ERROR: 0); +} + + +int ha_cassandra::extra(enum ha_extra_function operation) +{ + DBUG_ENTER("ha_cassandra::extra"); + DBUG_RETURN(0); +}
Please, why do you like dummy methods that much? :)
+ + +/* The following function was copied from ha_blackhole::store_lock: */ +THR_LOCK_DATA **ha_cassandra::store_lock(THD *thd, + THR_LOCK_DATA **to, + enum thr_lock_type lock_type) +{ + DBUG_ENTER("ha_cassandra::store_lock"); + if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK) + { + /* + Here is where we get into the guts of a row level lock. + If TL_UNLOCK is set + If we are not doing a LOCK TABLE or DISCARD/IMPORT + TABLESPACE, then allow multiple writers + */ + + if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && + lock_type <= TL_WRITE) && !thd_in_lock_tables(thd) + && !thd_tablespace_op(thd))
1. A tablespace op in Cassandra? Really? Too much copy-paste is confusing (here and in the comments too).
2. Did you test LOCK TABLES?
+ lock_type = TL_WRITE_ALLOW_WRITE;
That makes all changes to Cassandra immediately visible to concurrently running threads. Okay, if that's intentional, but it needs to be documented, as it is not the usual SQL semantics.
+ + /* + In queries of type INSERT INTO t1 SELECT ... FROM t2 ... + MySQL would use the lock TL_READ_NO_INSERT on t2, and that + would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts + to t2. Convert the lock to a normal read lock to allow + concurrent inserts to t2. + */ + + if (lock_type == TL_READ_NO_INSERT && !thd_in_lock_tables(thd)) + lock_type = TL_READ;
This also breaks SBR for INSERT ... SELECT. Same as above: okay if intentional, but it should be thoroughly documented.
+ + lock.type= lock_type; + } + *to++= &lock; + DBUG_RETURN(to); +} + + +int ha_cassandra::external_lock(THD *thd, int lock_type) +{ + DBUG_ENTER("ha_cassandra::external_lock"); + DBUG_RETURN(0); +} + +int ha_cassandra::delete_table(const char *name) +{ + DBUG_ENTER("ha_cassandra::delete_table"); + /* + Cassandra table is just a view. Dropping it doesn't affect the underlying + column family. + */ + DBUG_RETURN(0); +}
more dummies...
+ + +/** + check_if_incompatible_data() called if ALTER TABLE can't detect otherwise + if new and old definition are compatible + + @details If there are no other explicit signs like changed number of + fields this function will be called by compare_tables() + (sql/sql_tables.cc) to decide should we rewrite whole table or only .frm + file. + +*/ + +bool ha_cassandra::check_if_incompatible_data(HA_CREATE_INFO *info, + uint table_changes) +{ + DBUG_ENTER("ha_cassandra::check_if_incompatible_data"); + /* Checked, we intend to have this empty for Cassandra SE. */ + DBUG_RETURN(COMPATIBLE_DATA_YES); +} + + +///////////////////////////////////////////////////////////////////////////// +// Dummy implementations end +/////////////////////////////////////////////////////////////////////////////
::update_row and ::store_lock are hardly dummies :)
+ +static int show_cassandra_vars(THD *thd, SHOW_VAR *var, char *buff) +{ + cassandra_counters_copy= cassandra_counters; + + var->type= SHOW_ARRAY; + var->value= (char *) &cassandra_status_variables;
What's the point? If you don't do anything in this function, you can just as easily list all the status variables below, in the func_status[] array.
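That is, something like this (SHOW_ARRAY nests one array into another; the "Cassandra_" prefixing works the same way as with the SHOW_FUNC entry):

  static struct st_mysql_show_var func_status[]=
  {
    {"Cassandra", (char *) cassandra_status_variables, SHOW_ARRAY},
    {0,0,SHOW_UNDEF}
  };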
+ return 0; +} + + +struct st_mysql_storage_engine cassandra_storage_engine= +{ MYSQL_HANDLERTON_INTERFACE_VERSION }; + +static struct st_mysql_show_var func_status[]= +{ + {"Cassandra", (char *)show_cassandra_vars, SHOW_FUNC}, + {0,0,SHOW_UNDEF} +}; + +maria_declare_plugin(cassandra) +{ + MYSQL_STORAGE_ENGINE_PLUGIN, + &cassandra_storage_engine, + "CASSANDRA", + "Monty Program Ab", + "Cassandra storage engine", + PLUGIN_LICENSE_GPL, + cassandra_init_func, /* Plugin Init */ + cassandra_done_func, /* Plugin Deinit */ + 0x0001, /* version number (0.1) */ + func_status, /* status variables */ + cassandra_system_variables, /* system variables */ + "0.1", /* string version */ + MariaDB_PLUGIN_MATURITY_EXPERIMENTAL /* maturity */ +} +maria_declare_plugin_end;
Regards,
Sergei