Hello, 

I have recently been doing some capacity planning exercises for a new database we are developing in MariaDB.  One of the exercises I completed was to create a database with all tables loaded with maximum width records at 100,000, 500,000, and 1,000,000 records.  I noticed that when I did this it seems that the larger the tables grow, the less bytes per record are required.  I am curious what causes this to happen?  These are all InnoDB tables saved to their own tablespaces.





Max Bytes Per Record



Record Count Record Count Record Count

100,000 500,000 1,000,000
acct_activation 241 176 164
acct_deactivated 419 336 327
disk_usage 136 75 63
keywords 713 646 633
pods 77,511 56,698 53,126
pods_joined 178 101 96
pod_activity 461 369 357
pod_files 78,224 57,220 53,615
sessions 168 92 80
users 1,426 1,351 1,338
usr_profile 78,392 57,343 53,730


I also created a query (using Information_Schema.Columns) to calculate maximum possible bytes per record / table (SQL attached below).  I ran the query and compared the estimates to the actual values produced above.  The query is close, but I would like to make it a little more accurate if possible. 


TABLE_SCHEMA TABLE_NAME PK_BYTES_TOT FIELD_BYTES_TOT IX_FIELD_BYTES_TOT IX_FIELD_COUNT IX_OVERHEAD_BYTES IX_BYTES_TOT TABLE_BYTES_TOT
pods acct_activation 0 73 33 1 29 62 189
pods acct_deactivated 8 269 0 0 0 0 388
pods devices 1 273 0 0 0 0 384
pods disk_usage 8 32 0 0 0 0 56
pods keywords 0 264 264 2 58 322 820
pods pods 8 65,821 16 1 29 45 92,224
pods pods_joined 0 16 16 2 58 74 126
pods pod_activity 0 275 9 1 29 38 438
pods pod_files 8 66,376 56 2 58 114 93,097
pods sessions 0 10 10 2 58 68 109
pods users 8 1,112 41 1 29 70 1,666
pods usr_profile 8 66,628 0 0 0 0 93,290
    
Does anyone have any suggestions for my calculations in the code below to make it more accurate?

Basic Assumptions 


Current SQL to calculate MAX record bytes by table:

SELECT B.TABLE_SCHEMA
     , B.TABLE_NAME
     , SUM(PK_BYTES) AS PK_BYTES_TOT
     , SUM(FIELD_BYTE_SPACE) AS FIELD_BYTES_TOT
     , SUM(IX_BYTES) AS IX_FIELD_BYTES_TOT
     , SUM(CASE WHEN IX_BYTES > 0 THEN 1 ELSE 0 END) AS IX_FIELD_COUNT
     , SUM(CASE WHEN IX_BYTES > 0 THEN 1 ELSE 0 END) * 29 AS IX_OVERHEAD_BYTES
     , (SUM(CASE WHEN IX_BYTES > 0 THEN 1 ELSE 0 END) * 29) + SUM(IX_BYTES) AS IX_BYTES_TOT
     , (SUM(PK_BYTES) + SUM(FIELD_BYTE_SPACE) + 
       (SUM(CASE WHEN IX_BYTES > 0 THEN 1 ELSE 0 END) * 29) + SUM(IX_BYTES)) * 1.4 AS TABLE_BYTES_TOT
FROM 
(
SELECT A.*
     , CASE WHEN COLUMN_KEY = 'PRI'THEN FIELD_BYTE_SPACE ELSE 0 END AS PK_BYTES
     , CASE WHEN A.COLUMN_KEY <> 'PRI'  
             AND A.COLUMN_KEY <> '' THEN (PK_BYTE_SPACE + FIELD_BYTE_SPACE) ELSE 0 END AS IX_BYTES
FROM (   
SELECT PK_SP.TABLE_SCHEMA
     , PK_SP.TABLE_NAME
     , PK_SP.COLUMN_NAME
     , DATA_TYPE      
     , CHARACTER_MAXIMUM_LENGTH 
     , NUMERIC_PRECISION
     , IS_NULLABLE
     , COLUMN_KEY
     , CHARACTER_SET_NAME
     , CHARACTER_OCTET_LENGTH
     , (CASE -- CHARACTER FIELDS
            WHEN DATA_TYPE = 'varchar'             THEN CHARACTER_MAXIMUM_LENGTH + 1 
            WHEN DATA_TYPE = 'char'                THEN CHARACTER_MAXIMUM_LENGTH 
            WHEN DATA_TYPE = 'tinyblob' 
              OR DATA_TYPE = 'tinytext'            THEN CHARACTER_MAXIMUM_LENGTH + 1 
            WHEN DATA_TYPE = 'blob' 
              OR DATA_TYPE = 'text'                THEN CHARACTER_MAXIMUM_LENGTH + 2
            WHEN DATA_TYPE = 'mediumblob' 
              OR DATA_TYPE = 'mediumtext'          THEN CHARACTER_MAXIMUM_LENGTH + 3
            WHEN DATA_TYPE = 'largeblob' 
              OR DATA_TYPE = 'largetext'           THEN CHARACTER_MAXIMUM_LENGTH + 4
            -- NUMERIC FIELDS
            WHEN DATA_TYPE = 'tinyint'             THEN 1
            WHEN DATA_TYPE = 'smallint'            THEN 2
            WHEN DATA_TYPE = 'mediumint'           THEN 3 
            WHEN DATA_TYPE = 'int' 
              OR DATA_TYPE = 'integer'             THEN 4 
            WHEN DATA_TYPE = 'bigint'              THEN 8
            WHEN DATA_TYPE = 'float' 
             AND (NUMERIC_PRECISION <= 24 
              OR NUMERIC_PRECISION IS NULL)        THEN 4 
            WHEN DATA_TYPE = 'float' 
             AND NUMERIC_PRECISION > 24            THEN 8
            WHEN DATA_TYPE = 'bit'                 THEN (NUMERIC_PRECISION + 7) / 8 
            WHEN DATA_TYPE = 'double' 
              OR DATA_TYPE = 'numeric'             THEN 
              (FLOOR(NUMERIC_PRECISION/9)*4) + ROUND((NUMERIC_PRECISION- FLOOR(NUMERIC_PRECISION/9)*9)*.5,0)
            -- DATETIME FIELDS  
            WHEN DATA_TYPE = 'date' 
              OR DATA_TYPE = 'time'                THEN 3
            WHEN DATA_TYPE = 'datetime'            THEN 8
            WHEN DATA_TYPE = 'timestamp'           THEN 4
            WHEN DATA_TYPE = 'year'                THEN 1
       ELSE 999999999999999 END) + 
       (CASE WHEN IS_NULLABLE = 'YES' THEN 1 ELSE 0 END) AS FIELD_BYTE_SPACE
     , CASE WHEN PK_BYTE_SPACE IS NULL THEN 0 ELSE PK_BYTE_SPACE END AS PK_BYTE_SPACE
  FROM information_schema.columns AS PK_SP
  LEFT OUTER JOIN
       (SELECT TABLE_SCHEMA
             , TABLE_NAME
             , SUM((CASE -- CHARACTER FIELDS
            WHEN DATA_TYPE = 'varchar'             THEN CHARACTER_MAXIMUM_LENGTH + 1 
            WHEN DATA_TYPE = 'char'                THEN CHARACTER_MAXIMUM_LENGTH 
            WHEN DATA_TYPE = 'tinyblob' 
              OR DATA_TYPE = 'tinytext'            THEN CHARACTER_MAXIMUM_LENGTH + 1 
            WHEN DATA_TYPE = 'blob' 
              OR DATA_TYPE = 'text'                THEN CHARACTER_MAXIMUM_LENGTH + 2
            WHEN DATA_TYPE = 'mediumblob' 
              OR DATA_TYPE = 'mediumtext'          THEN CHARACTER_MAXIMUM_LENGTH + 3
            WHEN DATA_TYPE = 'largeblob' 
              OR DATA_TYPE = 'largetext'           THEN CHARACTER_MAXIMUM_LENGTH + 4
            -- NUMERIC FIELDS
            WHEN DATA_TYPE = 'tinyint'             THEN 1
            WHEN DATA_TYPE = 'smallint'            THEN 2
            WHEN DATA_TYPE = 'mediumint'           THEN 3 
            WHEN DATA_TYPE = 'int' 
              OR DATA_TYPE = 'integer'             THEN 4 
            WHEN DATA_TYPE = 'bigint'              THEN 8
            WHEN DATA_TYPE = 'float' 
             AND (NUMERIC_PRECISION <= 24 
              OR NUMERIC_PRECISION IS NULL)        THEN 4 
            WHEN DATA_TYPE = 'float' 
             AND NUMERIC_PRECISION > 24            THEN 8
            WHEN DATA_TYPE = 'bit'                 THEN (NUMERIC_PRECISION + 7) / 8 
            WHEN DATA_TYPE = 'double' 
              OR DATA_TYPE = 'numeric'             THEN 
              (FLOOR(NUMERIC_PRECISION/9)*4) + ROUND((NUMERIC_PRECISION- FLOOR(NUMERIC_PRECISION/9)*9)*.5,0)
            -- DATETIME FIELDS  
            WHEN DATA_TYPE = 'date' 
              OR DATA_TYPE = 'time'                THEN 3
            WHEN DATA_TYPE = 'datetime'            THEN 8
            WHEN DATA_TYPE = 'timestamp'           THEN 4
            WHEN DATA_TYPE = 'year'                THEN 1
       ELSE 999999999999999 END) + 
       (CASE WHEN IS_NULLABLE = 'YES' THEN 1 ELSE 0 END)) AS PK_BYTE_SPACE 
        FROM information_schema.columns COL_SP
        WHERE COLUMN_KEY = 'PRI'
          AND TABLE_SCHEMA = 'PODS'
        GROUP BY TABLE_SCHEMA
               , TABLE_NAME) AS IX_SP 
     ON PK_SP.TABLE_SCHEMA = IX_SP.TABLE_SCHEMA
    AND PK_SP.TABLE_NAME = IX_SP.TABLE_NAME
  WHERE PK_SP.TABLE_SCHEMA = 'PODS') AS A
  ) AS B
  GROUP BY B.TABLE_SCHEMA
         , B.TABLE_NAME