大テーブルと小テーブルのJOINのコスト計算の話

先日MySQLアンカンファレンスでhogeさんの以下の発表を聴かせていただいた。

www.docswell.com

100万行×3行のJOINが全検索でhash joinになり、後者の行数を増やして100万行×10行にするとeq_refになる現象を紹介いただいた。自分の環境でも同様の現象を再現できて、オプティマイザトレースの内容やコスト計算の過程が非常に興味深かったので、色々調べてみたことを垂れ流してみる。

環境

mysql> select version();
+--------------+
| version()    |
+--------------+
| 8.0.28-debug |
+--------------+
1 row in set (0.00 sec)

mysql> desc mytable;
+---------+--------------+------+-----+---------+----------------+
| Field   | Type         | Null | Key | Default | Extra          |
+---------+--------------+------+-----+---------+----------------+
| id      | int          | NO   | PRI | NULL    | auto_increment |
| name    | varchar(255) | YES  |     | NULL    |                |
| type_id | int          | NO   | MUL | NULL    |                |
| value   | varchar(255) | YES  |     | NULL    |                |
+---------+--------------+------+-----+---------+----------------+
4 rows in set (0.02 sec)

mysql> desc types;
+-------+--------------+------+-----+---------+----------------+
| Field | Type         | Null | Key | Default | Extra          |
+-------+--------------+------+-----+---------+----------------+
| id    | int          | NO   | PRI | NULL    | auto_increment |
| name  | varchar(255) | YES  |     | NULL    |                |
+-------+--------------+------+-----+---------+----------------+
2 rows in set (0.01 sec)

mysql> table types;
+----+-------+
| id | name  |
+----+-------+
|  1 | typeA |
|  2 | typeB |
|  3 | typeC |
+----+-------+
3 rows in set (0.00 sec)

mysql> select count(*) from mytable;
+----------+
| count(*) |
+----------+
|   524288 |
+----------+
1 row in set (0.60 sec)

事象

 select * 
 from mytable 
 left join types on mytable.type_id = types.id;

前節の環境の元で上記クエリのEXPLAINを眺めると以下のような結果となる

mysql> explain select * from mytable left join types on mytable.type_id = types.id;
+----+-------------+---------+------------+------+---------------+------+---------+------+--------+----------+--------------------------------------------+
| id | select_type | table   | partitions | type | possible_keys | key  | key_len | ref  | rows   | filtered | Extra                                      |
+----+-------------+---------+------------+------+---------------+------+---------+------+--------+----------+--------------------------------------------+
|  1 | SIMPLE      | mytable | NULL       | ALL  | NULL          | NULL | NULL    | NULL | 522428 |   100.00 | NULL                                       |
|  1 | SIMPLE      | types   | NULL       | ALL  | PRIMARY       | NULL | NULL    | NULL |      2 |   100.00 | Using where; Using join buffer (hash join) |
+----+-------------+---------+------------+------+---------------+------+---------+------+--------+----------+--------------------------------------------+
2 rows in set, 1 warning (0.01 sec)

このようにtypesが全検索でhash joinでの結合が選択される。しかし、次のようにレコード数を少し増やすと結果はeq_refに変わる。

insert into types values (4,'typeD'),(5,'typeE'),(6,'typeF'),(7,'typeG'),(8,'typeH'),(9,'typeI'),(10,'typeJ');

mysql> table types;
+----+-------+
| id | name  |
+----+-------+
|  1 | typeA |
|  2 | typeB |
|  3 | typeC |
|  4 | typeD |
|  5 | typeE |
|  6 | typeF |
|  7 | typeG |
|  8 | typeH |
|  9 | typeI |
| 10 | typeJ |
+----+-------+
10 rows in set (0.00 sec)

mysql> explain select * from mytable left join types on mytable.type_id = types.id;
+----+-------------+---------+------------+--------+---------------+---------+---------+----------------------+--------+----------+-------+
| id | select_type | table   | partitions | type   | possible_keys | key     | key_len | ref                  | rows   | filtered | Extra |
+----+-------------+---------+------------+--------+---------------+---------+---------+----------------------+--------+----------+-------+
|  1 | SIMPLE      | mytable | NULL       | ALL    | NULL          | NULL    | NULL    | NULL                 | 522428 |   100.00 | NULL  |
|  1 | SIMPLE      | types   | NULL       | eq_ref | PRIMARY       | PRIMARY | 4       | kubo.mytable.type_id |      1 |   100.00 | NULL  |
+----+-------------+---------+------------+--------+---------------+---------+---------+----------------------+--------+----------+-------+
2 rows in set, 1 warning (0.00 sec)

では、typesテーブルのレコード数が3件のときのオプティマイザの判断をオプティマイザトレースを通して観察してみることにする。

オプティマイザトレース

mysql> SELECT * FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE\G
*************************** 1. row ***************************
                            QUERY: explain select * from mytable left join types on mytable.type_id = types.id
                            TRACE: {
  "steps": [
    {
      "join_preparation": {
        "select#": 1,
        "steps": [
          {
            "expanded_query": "/* select#1 */ select `mytable`.`id` AS `id`,`mytable`.`name` AS `name`,`mytable`.`type_id` AS `type_id`,`mytable`.`value` AS `value`,`types`.`id` AS `id`,`types`.`name` AS `name` from (`mytable` left join `types` on((`mytable`.`type_id` = `types`.`id`)))"
          },
          {
            "transformations_to_nested_joins": {
              "transformations": [
                "parenthesis_removal"
              ],
              "expanded_query": "/* select#1 */ select `mytable`.`id` AS `id`,`mytable`.`name` AS `name`,`mytable`.`type_id` AS `type_id`,`mytable`.`value` AS `value`,`types`.`id` AS `id`,`types`.`name` AS `name` from `mytable` left join `types` on((`mytable`.`type_id` = `types`.`id`))"
            }
          }
        ]
      }
    },
    {
      "join_optimization": {
        "select#": 1,
        "steps": [
          {
            "condition_processing": {
              "condition": "WHERE",
              "original_condition": null,
              "steps": [
                {
                  "transformation": "equality_propagation",
                  "resulting_condition": null
                }
              ]
            }
          },
          {
            "table_dependencies": [
              {
                "table": "`mytable`",
                "row_may_be_null": false,
                "map_bit": 0,
                "depends_on_map_bits": [
                ]
              },
              {
                "table": "`types`",
                "row_may_be_null": true,
                "map_bit": 1,
                "depends_on_map_bits": [
                  0
                ]
              }
            ]
          },
          {
            "ref_optimizer_key_uses": [
              {
                "table": "`types`",
                "field": "id",
                "equals": "`mytable`.`type_id`",
                "null_rejecting": true
              }
            ]
          },
          {
            "rows_estimation": [
              {
                "table": "`mytable`",
                "table_scan": {
                  "rows": 522423,
                  "cost": 505
                }
              },
              {
                "table": "`types`",
                "table_scan": {
                  "rows": 3,
                  "cost": 0.25
                }
              }
            ]
          },
          {
            "considered_execution_plans": [
              {
                "plan_prefix": [
                ],
                "table": "`mytable`",
                "best_access_path": {
                  "considered_access_paths": [
                    {
                      "rows_to_scan": 522423,
                      "access_type": "scan",
                      "resulting_rows": 522423,
                      "cost": 52747.3,
                      "chosen": true
                    }
                  ]
                },
                "condition_filtering_pct": 100,
                "rows_for_plan": 522423,
                "cost_for_plan": 52747.3,
                "rest_of_plan": [
                  {
                    "plan_prefix": [
                      "`mytable`"
                    ],
                    "table": "`types`",
                    "best_access_path": {
                      "considered_access_paths": [
                        {
                          "access_type": "eq_ref",
                          "index": "PRIMARY",
                          "rows": 1,
                          "cost": 182848,
                          "chosen": true,
                          "cause": "clustered_pk_chosen_by_heuristics"
                        },
                        {
                          "rows_to_scan": 3,
                          "access_type": "scan",
                          "using_join_cache": true,
                          "buffers_needed": 4092,
                          "resulting_rows": 3,
                          "cost": 157750,
                          "chosen": true
                        }
                      ]
                    },
                    "condition_filtering_pct": 100,
                    "rows_for_plan": 1.56727e+06,
                    "cost_for_plan": 210497,
                    "chosen": true
                  }
                ]
              }
            ]
          },
          {
            "condition_on_constant_tables": "true",
            "condition_value": true
          },
          {
            "attaching_conditions_to_tables": {
              "original_condition": "true",
              "attached_conditions_computation": [
                {
                  "table": "`types`",
                  "rechecking_index_usage": {
                    "recheck_reason": "not_first_table",
                    "range_analysis": {
                      "table_scan": {
                        "rows": 3,
                        "cost": 2.65
                      },
                      "potential_range_indexes": [
                        {
                          "index": "PRIMARY",
                          "usable": true,
                          "key_parts": [
                            "id"
                          ]
                        }
                      ],
                      "setup_range_conditions": [
                      ],
                      "group_index_range": {
                        "chosen": false,
                        "cause": "not_single_table"
                      },
                      "skip_scan_range": {
                        "chosen": false,
                        "cause": "not_single_table"
                      },
                      "analyzing_range_alternatives": {
                        "range_scan_alternatives": [
                          {
                            "index": "PRIMARY",
                            "chosen": false,
                            "cause": "depends_on_unread_values"
                          }
                        ],
                        "analyzing_roworder_intersect": {
                          "usable": false,
                          "cause": "too_few_roworder_scans"
                        }
                      }
                    }
                  }
                }
              ],
              "attached_conditions_summary": [
                {
                  "table": "`mytable`",
                  "attached": null
                },
                {
                  "table": "`types`",
                  "attached": "<if>(is_not_null_compl(types), (`types`.`id` = `mytable`.`type_id`), true)"
                }
              ]
            }
          },
          {
            "finalizing_table_conditions": [
              {
                "table": "`types`",
                "original_table_condition": "<if>(is_not_null_compl(types), (`types`.`id` = `mytable`.`type_id`), true)",
                "final_table_condition   ": "<if>(is_not_null_compl(types), (`types`.`id` = `mytable`.`type_id`), true)"
              }
            ]
          },
          {
            "refine_plan": [
              {
                "table": "`mytable`"
              },
              {
                "table": "`types`"
              }
            ]
          }
        ]
      }
    },
    {
      "join_explain": {
        "select#": 1,
        "steps": [
        ]
      }
    }
  ]
}
MISSING_BYTES_BEYOND_MAX_MEM_SIZE: 0
          INSUFFICIENT_PRIVILEGES: 0
1 row in set (0.02 sec)

トレース内容の前半を見ると、eq_refのコストとscanのコストが表示されており、scanのコストの方が安い様子が伺える。ここで、scanとはテーブルスキャンのことを指す。値も均衡しているので、typesのレコードを7行増やすことでこれらのコストの大小が逆転して、eq_refが選ばれるようになったのだろうと予想できる。要するに「JOIN先のテーブルはレコード少ないし、PRIMARYキーで検索するよりもテーブル全部見た方が早くねー？」と判断したと予想できる。これは一般にレコード数が少数の場合に起こりうることである。この辺りの判断過程は後のソースの節で詳しく確認する。

一方でトレース後半に、"recheck_reason": "not_first_table"という文字列が表示されている。どうやらnot_first_tableという理由で再チェックが走っている様子。何だかよくわからない。ここについては次回の記事で(多分)触れることにする。

では、トレース内容の前半の判断過程のソースを眺めていく。

ソース

コスト比較の過程

まずはJOIN::optimaizeでオプティマイズを開始。この関数のコメントにもあるが、この関数がオプティマイズフェーズの開始点となる。

bool JOIN::optimize(bool finalize_access_paths) {
  DBUG_TRACE;

  uint no_jbuf_after = UINT_MAX;
...
// Set up join order and initial access paths
  THD_STAGE_INFO(thd, stage_statistics);
  if (make_join_plan()) {
    if (thd->killed) thd->send_kill_message();
    DBUG_PRINT("error", ("Error: JOIN::make_join_plan() failed"));
    return true;
  }

上記JOIN::optimaizeがmake_join_plan()を呼び出す。この関数ではベストなJOIN順序を決定するとのこと。

bool JOIN::make_join_plan() {
...
// Choose the table order based on analysis done so far.
  if (Optimize_table_order(thd, this, nullptr).choose_table_order())
    return true;

そして、JOIN順を決定する過程で以下のbest_access_path関数を呼び出す。この関数ではその名の通り、１つ１つのテーブルに対してベストなアクセス方法を見つけ出す。オプティマイザトレースのbest_access_pathの表記の部分に関連する。

void Optimize_table_order::best_access_path(JOIN_TAB *tab,
                                            const table_map remaining_tables,
                                            const uint idx, bool disable_jbuf,
                                            const double prefix_rowcount,
                                            POSITION *pos) {
                                            
...

// The 'ref' access method with lowest cost as found by find_best_ref()
  Key_use *best_ref = nullptr;
  
// Look for the best ref access if the storage engine supports index access.
  if (tab->keyuse() != nullptr &&
      (table->file->ha_table_flags() & HA_NO_INDEX_ACCESS) == 0)
    best_ref =
        find_best_ref(tab, remaining_tables, idx, prefix_rowcount,
                      &found_condition, &ref_depend_map, &used_key_parts);
                      
  double rows_fetched = best_ref ? best_ref->fanout : DBL_MAX;
  /*
    Cost of executing the best access method prefix_rowcount
    number of times
  */
  double best_read_cost = best_ref ? best_ref->read_cost : DBL_MAX;

上記のfind_best_ref関数によってどのキーを使ったrefアクセスが最もコスト安かを計算する。typesテーブルの場合はPRIMARYでのみJOINしているので、このキーでのeq_refが最も安いと計算される。つまりこの時点でオプティマイザの中ではeq_refで行こかという空気感になっている。

しかし、その後best_access_path関数は以下のように続く。ここではrefアクセスが高コストかどうかを(1a)~(4)のヒューリスティックな条件でチェックし、テーブルスキャンやindexスキャンやrangeアクセスの可能性を探っている。これらの条件をすべて満たさない場合はテーブルスキャンやindexスキャンやrangeアクセスが候補となる。実際、mytableの場合もtypesの場合もこれらの条件は満たさないのでelse以下のソースに続く。

※ただし、全検索が選択されない場合(100万行×10行のJOIN)の場合も(1a)~(4)の条件を満たさないので、ココが今回の事象の分岐点というわけではない

/*
    Don't test table scan if it can't be better.
    Prefer key lookup if we would use the same key for scanning.

    Don't do a table scan on InnoDB tables, if we can read the used
    parts of the row from any of the used index.
    This is because table scans uses index and we would not win
    anything by using a table scan. The only exception is INDEX_MERGE
    quick select. We can not say for sure that INDEX_MERGE quick select
    is always faster than ref access. So it's necessary to check if
    ref access is more expensive.

    We do not consider index/table scan or range access if:

    1a) The best 'ref' access produces fewer records than a table scan
        (or index scan, or range acces), and
    1b) The best 'ref' executed for all partial row combinations, is
        cheaper than a single scan. The rationale for comparing

        COST(ref_per_partial_row) * E(#partial_rows)
           vs
        COST(single_scan)

        is that if join buffering is used for the scan, then scan will
        not be performed E(#partial_rows) times, but
        E(#partial_rows)/E(#partial_rows_fit_in_buffer). At this point
        in best_access_path() we don't know this ratio, but it is
        somewhere between 1 and E(#partial_rows). To avoid
        overestimating the total cost of scanning, the heuristic used
        here has to assume that the ratio is 1. A more fine-grained
        cost comparison will be done later in this function.
    (2) The best way to perform table or index scan is to use 'range' access
        using index IDX. If it is a 'tight range' scan (i.e not a loose index
        scan' or 'index merge'), then ref access on the same index will
        perform equal or better if ref access can use the same or more number
        of key parts.
    (3) See above note about InnoDB.
    (4) NOT ("FORCE INDEX(...)" is used for table and there is 'ref' access
             path, but there is no quick select)
        If the condition in the above brackets holds, then the only possible
        "table scan" access method is ALL/index (there is no quick select).
        Since we have a 'ref' access path, and FORCE INDEX instructs us to
        choose it over ALL/index, there is no need to consider a full table
        scan.
  */
if (rows_fetched < tab->found_records &&  // (1a)
      best_read_cost <= tab->read_time)     // (1b)
  {
    // "scan" means (full) index scan or (full) table scan.
    if (tab->range_scan()) {
      trace_access_scan.add_alnum("access_type", "range");
      trace_quick_description(tab->range_scan(), &thd->opt_trace);
    } else
      trace_access_scan.add_alnum("access_type", "scan");

    trace_access_scan
        .add("cost",
             tab->read_time + cost_model->row_evaluate_cost(
                                  static_cast<double>(tab->found_records)))
        .add("rows", tab->found_records)
        .add("chosen", false)
        .add_alnum("cause", "cost");
  } else if (tab->range_scan() && best_ref &&                            // (2)
             used_index(tab->range_scan()) == best_ref->key &&           // (2)
             used_key_parts >= table->quick_key_parts[best_ref->key] &&  // (2)
             tab->range_scan()->type != AccessPath::GROUP_INDEX_SKIP_SCAN &&
             tab->range_scan()->type != AccessPath::INDEX_SKIP_SCAN)  // (2)
  {
    trace_access_scan.add_alnum("access_type", "range");
    trace_quick_description(tab->range_scan(), &thd->opt_trace);
    trace_access_scan.add("chosen", false)
        .add_alnum("cause", "heuristic_index_cheaper");
  } else if ((table->file->ha_table_flags() & HA_TABLE_SCAN_ON_INDEX) &&  //(3)
             !table->covering_keys.is_clear_all() && best_ref &&          //(3)
             (!tab->range_scan() ||                                       //(3)
              (tab->range_scan()->type ==
                   AccessPath::ROWID_INTERSECTION &&             //(3)
               best_ref->read_cost < tab->range_scan()->cost)))  //(3)
  {
    if (tab->range_scan()) {
      trace_access_scan.add_alnum("access_type", "range");
      trace_quick_description(tab->range_scan(), &thd->opt_trace);
    } else
      trace_access_scan.add_alnum("access_type", "scan");

    trace_access_scan.add("chosen", false)
        .add_alnum("cause", "covering_index_better_than_full_scan");
  } else if ((table->force_index && best_ref && !tab->range_scan()))  // (4)
  {
    trace_access_scan.add_alnum("access_type", "scan")
        .add("chosen", false)
        .add_alnum("cause", "force_index");
  } else {

さてelse以降は以下のように続く。改めてだが、else以降の分岐に来たということは「refアクセスよりもテーブルスキャンやindexスキャンやrangeスキャンのコストの方が安い可能性が出た」ということである。そのため、ここではスキャンのコストを計算して、先ほどのベストなrefアクセスのコストと比較している。

ココが今回の事象の分岐点となる。具体的には以下の最後のscan_total_cost < best_read_cost + cost_model->row_evaluate_cost(prefix_rowcount * rows_fetchedのコスト比較である。この条件を満たすとscanの方がコスト安いねという判断になる。

     double scan_read_cost = calculate_scan_cost(
        tab, idx, best_ref, prefix_rowcount, found_condition, disable_jbuf,
        &rows_after_filtering, &trace_access_scan);

    /*
      We estimate the cost of evaluating WHERE clause for found
      records as row_evaluate_cost(prefix_rowcount * rows_after_filtering).
      This cost plus scan_cost gives us total cost of using
      TABLE/INDEX/RANGE SCAN.
    */
    const double scan_total_cost =
        scan_read_cost +
        cost_model->row_evaluate_cost(prefix_rowcount * rows_after_filtering);

    trace_access_scan.add("resulting_rows", rows_after_filtering);
    trace_access_scan.add("cost", scan_total_cost);

    if (best_ref == nullptr ||
        (scan_total_cost <
         best_read_cost +
             cost_model->row_evaluate_cost(prefix_rowcount * rows_fetched))) {

実際、typesレコードが3件の場合の各コストをprintすると以下のようになる（右辺と左辺がオプティマイザトレースで表示されたコストとほぼ一致することがわかる）

scan_total_cost=157749.99852886199
best_read_cost=130605.75
cost_model->row_evaluate_cost(prefix_rowcount * rows_fetched) = 522423 * 1 * 0.1

ここで、row_evaluate_cost関数は引数の値にrow_evaluate_costというサーバーコストを乗算する関数である（MySQL8.0では0.1がデフォルト。この値が高ければ高いほど行フェッチの大小によるコスト差が大きくなる）

mysql> SELECT * FROM mysql.server_cost where cost_name="row_evaluate_cost";
+-------------------+------------+---------------------+---------+---------------+
| cost_name         | cost_value | last_update         | comment | default_value |
+-------------------+------------+---------------------+---------+---------------+
| row_evaluate_cost |       NULL | 2022-08-08 07:12:45 | NULL    |           0.1 |
+-------------------+------------+---------------------+---------+---------------+
1 row in set (0.02 sec)

確かに左辺のscan_total_costの方がコストが低い。一方でtypesのレコードを10件に増やすと、右辺のbest_read_cost、prefix_rowcount、rows_fetchedは全く変わらず、scan_total_costの方がグンと高くなり、右辺を上回る様子を確認することができる。つまり、「PRIMARYキーで検索するよりもテーブル全部見た方が早くねー？」の判断から「テーブル件数多いからPRIMARYキーで検索した方が宜しくね？」に変わるのだろう。

この分岐を満たす（scanのコストの方が安いと分かる）と以下のようにbest_refがnullptrになる。つまり頑張って見つけたベストなrefアクセスが無かったことにされる。また、best_uses_jbufがtrueとなる。そしてその値がPOSITIONに詰められる。

これ以降はrecheckの項に入っていくがそれは次回（多分）

      /*
        If the table has a range (tab->quick is set) make_join_query_block()
        will ensure that this will be used
      */
      best_read_cost = scan_read_cost;
      rows_fetched = rows_after_filtering;

      if (tab->found_records) {
        /*
          Although join buffering may be used for this table, this
          filter calculation is not done to calculate the cost of join
          buffering itself (that is done inside
          calculate_scan_cost()). The is_join_buffering parameter is
          therefore 'false'.
        */
        const float full_filter = calculate_condition_filter(
            tab, nullptr, ~remaining_tables & ~excluded_tables,
            static_cast<double>(tab->found_records), false, false,
            trace_access_scan);
        filter_effect = static_cast<float>(std::min(
            1.0, tab->found_records * full_filter / rows_after_filtering));
      }
      best_ref = nullptr;
      best_uses_jbuf = !disable_jbuf;
      ref_depend_map = 0;
    }

    trace_access_scan.add("chosen", best_ref == nullptr);
    
    ...
    
  pos->filter_effect = filter_effect;
  pos->rows_fetched = rows_fetched;
  pos->read_cost = best_read_cost;
  pos->key = best_ref;
  pos->table = tab;
  pos->ref_depend_map = ref_depend_map;
  pos->loosescan_key = MAX_KEY;
  pos->use_join_buffer = best_uses_jbuf;

コスト計算

ところでscan_total_costやbest_read_cost自体はどのように計算されているのだろうか。

まずはbest_read_cost（refアクセスによる読み取りコスト）から見ていこう。振り返ってみるとbest_read_costの実態はbest_ref->read_costであり、best_refは以下のfind_best_refで構成される。この関数内のcur_read_costという値がそれである。

どうやらcur_read_costはprev_record_reads(join, idx, table_deps)とtable->cost_model()->page_read_cost(1.0)の乗算で計算されるようだ。

Key_use *Optimize_table_order::find_best_ref(
    const JOIN_TAB *tab, const table_map remaining_tables, const uint idx,
    const double prefix_rowcount, bool *found_condition,
    table_map *ref_depend_map, uint *used_key_parts) {
    
    ...
    
// Check if we found full key
      if (all_key_parts_covered && !ref_or_null_part) /* use eq key */
      {
        cur_used_keyparts = (uint)~0;
        if (keyinfo->flags & HA_NOSAME &&
            ((keyinfo->flags & HA_NULL_PART_KEY) == 0 ||
             all_key_parts_non_null)) {
          cur_read_cost = prev_record_reads(join, idx, table_deps) *
                          table->cost_model()->page_read_cost(1.0);

prev_record_readsは以下のようになっており、返り値は結合先のテーブルに結合される行数を表す。すなわちJOIN元のテーブルであるmytableのフェッチした行数にfileteredの値を乗算したものである。

static double prev_record_reads(JOIN *join, uint idx, table_map found_ref) {
  double found = 1.0;
  POSITION *pos_end = join->positions - 1;
  for (POSITION *pos = join->positions + idx - 1; pos != pos_end; pos--) {
    const double fanout = pos->rows_fetched * pos->filter_effect;
    if (pos->table->table_ref->map() & found_ref) {
      found_ref |= pos->ref_depend_map;
      /*
        For the case of "t1 LEFT JOIN t2 ON ..." where t2 is a const table
        with no matching row we will get position[t2].rows_fetched==0.
        Actually the size of output is one null-complemented row, therefore
        we will use value of 1 whenever we get rows_fetched==0.

        Note
        - the above case can't occur if inner part of outer join has more
          than one table: table with no matches will not be marked as const.

        - Ideally we should add 1 to rows_fetched for every possible null-
          complemented row. We're not doing it because: 1. it will require
          non-trivial code and add overhead. 2. The value of rows_fetched
          is an inprecise estimate and adding 1 (or, in the worst case,
          #max_nested_outer_joins=64-1) will not make it any more precise.
      */
      if (pos->rows_fetched > DBL_EPSILON) found *= fanout;
    } else if (fanout < 1.0) {
      /*
        With condition filtering it is possible that a table has a
        lower fanout than 1.0. If so, calculate the fanout of this
        table into the found rows estimate so the produced number is
        not too pessimistic. Otherwise, the expected number of row
        combinations returned by this function may be higher than the
        prefix_rowcount for the table. See BUG#18352936
      */
      found *= fanout;
    }
  }
  return found;
}

page_read_costは以下のようになっており、buffer_block_read_cost(pages_in_mem)とio_block_read_cost(pages_on_disk)の和が返り値となっている。

ここでin_memはメモリ内のページ数と総ページ数の比率の推定値を表す。よってpages_in_memはメモリ内のページ数の推定値、pages_on_diskはディスク内のページ数の推定値を表すことになる。

また、buffer_block_read_cost関数とio_block_read_cost関数はそれぞれbuffer_block_read_cost、io_block_read_costという名のエンジンコストを乗算する関数である。それぞれメモリやディスクからインデックスやデータページを読み取る定数コストを表す。

つまり、page_read_costの返り値は「(メモリ内のページ数×メモリから読み出す定数コスト)+(ディスク内ページ数×ディスクから読み出す定数コスト)」を表すことが分かる。

mysql> SELECT * FROM mysql.engine_cost;
+-------------+-------------+------------------------+------------+---------------------+---------+---------------+
| engine_name | device_type | cost_name              | cost_value | last_update         | comment | default_value |
+-------------+-------------+------------------------+------------+---------------------+---------+---------------+
| default     |           0 | io_block_read_cost     |       NULL | 2022-08-08 07:12:45 | NULL    |             1 |
| default     |           0 | memory_block_read_cost |       NULL | 2022-08-08 07:12:45 | NULL    |          0.25 |
+-------------+-------------+------------------------+------------+---------------------+---------+---------------+
2 rows in set (0.00 sec)

double Cost_model_table::page_read_cost(double pages) const {
  assert(m_initialized);
  assert(pages >= 0.0);

  const double in_mem = m_table->file->table_in_memory_estimate();

  const double pages_in_mem = pages * in_mem;
  const double pages_on_disk = pages - pages_in_mem;
  assert(pages_on_disk >= 0.0);

  const double cost =
      buffer_block_read_cost(pages_in_mem) + io_block_read_cost(pages_on_disk);

  return cost;
}

よってbest_read_costは「JOIN先に結合する行数×{(メモリ内のページ数×メモリから読み出す定数コスト)+(ディスク内ページ数×ディスクから読み出す定数コスト)}」で表されることが分かる。

次にscan_total_cost（scanのコスト）を見て行こう。振り返ってみるとscan_total_costはscan_total_cost = scan_read_cost + cost_model->row_evaluate_cost(prefix_rowcount * rows_after_filtering)で表される。後者のrow_evaluate_costは先ほども見たようにサーバーコストで0.1の定数である。prefix_rowcountはJOIN先に結合する行数（今回で言うとmytableの全行数）、rows_after_filteringはJOIN先のテーブルのフィルタリング後の行数（今回で言うとtypesの全行数）である。typesの行数を3から10に増やせば確かにscan_total_costは格段に増えそうである。

なお、scan_read_costは下記の関数の返り値であるscan_and_filter_costが該当する。このscan_and_filter_costはscan_and_filter_cost = buffer_count *(single_scan_read_cost + cost_model->row_evaluate_cost( tab->records() - *rows_after_filtering))という式で構成されている。

一つ一つの値を見ていこうと思ったが、なかなか構成要素が多くて意味合いを読み取るのに苦しんでしまった。そのためソースコメントから察することにした。どうやら「ストレージエンジンから行を取得するコストに加えて、クエリ条件によってフィルタリングされると見積もられる行のCPU実行コスト」ということらしい。

double Optimize_table_order::calculate_scan_cost(
    const JOIN_TAB *tab, const uint idx, const Key_use *best_ref,
    const double prefix_rowcount, const bool found_condition,
    const bool disable_jbuf, double *rows_after_filtering,
    Opt_trace_object *trace_access_scan) {
    
    ...
    
    else
      scan_cost = table->file->table_scan_cost();  // table scan
    const double single_scan_read_cost = scan_cost.total_cost();
    
    ...
    
      const double buffer_count =
          1.0 + ((double)cache_record_length(join, idx) * prefix_rowcount /
                 (double)thd->variables.join_buff_size);

      scan_and_filter_cost =
          buffer_count *
          (single_scan_read_cost + cost_model->row_evaluate_cost(
                                       tab->records() - *rows_after_filtering));

次回（多分）はこれ以降のソースを見てみることにする。

amamanamam

データベースと仲良くなりたいです

大テーブルと小テーブルのJOINのコスト計算の話

環境

事象

オプティマイザトレース

ソース

コスト比較の過程

コスト計算