2023-10-12

INSERT...SELETC でauto_increment値が飛ぶ話

MySQL

環境

CREATE TABLE `t2` (
  `id` int NOT NULL,
  `id2` int NOT NULL AUTO_INCREMENT,
  PRIMARY KEY (`id`),
  KEY `id2` (`id2`)
)

mysql> select version();
+--------------+
| version()    |
+--------------+
| 8.0.28-debug |
+--------------+
1 row in set (0.00 sec)

mysql> select @@auto_increment_increment;
+----------------------------+
| @@auto_increment_increment |
+----------------------------+
|                          1 |
+----------------------------+
1 row in set (0.00 sec)

mysql> select @@auto_increment_offset;
+-------------------------+
| @@auto_increment_offset |
+-------------------------+
|                       1 |
+-------------------------+
1 row in set (0.00 sec)

mysql> select @@innodb_autoinc_lock_mode;
+----------------------------+
| @@innodb_autoinc_lock_mode |
+----------------------------+
|                          2 |
+----------------------------+
1 row in set (0.01 sec)

※innodb_autoinc_lock_mode=1でも同様のことは起こる

事象

insert...selectでテストデータ増やしてたらauto_increment値が飛び飛びの値となった。

mysql> insert into t2(id) values(1);
Query OK, 1 row affected (0.02 sec)

--①insert...selectで順に(2),(3),(4)の行を挿入
mysql> insert into t2(id) select id+1 from t2;
Query OK, 1 row affected (0.02 sec)
Records: 1  Duplicates: 0  Warnings: 0

mysql> insert into t2(id) select id+2 from t2;
Query OK, 2 rows affected (0.02 sec)
Records: 2  Duplicates: 0  Warnings: 0

--②想定通りの挿入結果
mysql> select * from t2;
+----+-----+
| id | id2 |
+----+-----+
|  1 |   1 |
|  2 |   2 |
|  3 |   3 |
|  4 |   4 |
+----+-----+
4 rows in set (0.02 sec)

--③同様に(5),(6),(7),(8)を挿入
mysql> insert into t3(id) select id+4 from t3;

↓
--こうなって欲しい
+----+-----+
| id | id2 |
+----+-----+
|  1 |   1 |
|  2 |   2 |
|  3 |   3 |
|  4 |   4 |
|  5 |   5 |
|  6 |   6 |
|  7 |   7 |
|  8 |   8 |
+----+-----+

--こうなる
mysql> select * from t3;
+----+-----+
| id | id2 |
+----+-----+
|  1 |   1 |
|  2 |   2 |
|  3 |   3 |
|  4 |   4 |
|  5 |   6 |
|  6 |   7 |
|  7 |   8 |
|  8 |   9 |
+----+-----+
8 rows in set (0.00 sec)

なお、①,③の部分をinsert...valuesで直接値を指定するとこのような事象は起きない。

なんでこうなる

insert...valuesは直接値を指定するが故に、事前に挿入行数が分かっている挿入となるが、一方でinsert...selectは事前に挿入行数が分からないために見積もりがずれてしまう。そのためauto_increment値の計算に齟齬が発生してしまう。

どんな計算をしていてどのような理由でズレが生じるのか次章から見ていく。

ちなみにズレが生じるポイントは③ではなく②の時点から起きている。正常と思われていた②の状態でschema_auto_increment_columnsを確認すると、以下のようにauto_increment値が6になってしまっている。これはカラムの「現在の auto_increment値」であり、本来は5になって欲しい値である。

すなわち、このauto_increment値が6の状態で③を行うことにより、次にinsertされる行のauto_increment値が6になってしまうのである。

mysql> select * from sys.schema_auto_increment_columns where table_name = 't2'\G
*************************** 1. row ***************************
        table_schema: kubo
          table_name: t2
         column_name: id2
           data_type: int
         column_type: int
           is_signed: 1
         is_unsigned: 0
           max_value: 2147483647
      auto_increment: 6
auto_increment_ratio: 0.0000
1 row in set (0.02 sec)

ソース

現在のauto_increment値を読み出している部分
現在のauto_increment値を更新している部分

を順に見ていく。その前にこれらのパートを呼び出している大元のメソッドについてざっくり触れる

update_auto_increment

auto_incrementに関するアレコレは以下のupdate_auto_incrementメソッドでやりくりされる。

このメソッドをざっくり理解するために以降は以下のような状況を例として用いる

auto_increment_increment＝5（つまり5ずつ増える）
auto_increment_offset＝10（つまり開始位置が10）
各行のauto_increment値が隙間なく95まで埋まっている
insert...valueでauto_increment値が100,105,110が埋まる予定である

まずauto_increment値が100の行を挿入する時にこのメソッドが呼び出される。

この時点ではnext_insert_idもauto_inc_interval_for_cur_row.minimum()も０のため以下のアサーションが通る。ちなみに、このメソッドの後半でこれらの値が付与されて、次のauto_increment値が105の行を挿入する際に参照される。

int handler::update_auto_increment() {
  ulonglong nr, nb_reserved_values = 0;
  bool append = false;
  THD *thd = table->in_use;
  struct System_variables *variables = &thd->variables;
  assert(table_share->tmp_table != NO_TMP_TABLE || m_lock_type != F_UNLCK);
  DBUG_TRACE;

  /*
    next_insert_id is a "cursor" into the reserved interval, it may go greater
    than the interval, but not smaller.
  */
  assert(next_insert_id >= auto_inc_interval_for_cur_row.minimum());

同様に続きの条件式もtrueとなる。同じく次のauto_increment値が105の行を挿入する際にはこれらの条件式が活きてくる。

if ((nr = next_insert_id) >= auto_inc_interval_for_cur_row.maximum()) {

次に上記のifにネストしている部分では、auto_inc_intervals_countとestimation_rows_to_insertの評価がされる。auto_inc_intervals_countは予約されたauto_incrementの間隔である。これはまだ予約を行っていないので0になる。また、estimation_rows_to_insertは見積もられた挿入行数である。insert...valuesの場合はすでに挿入行数がわかっているので、正の数が入る。一方でinsert...selectの場合はこの値は0である。

今回の例の場合はestimation_rows_to_insertは3となり条件式はtrueとなる。そしてnb_desired_valuesにestimation_rows_to_insertの値が入る。変数名から察するにestimation_rows_to_insertの数だけauto_increment値が入ると期待されているということである。

if ((auto_inc_intervals_count == 0) && (estimation_rows_to_insert > 0))
        nb_desired_values = estimation_rows_to_insert;
      else if ((auto_inc_intervals_count == 0) &&
               (thd->lex->bulk_insert_row_cnt > 0)) {
        /*
          For multi-row inserts, if the bulk inserts cannot be started, the
          handler::estimation_rows_to_insert will not be set. But we still
          want to reserve the autoinc values.
        */
        nb_desired_values = thd->lex->bulk_insert_row_cnt;
      } else /* go with the increasing defaults */
      {
        /* avoid overflow in formula, with this if() */
        if (auto_inc_intervals_count <= AUTO_INC_DEFAULT_NB_MAX_BITS) {
          nb_desired_values =
              AUTO_INC_DEFAULT_NB_ROWS * (1 << auto_inc_intervals_count);
          nb_desired_values =
              std::min(nb_desired_values, ulonglong(AUTO_INC_DEFAULT_NB_MAX));
        } else
          nb_desired_values = AUTO_INC_DEFAULT_NB_MAX;
      }

続けてif ((nr = next_insert_id) >= auto_inc_interval_for_cur_row.maximum())のネストしている部分では、以下のようにget_auto_incrementを呼び出している。内部では特にschema_auto_increment_columnsのauto_increment値を参照して、今回入れるべきauto_increment値、次の行または次のinsertの機会に入れるべきauto_increment値を計算をしている。

get_auto_increment(variables->auto_increment_offset,
                         variables->auto_increment_increment, nb_desired_values,
                         &nr, &nb_reserved_values);
      if (nr == ULLONG_MAX) return HA_ERR_AUTOINC_READ_FAILED;  // Mark failure

次に予約されたauto_increment範囲や次の行または次のinsertの機会に入れるべきauto_increment値がauto_inc_interval_for_cur_rowに格納される。またnext_insert_idにも値が格納される。これらは次の行（auto_increment値が105,110の行を挿入する時）に関する評価の際に用いられる。

if (append) {
    auto_inc_interval_for_cur_row.replace(nr, nb_reserved_values,
                                          variables->auto_increment_increment);
    auto_inc_intervals_count++;
    /* Row-based replication does not need to store intervals in binlog */
    if (mysql_bin_log.is_open() && !thd->is_current_stmt_binlog_format_row())
      thd->auto_inc_intervals_in_cur_stmt_for_binlog.append(
          auto_inc_interval_for_cur_row.minimum(),
          auto_inc_interval_for_cur_row.values(),
          variables->auto_increment_increment);
  }

  /*
    Record this autogenerated value. If the caller then
    succeeds to insert this value, it will call
    record_first_successful_insert_id_in_cur_stmt()
    which will set first_successful_insert_id_in_cur_stmt if it's not
    already set.
  */
  insert_id_for_cur_row = nr;
  /*
    Set next insert id to point to next auto-increment value to be able to
    handle multi-row statements.
  */
  set_next_insert_id(compute_next_insert_id(nr, variables));

  return 0;

現在のauto_increment値を読み出している部分

では、まず現在のauto_increment値を読み出しているソースを見てみる

以下がauto_increment値をディクショナリから読み出している部分となる。つまりここで次に入れるべきauto_increment値が決まる。

/** Reads the next autoinc value (== autoinc counter value), 0 if not yet
 initialized.
 @return value for a new row, or 0 */
ib_uint64_t dict_table_autoinc_read(const dict_table_t *table) /*!< in: table */
{
  ut_ad(dict_table_autoinc_own(table));

  return (table->autoinc);
}

これを呼び出している箇所は以下である。

/** Read the next autoinc value. Acquire the relevant locks before reading
 the AUTOINC value. If SUCCESS then the table AUTOINC mutex will be locked
 on return and all relevant locks acquired.
 @return DB_SUCCESS or error code */

dberr_t ha_innobase::innobase_get_autoinc(
    ulonglong *value) /*!< out: autoinc value */
{
  *value = 0;

  m_prebuilt->autoinc_error = innobase_lock_autoinc();

  if (m_prebuilt->autoinc_error == DB_SUCCESS) {
    /* Determine the first value of the interval */
    *value = dict_table_autoinc_read(m_prebuilt->table);

    /* It should have been initialized during open. */
    if (*value == 0) {
      m_prebuilt->autoinc_error = DB_UNSUPPORTED;
      dict_table_autoinc_unlock(m_prebuilt->table);
    }
  }

  return (m_prebuilt->autoinc_error);
}

ここでm_prebuiltはrow_prebuilt_tという型であり、テーブルハンドラに予め用意された関連情報を表す。これによって、必要になった際にディクショナリを読みに行くなどしなくても済む。これはテーブルオープン時に構成される。

そして、このメソッドは先ほど紹介したha_innobase::get_auto_incrementから呼び出される。参考までに、このメソッドを用いている位置までのバックトレースは以下である。

ha_innobase::innobase_get_autoinc(ha_innobase * const this, ulonglong * value) (/mysql-8.0.28/storage/innobase/handler/ha_innodb.cc:19357)
ha_innobase::get_auto_increment(ha_innobase * const this, ulonglong offset, ulonglong increment, ulonglong nb_desired_values, ulonglong * first_value, ulonglong * nb_reserved_values) (/mysql-8.0.28/storage/innobase/handler/ha_innodb.cc:19400)
handler::update_auto_increment(handler * const this) (/mysql-8.0.28/sql/handler.cc:3815)
ha_innobase::write_row(ha_innobase * const this, uchar * record) (/mysql-8.0.28/storage/innobase/handler/ha_innodb.cc:8880)
handler::ha_write_row(handler * const this, uchar * buf) (/mysql-8.0.28/sql/handler.cc:7937)
write_record(THD * thd, TABLE * table, COPY_INFO * info, COPY_INFO * update) (/mysql-8.0.28/sql/sql_insert.cc:2165)
Query_result_insert::send_data(Query_result_insert * const this, THD * thd, const mem_root_deque<Item*> & values) (/mysql-8.0.28/sql/sql_insert.cc:2400)
Query_expression::ExecuteIteratorQuery(Query_expression * const this, THD * thd) (/mysql-8.0.28/sql/sql_union.cc:1315)
Query_expression::execute(Query_expression * const this, THD * thd) (/mysql-8.0.28/sql/sql_union.cc:1353)
Sql_cmd_dml::execute_inner(Sql_cmd_dml * const this, THD * thd) (/mysql-8.0.28/sql/sql_select.cc:781)
Sql_cmd_dml::execute(Sql_cmd_dml * const this, THD * thd) (/mysql-8.0.28/sql/sql_select.cc:581)
mysql_execute_command(THD * thd, bool first_level) (/mysql-8.0.28/sql/sql_parse.cc:3553)
dispatch_sql_command(THD * thd, Parser_state * parser_state) (/mysql-8.0.28/sql/sql_parse.cc:5174)
dispatch_command(THD * thd, const COM_DATA * com_data, enum_server_command command) (/mysql-8.0.28/sql/sql_parse.cc:1938)
do_command(THD * thd) (/mysql-8.0.28/sql/sql_parse.cc:1352)
handle_connection(void * arg) (/mysql-8.0.28/sql/conn_handler/connection_handler_per_thread.cc:302)
pfs_spawn_thread(void * arg) (/mysql-8.0.28/storage/perfschema/pfs.cc:2947)
libpthread.so.0!start_thread(void * arg) (/build/glibc-SzIz7B/glibc-2.31/nptl/pthread_create.c:477)
libc.so.6!clone() (/build/glibc-SzIz7B/glibc-2.31/sysdeps/unix/sysv/linux/x86_64/clone.S:95)

現在のauto_increment値を更新している部分

最後に、現在のauto_increment値を更新しているソースを調べる。

先程のauto_increment値を読み出している部分よりも後になるはずので、innobase_get_autoincを呼び出した後をじっくり見ると、以下のように次の「現在のauto_increment値」を計算して（innobase_next_autoinc）更新する（dict_table_autoinc_update_if_greater）ようなソースが確認できる。

※dict_table_autoinc_update_if_greaterの方のソースは紹介しないが、メソッド内ではm_prebuilt->table->autoincの値を更新している様子が確認できる。今回の場合で言えば、innobase_next_autoincの計算結果に更新している。

void ha_innobase::get_auto_increment(
    ulonglong offset,              /*!< in: table autoinc offset */
    ulonglong increment,           /*!< in: table autoinc
                                   increment */
    ulonglong nb_desired_values,   /*!< in: number of values
                                   reqd */
    ulonglong *first_value,        /*!< out: the autoinc value */
    ulonglong *nb_reserved_values) /*!< out: count of reserved
                                   values */
{
  trx_t *trx;
  dberr_t error;
  ulonglong autoinc = 0;

  /* Prepare m_prebuilt->trx in the table handle */
  update_thd(ha_thd());

  error = innobase_get_autoinc(&autoinc);
  
  ...
  ...
  
/* Compute the last value in the interval */
    next_value = innobase_next_autoinc(current, *nb_reserved_values, increment,
                                       offset, col_max_value);

    m_prebuilt->autoinc_last_value = next_value;

    if (m_prebuilt->autoinc_last_value < *first_value) {
      *first_value = (~(ulonglong)0);
    } else {
      /* Update the table autoinc variable */
      dict_table_autoinc_update_if_greater(m_prebuilt->table,
                                           m_prebuilt->autoinc_last_value);
    }

schema_auto_increment_columnsのauto_increment値が想定と異なるのはおそらくinnobase_next_autoincでの計算によるものと推察できる。

計算の中身を除いてみる。

/** Compute the next autoinc value.

 For MySQL replication the autoincrement values can be partitioned among
 the nodes. The offset is the start or origin of the autoincrement value
 for a particular node. For n nodes the increment will be n and the offset
 will be in the interval [1, n]. The formula tries to allocate the next
 value for a particular node.

 Note: This function is also called with increment set to the number of
 values we want to reserve for multi-value inserts e.g.,

         INSERT INTO T VALUES(), (), ();

 innobase_next_autoinc() will be called with increment set to 3 where
 autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for
 the multi-value INSERT above.
 @return the next value */
ulonglong innobase_next_autoinc(
    ulonglong current,   /*!< in: Current value */
    ulonglong need,      /*!< in: count of values needed */
    ulonglong step,      /*!< in: AUTOINC increment step */
    ulonglong offset,    /*!< in: AUTOINC offset */
    ulonglong max_value) /*!< in: max value for type */
{
  ulonglong next_value;
  ulonglong block = need * step;

  /* Should never be 0. */
  ut_a(need > 0);
  ut_a(block > 0);
  ut_a(max_value > 0);

  /* According to MySQL documentation, if the offset is greater than
  the step then the offset is ignored. */
  if (offset > block) {
    offset = 0;
  }

  /* Check for overflow. Current can be > max_value if the value is
  in reality a negative value.The visual studio compilers converts
  large double values automatically into unsigned long long datatype
  maximum value */

  if (block >= max_value || offset > max_value || current >= max_value ||
      max_value - offset <= offset) {
    next_value = max_value;
  } else {
    ut_a(max_value > current);

    ulonglong free = max_value - current;

    if (free < offset || free - offset <= block) {
      next_value = max_value;
    } else {
      next_value = 0;
    }
  }

  if (next_value == 0) {
    ulonglong next;

    if (current > offset) {
      next = (current - offset) / step;
    } else {
      next = (offset - current) / step;
    }

    ut_a(max_value > next);
    next_value = next * step;
    /* Check for multiplication overflow. */
    ut_a(next_value >= next);
    ut_a(max_value > next_value);

    /* Check for overflow */
    if (max_value - next_value >= block) {
      next_value += step;

      if (max_value - next_value >= offset) {
        next_value += offset;
      } else {
        next_value = max_value;
      }
    } else {
      next_value = max_value;
    }
  }

  ut_a(next_value != 0);
  ut_a(next_value <= max_value);

  return (next_value);
}

いくつか変数があるので整理する。多分こんな感じ

current　＝　現在のauto_increment値
next_value　＝　次の「現在のauto_increment値」
need　＝　挿入予定のauto_increment値が何個あるか
step　＝　いくつずつauto_increment値が増えるか
offset　＝　いくつからauto_increment値が増えているか
max_value　＝　auto_incrementの最大限界値
block　＝ need*step

ちなみにこの計算ロジックはinsert....selectの場合は挿入する行数だけ呼びだされる。挿入行数が事前に分かるinsert...valuesの場合は一度しか呼ばれない（事前に分かっていれば次の「現在のauto_increment値」の計算は１回で十分ということ）。

さて、注目すべきは後半の部分。まずは

next = (current - offset) / step;
...
next_value = next * step;

でnext_valueが現在のauto_increment値と開始値の差分となる

そして

/ Check for overflow /
    if (max_value - next_value >= block) {
      next_value += block;

  <span class="synStatement">if</span> (max_value - next_value &gt;= offset) {
    next_value += offset;
  } <span class="synStatement">else</span> {
    next_value = max_value;
  }
} <span class="synStatement">else</span> {
  next_value = max_value;
}

その差分にblockとoffsetを足し合わせる。

何が起きてるのか分かりにくいので、insert...valueとinsert...selectの場合でそれぞれ例を見てみる。以下の条件でデバッグ実行して変数を覗き見した結果をもとに説明する。

auto_increment_increment＝1（つまり1ずつ増える）
auto_increment_offset＝1（つまり開始位置が1）
各行のauto_increment値が隙間なく2まで埋まっている
insert...valueもしくはinsert...selectでauto_increment値が3,4が埋まる予定である

insert...select

	1 回目	２回目
current	3	4
need	1	2
step	1	1
offset	1	1
max_value	2147483647	2147483647
block	1	2
next_value	4	6

insert...selectの場合この計算ロジックを２回通る。１回目はauto_increment値が3となる行にカーソルが当たり実行される。この時、以下の計算により

next = (current - offset) / step;
...
next_value = next * step;

next_valueが3-1=2となる。つまり開始位置と挿入しようとしている位置との距離が2となる。そして以下の計算により

/* Check for overflow */
    if (max_value - next_value >= block) {
      next_value += block;

      if (max_value - next_value >= offset) {
        next_value += offset;
      } else {
        next_value = max_value;
      }
    } else {
      next_value = max_value;
    }

next_valueが2+1+1=4となる。つまり開始位置に距離とblock（「何個のauto_increment値を挿入予定か」と「いくつずつauto_incrementが増えるか」の積）を足すことで、次の「現在のauto_increment値」を表している。

２回目はauto_increment値が4となる行にカーソルが当たり実行される。同様の計算により、次の「現在のauto_increment値」が6となってしまう。これはblockが1でなく2、特にneedが2になってしまうことに由来する。つまり、行数が正しく見積もられていない事（挿入行数が事前に分からない事）に由来している。

※初めはblockを足さずにstepを足すのが妥当かと思われたが、そのようにすると次のinsert...valueでの計算が拗れる

insert...value

	1 回目	２回目
current	3	-
need	2	-
step	1	-
offset	1	-
max_value	2147483647	-
block	2	-
next_value	5	-

insert...valueの場合はこの計算ロジックは1回しか通らない。挿入行数が事前に分かっているので、1回目の時点でneedが2になっていることが分かる。そのため。この1回の計算ロジックで次回の「現在のauto_increment値」が計算できている。

ちなみに

同じような現象を示したMySQL Bugsがある

MySQL Bugs: #57643: InnoDB skips auto_increment values with LOAD DATA INFILE

MySQL Bugs: #70692: auto_increment wrong value when using insert into... select from

そもそもauto_incrementは連番を保証したものではないのでバグではないよとのコメントがあったり

I suggest that it is "not a bug". I believe that there is nothing saying that AUTO_INCREMENT values must avoid wasting ids. Sure, it is annoying.

There are many other cases of 'burned' ids: * Rolled back Transaction. * INSERT IGNORE with multiple rows (and other INSERTs/REPLACEs).

An un-burned id that is annoying in a different direction: 1. DELETE the row with MAX(id) 2. Restart the server 3. Now that id will be reused (InnoDB only?).

そういう事前に挿入行数がわからない場合は大きめのインターバルを予約する仕組みなんだよーといったコメントがあったりした。なるほど〜

According to the source code https://github.com/mysql/mysql-server/blob/8.0/sql/handler.cc#L3820, for statements like INSERT INTO ... SELECT, where the exact number of inserted rows is unknown in advance, the SQL handler reserves intervals of auto-increment values while processing the records, every time reserving twice as larger an interval as the previous one as soon as the number of rows reaches the highest value of the previous interval. For example, when inserting 4 records, first, an interval of 1 gets reserved, afterwards an interval of 2, then 4, and in the end 8. In this particular case, the next auto-increment value is going to be 8 instead of 5, and as a result, a gap between 4 and 8 appears. With larger numbers, the gap gets much more significant, for instance, while processing data in bulk say by 5000, the next auto-increment will be the current column value + 8192 instead of the current column value + 5000 + 1. As a workaround to this, data could be processed in batches equal to the value of 2ⁿ-1. However, even with this, gaps can't be avoided at all but at least could be reduced.

2023-10-05

接続要求を受けたときのメインスレッドの働きの話

MySQL

クライアントから接続要求〜ユーザスレッド作成までのmysqldの御働きをバックトレースを見ながら探検する。ついでに接続待機している部分にも寄り道する。なお、スレッドキャッシュを空にするため、mysqldを再起動してから検証をする。

環境

mysql> select version();
+--------------+
| version()    |
+--------------+
| 8.0.28-debug |
+--------------+
1 row in set (0.00 sec)

接続要求が来るまでの間

クライアントから接続要求がやって来る以前のmysqldのバックトレース

Connection_acceptor<Mysqld_socket_listener>::connection_event_loop(Connection_acceptor<Mysqld_socket_listener> * const this) (/mysql-8.0.28/sql/conn_handler/connection_acceptor.h:65)
mysqld_main(int argc, char ** argv) (/mysql-8.0.28/sql/mysqld.cc:7903)
main(int argc, char ** argv) (/mysql-8.0.28/sql/main.cc:25)

mysqldはクライアントから接続要求を受けるまでは接続待機をしている. 以下のconnection_event_loopでConnection_handler_managerというシングルトンの接続管理クラスを作成した後に、listen_for_connection_event で接続を待ち受ける。具体的にはpollがコールされる（タイムアウト値が-1になっていることから接続を受けるまで待機していることがわかる）

/**
    Connection acceptor loop to accept connections from clients.
  */
  void connection_event_loop() {
    Connection_handler_manager *mgr =
        Connection_handler_manager::get_instance();
    while (!connection_events_loop_aborted()) {
      Channel_info *channel_info = m_listener->listen_for_connection_event();
      if (channel_info != nullptr) mgr->process_new_connection(channel_info);
    }
  }

Channel_info *Mysqld_socket_listener::listen_for_connection_event() {
#ifdef HAVE_POLL
  int retval = poll(&m_poll_info.m_fds[0], m_socket_vector.size(), -1);
#else
  m_select_info.m_read_fds = m_select_info.m_client_fds;
  int retval = select((int)m_select_info.m_max_used_connection,
                      &m_select_info.m_read_fds, 0, 0, 0);
#endif
...

接続要求が来た後

適当なところでブレークポイントを置いて、接続要求があった際のデバックトレースが以下になる

my_thread_create(my_thread_handle * thread, const my_thread_attr_t * attr, my_start_routine func, void * arg) (/mysql-8.0.28/mysys/my_thread.cc:78)
pfs_spawn_thread_vc(PSI_thread_key key, PSI_thread_seqnum seqnum, my_thread_handle * thread, const my_thread_attr_t * attr, void *(*)(void *) start_routine, void * arg) (/mysql-8.0.28/storage/perfschema/pfs.cc:2998)
inline_mysql_thread_create(PSI_thread_key key, unsigned int sequence_number, my_thread_handle * thread, const my_thread_attr_t * attr, my_start_routine start_routine, void * arg) (/mysql-8.0.28/include/mysql/psi/mysql_thread.h:139)
Per_thread_connection_handler::add_connection(Per_thread_connection_handler * const this, Channel_info * channel_info) (/mysql-8.0.28/sql/conn_handler/connection_handler_per_thread.cc:411)
Connection_handler_manager::process_new_connection(Connection_handler_manager * const this, Channel_info * channel_info) (/mysql-8.0.28/sql/conn_handler/connection_handler_manager.cc:259)
Connection_acceptor<Mysqld_socket_listener>::connection_event_loop(Connection_acceptor<Mysqld_socket_listener> * const this) (/mysql-8.0.28/sql/conn_handler/connection_acceptor.h:65)
mysqld_main(int argc, char ** argv) (/mysql-8.0.28/sql/mysqld.cc:7903)
main(int argc, char ** argv) (/mysql-8.0.28/sql/main.cc:25)

Connection_handler_manager::process_new_connection

接続要求を受けた後はChannel_infoというクライアントからの接続情報が作成される（これはAbstractであり具体的にはChannel_info_tcpip_socketやChannel_info_local_socketなど）

そして、予め作成されていたConnection_handler_managerからprocess_new_connectionメソッドが呼び出される。max_connectionsを超えていないか等チェックしてadd_connectionを呼び出す（CONNECTION_ADMIN権限を持つユーザであればこのチェックはスルーされる）。connection_events_loop_abortedはよく分からない。

void Connection_handler_manager::process_new_connection(
    Channel_info *channel_info) {
  if (connection_events_loop_aborted() ||
      !check_and_incr_conn_count(channel_info->is_admin_connection())) {
    channel_info->send_error_and_close_channel(ER_CON_COUNT_ERROR, 0, true);
    delete channel_info;
    return;
  }

  if (m_connection_handler->add_connection(channel_info)) {
    inc_aborted_connects();
    delete channel_info;
  }
}

なお、m_connection_handlerメンバの型はConnection_handlerという接続処理のAbstractクラスである。これはConnectionHandlerManagerインスタンス作成時のinitでthread_handlingの値を見て作成される。

今回の場合はthread_handling=one-thread-per-connectionなので具体的にはPer_thread_connection_handler インスタンスとなる

bool Connection_handler_manager::init() {
  /*
    This is a static member function.
    Per_thread_connection_handler's static members need to be initialized
    even if One_thread_connection_handler is used instead.
  */
  Per_thread_connection_handler::init();

  Connection_handler *connection_handler = nullptr;
  switch (Connection_handler_manager::thread_handling) {
    case SCHEDULER_ONE_THREAD_PER_CONNECTION:
      connection_handler = new (std::nothrow) Per_thread_connection_handler();
      break;
    case SCHEDULER_NO_THREADS:
      connection_handler = new (std::nothrow) One_thread_connection_handler();
      break;
    default:
      assert(false);
  }
...

Per_thread_connection_handler::add_connection

add_connectionではcheck_idle_thread_and_enqueue_connectionが初めに呼び出される。ここではスレッドキャッシュの確認をしている。もしアイドル状態のスレッドがあればそれ利用する。今回の場合はmysqld再起動後に動かしているのでこの部分はスルーされる。

その後、mysql_thread_createを呼び出してスレッドの作成に取り掛かる。

bool Per_thread_connection_handler::add_connection(Channel_info *channel_info) {
  int error = 0;
  my_thread_handle id;

  DBUG_TRACE;

  // Simulate thread creation for test case before we check thread cache
  DBUG_EXECUTE_IF("fail_thread_create", error = 1; goto handle_error;);

  if (!check_idle_thread_and_enqueue_connection(channel_info)) return false;

  /*
    There are no idle threads avaliable to take up the new
    connection. Create a new thread to handle the connection
  */
  channel_info->set_prior_thr_create_utime();
  error =
      mysql_thread_create(key_thread_one_connection, &id, &connection_attrib,
                          handle_connection, (void *)channel_info);
#ifndef NDEBUG
handle_error:
#endif  // !NDEBUG

  if (error) {
    connection_errors_internal++;
    if (!create_thd_err_log_throttle.log())
      LogErr(ERROR_LEVEL, ER_CONN_PER_THREAD_NO_THREAD, error);
    channel_info->send_error_and_close_channel(ER_CANT_CREATE_THREAD, error,
                                               true);
    Connection_handler_manager::dec_connection_count();
    return true;
  }

  Global_THD_manager::get_instance()->inc_thread_created();
  DBUG_PRINT("info", ("Thread created"));
  return false;
}

inline_mysql_thread_create以降

HAVE_PSI_THREAD_INTERFACEのマクロが定義されていればPSI_THREAD_CALLが呼び出される。デフォルトでは定義されておりCMakeオプションで制御される（DISABLE_PSI_THREAD）。

#define PSI_THREAD_CALL(M) psi_thread_service->M
...
...
static inline int inline_mysql_thread_create(
    PSI_thread_key key [[maybe_unused]],
    unsigned int sequence_number [[maybe_unused]], my_thread_handle *thread,
    const my_thread_attr_t *attr, my_start_routine start_routine, void *arg) {
  int result;
#ifdef HAVE_PSI_THREAD_INTERFACE
  result = PSI_THREAD_CALL(spawn_thread)(key, sequence_number, thread, attr,
                                         start_routine, arg);
#else
  result = my_thread_create(thread, attr, start_routine, arg);
#endif
  return result;
}

PSI_THREAD_CALLで呼び出されたメソッドでは、これから作成するスレッドに親スレッド（現在のスレッド）の情報を伝搬させるための準備をしている。具体的にはPFS_spawn_thread_argの型の変数にメインスレッドの計測情報を詰め込み、それをmy_thread_createに渡している。

/**
  Implementation of the thread instrumentation interface.
  @sa PSI_v2::spawn_thread.
*/
int pfs_spawn_thread_vc(PSI_thread_key key, PSI_thread_seqnum seqnum,
                        my_thread_handle *thread, const my_thread_attr_t *attr,
                        void *(*start_routine)(void *), void *arg) {
  PFS_spawn_thread_arg *psi_arg;
  PFS_thread *parent;

  /* psi_arg can not be global, and can not be a local variable. */
  psi_arg = (PFS_spawn_thread_arg *)my_malloc(
      PSI_NOT_INSTRUMENTED, sizeof(PFS_spawn_thread_arg), MYF(MY_WME));
  if (unlikely(psi_arg == nullptr)) {
    return EAGAIN;
  }

  psi_arg->m_child_key = key;
  psi_arg->m_child_seqnum = seqnum;
  psi_arg->m_child_identity = (arg ? arg : thread);
  psi_arg->m_user_start_routine = start_routine;
  psi_arg->m_user_arg = arg;

  parent = my_thread_get_THR_PFS();
  if (parent != nullptr) {
    /*
      Make a copy of the parent attributes.
      This is required, because instrumentation for this thread (the parent)
      may be destroyed before the child thread instrumentation is created.
    */
    psi_arg->m_thread_internal_id = parent->m_thread_internal_id;

    memcpy(psi_arg->m_username, parent->m_username,
           sizeof(psi_arg->m_username));
    psi_arg->m_username_length = parent->m_username_length;

    memcpy(psi_arg->m_hostname, parent->m_hostname,
           sizeof(psi_arg->m_hostname));
    psi_arg->m_hostname_length = parent->m_hostname_length;
  } else {
    psi_arg->m_thread_internal_id = 0;
    psi_arg->m_username_length = 0;
    psi_arg->m_hostname_length = 0;
  }

  int result = my_thread_create(thread, attr, pfs_spawn_thread, psi_arg);
  if (unlikely(result != 0)) {
    my_free(psi_arg);
  }
  return result;
}

最終的に呼び出されるmy_thread_createではpthread_createがコールされ、新しいスレッドが作成される。

int my_thread_create(my_thread_handle *thread, const my_thread_attr_t *attr,
                     my_start_routine func, void *arg) {
#ifndef _WIN32
  return pthread_create(&thread->thread, attr, func, arg);
#else
  struct thread_start_parameter *par;
  unsigned int stack_size;

  par = (struct thread_start_parameter *)malloc(sizeof(*par));
  if (!par) goto error_return;

  par->func = func;
  par->arg = arg;
  stack_size = attr ? attr->dwStackSize : 0;

  thread->handle =
      (HANDLE)_beginthreadex(NULL, stack_size, win_thread_start, par, 0,
                             (unsigned int *)&thread->thread);

  if (thread->handle) {
    /* Note that JOINABLE is default, so attr == NULL => JOINABLE. */
    if (attr && attr->detachstate == MY_THREAD_CREATE_DETACHED) {
      /*
        Close handles for detached threads right away to avoid leaking
        handles. For joinable threads we need the handle during
        my_thread_join. It will be closed there.
      */
      CloseHandle(thread->handle);
      thread->handle = NULL;
    }
    return 0;
  }

  my_osmaperr(GetLastError());
  free(par);

error_return:
  thread->thread = 0;
  thread->handle = NULL;
  return 1;
#endif
}

2023-09-29

Connection LifecycleのConnection Phaseの話

MySQL

MySQL: Connection Phaseを読んでフムフムと思ったことを実験を交えて書き記すことにした。

MySQL側ではクライアントからTCP接続が確立してから以下のようなフェーズを辿る

https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_connection_lifecycle.html

これらのフェーズではMySQL プロトコルというMySQL独自のプロトコル（アプリケーションレイヤー）によってクライアントとやり取りされる。初めに接続フェーズを介して認証情報の評価や送信を行い、その後コマンドフェーズで実際のコマンドパケットの処理を行う。今回はこのうちの接続フェーズの部分の話をする。

※実際、クエリを叩く際のパケットキャプチャでは以下のようにMySQL プロトコルのやりとりが確認できる。この内のRequestQuery以前の過程が接続フェーズとなる。

※簡単のため、SSL/TLS接続は考慮しない

ServerGreeting

まず初めにサーバーからクライアントへInitial Handshake packetを送信する。これはServerGreetingと呼ばれ、以下のように各種バージョンや認証プラグイン、文字セットや照合順序、Server Capabilitiesなどの情報を含む。

MySQL Protocol
    Packet Length: 80
    Packet Number: 0
    Server Greeting
        Protocol: 10
        Version: 8.0.28-debug
        Thread ID: 72
        Salt: a^p\017(cQ9
        Server Capabilities: 0xffff
        Server Language: utf8mb4 COLLATE utf8mb4_0900_ai_ci (255)
        Server Status: 0x0002
        Extended Server Capabilities: 0xdfff
        Authentication Plugin Length: 21
        Unused: 00000000000000000000
        Salt: WY\031p[k@\nZ@`m
        Authentication Plugin: mysql_native_password

なお、ペイロードの詳細はProtocol::HandshakeV10 に記載がある。今回の例のプロトコルバージョンが10のため、Protocol::HandshakeV10のリンクを載せている。

また、Server CapabilitiesやExtended Server Capabilitiesにはサーバー機能のビットマスクであり、各種フラグについては同様にCapabilities Flags に記載がある。これらの情報を交換することで、クライアント/サーバー間で期待されていないフォーマットでデータを送るような事を防げる。

Initial Handshake packetの応答としてクライアントからサーバーへLogin Requestを送る。以下のようにクライアント側の要求している認証プラグイン、ユーザ名、パスワード、文字セットや照合順序、Client Capabilitiesなどの情報を含む。

MySQL Protocol
    Packet Length: 249
    Packet Number: 1
    Login Request
        Client Capabilities: 0xa28f
        Extended Client Capabilities: 0x80be
        MAX Packet: 1073741824
        Charset: utf8mb4 COLLATE utf8mb4_general_ci (45)
        Unused: 0000000000000000000000000000000000000000000000
        Username: app
        Password: 55b1a7f3d9159712803de97bb0e15400ef37d4b2
        Schema: world
        Client Auth Plugin: mysql_native_password
        Prefix: 163
        Length: 163
        Connection Attributes

ペイロードの詳細はProtocol::HandshakeResponseに記載がある。サーバー側ではこれらを受け取って、Capabilitiesと使用する認証方法の確認を行う。

認証方法の確認について

実はServerGreeting時点ですでにdefault_authentication_pluginをもとに期待している認証プラグインをクライアント側に送信している。それに応じてLogin Requestでクライアントからサーバーへ使用する認証プラグインを送信する。そして、その後にサーバー側ではクライアントから受け取った認証情報をもとにmysql.user テーブルを確認しに行く。ここでもし期待されている認証プラグインと異なれば、サーバーはクライアントへAuthSwitchRequestを送信して、使用するべき認証プラグインを通知する。

実際、以下のような設定でappというユーザで接続すると

※クライアントサイドはlibmysqlclientを使用

 mysql> select @@default_authentication_plugin;
+---------------------------------+
| @@default_authentication_plugin |
+---------------------------------+
| caching_sha2_password           |
+---------------------------------+
1 row in set, 1 warning (0.00 sec)

mysql> select User, plugin from mysql.user where User = 'app'\G
*************************** 1. row ***************************
                    User: app
                  plugin: mysql_native_password
1 row in set (0.00 sec)

以下のようにAuthSwitchRequestが確認できる。また、mysql_native_passwordを要求していることがパケットの中身から確認できる。なお、ServerGreetingとLogin requestではAuthentication Pluginはcaching_sha2_passwordの内容で送信していることも確認できる。

認証が問題なければサーバーはクライアントへResponse OKを返す。詳細はOK_Packetに記載されている。

Command Phaseについてはまた今度

2023-09-28

トランザクションの同時実行（静的アプローチ）

RDB

この記事は「リレーショナルデータベース入門第３版」のトランザクションの同時実行の章の一部のサーベイです。

はてなのtex記法が上手くいかないので、定義の部分は別でtexで書いたもののキャプチャを貼るという暴挙に出ています

トランザクションの同時実行とスケジュール

トランザクション集合{T1,...,Tn}が与えられた時、ある時刻t1にはTiの第 j ステップを次の時刻t2ではTkの第 l ステップを ( iとkは必ずしも異なる必要性はない) 次の時刻t3ではTiの第 j+1 ステップをという具合に複数のトランザクション実行していくことをトランザクションの同時実行という。

定義(スケジュール)

スケジュールは2次元的に表現するとわかりやすく、その例は以下のようになる

image.png (81.3 kB)

直列スケジュール

トランザクションの原則はACID特性のCにもあるように一貫性である。そのためトランザクションが単体で実行されれば、DBの状態は一貫性が保たれる。つまり、複数のトランザクションに対しては、それらを一つずつ順番に実行していけば一貫性が保証される。この時のスケジュールを直列スケジュールという

n個のトランザクション{T1,...,Tn}が与えられたとする。これらを直列に実行させるとき、直列スケジュールのパターンはn!個ある。この時それぞれのパターンはトランザクションの順番が変わるので、最終的な結果は一般には異なってくる。しかし、最終的な結果は異なっていようが実行の結果はいずれも正しく、一貫性は損なわれていないことに注意する。

スケジュールの等価性と正当性

トランザクションの処理効率（以降TPS）を向上させようとすれば、トランザクションを同時実行させたくなる。例えば、一つのトランザクションがディスクのI/Oまちになって、CPUがアイドル状態になっているならその時間を使って他のトランザクションの処理に当てたい。

しかし無秩序に同時実行させれば、さまざまな異状が発生してしまう。例えば遺失更新異状というトランザクションの更新結果が別のトランザクションの書き込みによって上書きされてしまうなどの現象があったりする。

そこでトランザクションを同時実行させるものの、直列実行のように一貫性を損なわない実行スケジュールを考えたい。そのような理想的なスケジュールを直列化可能スケジュールという。そこで等価性を用いて直列化可能スケジュールを定義する。

トランザクション集合{T1,...,Tn}を同時あるいは直列に実行していくにあたり、実行開始時点でのDBの状態をDBの初期状態ということにし、DB0と表すことにする。DB0に始まり、スケジュール Sに基づいて全てのトランザクションを実行終了した時点でのDBの状態をDBの最終状態ということにし、S(DB0)とかく。

定義(スケジュールの等価性)

定義(スケジュールの正当性)

また、スケジュール Sが正当であることは直列化可能であるともいう（あくまで直列実行は正しいという認識で進む）

さてトランザクション集合が与えられた時に、あるスケジュール Sが正当であるのか、またそのようなスケジュールが存在するのかという決定問題を考える。愚直に考えるとこの問題は、非直列スケジュールSに対してn!個の直列スケジュールを順番に等価性を確認していけば解が出るが、階乗時間の計算量になるため手に負えない。

そこでスケジュール同士の他の等価性に視点をあてて、直列化可能のための条件を見つけていく。

ビュー等価とビュー直列化可能性

スケジュールの等価性を議論するために、最初に考えることはそれぞれのスケジュールの最終書き込み集合を考える。つまり、時間的に最新の書き込みが一緒であれば等価と見做していいんじゃないかということである。しかし、直列スケジュールと最新の書き込みが同じだったとしても正当とは言えないことが知られている。そこでその条件よりも強い制約を課したビュー等価を考える。

定義(ビュー等価)

定義(ビュー直列化可能性)

実はスケジュールがビュー等価であれば、いずれのスケジュールでトランザクションを実行しても実行終了後のDBは同じ状態になる。つまりビュー直列化可能であれば正当である。

とすると次なる問題はビュー直列化可能かどうかを決定する問題はどのくらい難しいのかである。この問題は実はNP完全であることが知られている。従って、問題解決が多項式時間で解けるような次善の策を考える必要がある。

そこで相反等価という考え方を導入する。

相反グラフ、相反投下、相反直列化

定義(相反グラフ)

a~cの状況にある2つのステップは互いに相反しているという。

定理

例えば以下のようなトランザクションスケジュールがあったとする image.png (76.3 kB)

ここから相反グラフSG(C)を作る。まず相反グラフの定義のaに注目すると、データ項目xとyに関してreadがwriteに先行しているのは(T1,T2),(T2,T4),(T3,T4),(T1,T3)。次に定義のbに該当するのは(T1,T2),(T3,T4)。最後に定義のcに該当するのは(T1,T2),(T3,T4),(T3,T2)。よってグラフは以下のようになる

image.png (24.9 kB)

ここからグラフが非巡回であることが分かる（任意のノードに対してそのノードに戻ってくるような経路が存在しないこと）。よってビュー直列化可能であることが分かる。等価な直列スケジュールをトポロジカルソートで求める。

まずグラフの中から入線のないノードを探す。この場合T1が該当する。次にグラフからそのノードと出線を消す。この場合T2→T4とT3→T4とT3→T2が残る。するとまた入線のないノードが生まれる。この場合T3が該当する。また同様にグラフからそのノードと出線を消す。この場合T2→T4が残る。するとまた入線のないノードが生まれる。この場合T2が該当する。また同様にグラフからそのノードと出線を消す

そして消したノードを順番に並べるとT1,T3,T2,T4の直列スケジュールが手に入る。実はこのスケジュールが元のスケジュールとビュー等価になる。（T1→T2のような矢印はT1の動作がT2に依存しているという感覚だと分かりやすいかもしれない。グラフが巡回的だと、T1→T2→T3→T4→T1のように全てが依存しあってしまう。一方直列スケジュールはこのようなことは決して起こらない。トポロジカルソートは依存元を順番に取り除いて並べる操作ということになる。ただし、巡回的でもビュー直列化可能なものはあるのであくまで非巡回は十分条件にすぎない）

定義(相反等価)

定義(相反直列化可能)

つまり相反直列化可能であればビュー直列化可能、つまり正当であるということがわかる。実はこの相反直列化可能かどうかを求めるアルゴリズムは多項式時間の計算量ですむ。

※これまでは盲目的書きを許した議論をしている（readしなくてもwriteができる状態）。ただし、盲目的書きを許さない場合はビュー直列化と相反直列化は同値となる。

以上が同時実行制御における「静的」なアプローチである。ここまでのアプローチ（トランザクションを実行する前にスケジュールを立てて、それが直列化可能かどうかを調べる）をスケジュール法という。

2023-09-21

statusカラムを用意する前に考えたいことは....の話

テーブル設計

statusカラムと題に載せているが、複数の定数を選択肢として持つような所謂enum型のカラムに関することを述べていく（statusという名で定義されることが多いような気がしたので）

そのようなstatus系のカラムは楽で便利が故によく提案されると思うのだが、追加する前にちょっと整理・考慮しておきたい点がある。

因みにアンチstatusカラムという訳ではなく、メリデメをしっかり議論されていればOKな立ち位置なのである

そのカラムを使ってどのようなSQLを構成する予定か

まずはstatusカラムを使ってどのようなSQLを書く予定があげてみよう。意外にもそのカラム追加によってSQLパフォーマンスに影響を与えることがある。

where条件によく現れますか？

select name
from users
where status = 'active'
and (another condition)

usersというユーザのテーブルに"active"（アクティブ）、"inactive"（非アクティブ）、"pending"（保留）の選択肢のstatusカラムを追加したかったとする。また、上記のようなstatusカラムをwhere条件で記述するSQLを書く予定があったとする。

ここで考えたいのは「このwhere条件は頻繁について回る可能性があるのではないか」ということである。つまり、参照する際に常にstatusカラムの値を気にする必要があるか、ということである。今回の例のようなusersのアクティブ性を問うカラムであれば、常に評価する必要がありそうに感じる。

もしも該当するのであれば、クエリパフォーマンスや今後の対応に難が出る可能性が高い。常に評価する必要があるので、既存もしくは今後のSQLにも調整が必要になる。よってそれらのSQLにstatusカラムにフィルタリングが発生してしまう（カバリングインデックスだったSQLもあるかもしれない）。これは、フェッチしてきた行数が多ければ多いほどダメージはデカい。

statusカラムにインデックスを貼るという判断になったとしても、「このwhere条件は頻繁について回る可能性がある」のであれば、今後のインデックスにstatusカラムを毎回考慮する必要がある。また、enumであるが故にカーディナリティは高くないので、そこまで貢献してくれるインデックスにはなり得ない。

そのカラムに関わる定義変更が起こる可能性があるか

次にそのカラム追加をした後後のスキーマの姿をイメージしよう。

前提

リレーショナルデータベースではテーブル論理設計は「実世界の切り取り」というような表現をされることがある（厳密にリレーション等々の言葉を使うべきだが、便宜上一旦は置いておく）。スキーマという枠組み自身は本来時間と共に不変であり、中身は時間と共に変化をする。

例えばusersのnameというカラム自体は時間と共に不変であるが、usersレコードのnameの値は時間と共に変化する可能性はある。後々になってユーザ名を変えた経験は誰だってあるはず。

つまりカラムは時間変化の軸となるものである。例えばusersにname,emailというカラムがあったとする。時間経過とともに各レコードのname,emailの値は変わる。ここにstatusというカラムを追加すれば、もう１つ軸が増えて時間経過とともに変わる要素が増えるのである（リレーションはドメインの直積で表されることをナントナク言い換えている）

ちょっと大袈裟に聞こえるかもしれないが、軸が増えたり減ったり変更されると、そのテーブルの概念や意味が大きく変わる。それに伴ってテーブルの扱いも変わる訳である。そのため、カラム追加を始め、スキーマに対する変更はやや慎重になっていいかもしれない（そのくらいの構えが意外と丁度よいかも）

以降はそんなスキーマ変更に対する考慮である。

そのカラムに依存するカラムは増えますか？

CREATE TABLE orders (
    order_id INT PRIMARY KEY,
    customer_id INT NOT NULL, --顧客ID
    order_date DATE NOT NULL, --注文日
    total_amount DECIMAL(10, 2) NOT NULL, --注文金額
    status ENUM('placed', 'shipped', 'delivered') NOT NULL, --ステータス
    delivery_received_date DATE , --受取日
    delivery_date TEXT, --配達日
);

上記のように、ordersという注文テーブルに"placed"（注文済み）、"shipped"（出荷済み）、"delivered"（配達済み）の選択肢のstatusカラムを追加したかったとする。

ここでdelivery_received_date（受取日）、delivery_date（配達日）は、その言葉の意義からもわかるように、statusがdelivered（配達済）の時のみしか値が入らないのである。このように「statusカラムの選択肢に依存してしまうカラムが複数追加されないか」を考慮する必要がある。

今回の例の２つのカラムは、以下のようにstatusが'placed', 'shipped'の場合はNULLとなってしまう。すなわち、statusが'placed', 'shipped'の時は考慮する必要のないカラムになってしまう。

+----------+----------+----------------------+----------------------+
| order_id | status   | delivery_received_date | delivery_date        |
+----------+----------+----------------------+----------------------+
| 1        | placed   | NULL                 | NULL                 |
| 2        | shipped  | NULL                 | NULL                 |
| 3        | delivered| 2023-09-14           | 2023-09-13           |
| 4        | canceled | NULL                 | NULL                 |
| 5        | placed   | NULL                 | NULL                 |
+----------+----------+----------------------+----------------------+

そのカラムの選択肢は今後増える可能性はありますか？

SQL アンチパターンの31 Flavorsとやや重複するが

www.oreilly.co.jp

先ほどの注文テーブルのstatusカラムを導入して、後々にまた新たな状態を用意したい.....と思ったとする。この欲求の度に、スキーマ変更（ALTER TABLE）する必要がもちろんある。 ENUMの定義変更はメタデータ変更のみだったりするが、これらの変更コストが避けられるなら避けたいものです。

また、前提でも書いたように、状態が増えるという事はテーブルで管理される責務や意味合いが膨らむ事に繋がる。それによって、このテーブルの抱えるレコード数やカラム数が増えることにつながる恐れがある（１つ前の依存カラムが増える恐れもある）。

このように「今後その選択肢の増減する可能性はあるのか」ということを考慮すると良いかもしれない。

じゃあどうしようか

テーブル分割とかどうですか

以上のような点に対して、有効と思われる打開策は何であろうか。自分がよく提案するのは「状態ごとのテーブル分割」である。つまり、状態をカラムで管理するのではなくテーブル単位で管理する方針である。

これにより、テーブルで状態が分けられてるが故に、状態をwhere条件に持つ必要がない。また、状態に依存するカラムはその状態に該当するテーブルでのみ管理すれば良くなる。そして、新たな状態を用意しても、新規テーブル作成で済む。

実際どんな感じで分割？

では、実際どんな感じでテーブル分割するのかだが、これにはいくつか方法があり、スーパータイプとサブタイプに分ける形とサブタイプのみで分ける形がある。 ※概念設計の段階で登場する用語であるが便宜上用いる

スーパータイプとは共通の要素を持つ複数のエンティティからその共通要素を抜き出したものであり、サブタイプとはそれぞれのエンティティ独自の要素を持つものを言う（汎化と特化などと言う）。こちらのhmatsuさんの記事が非常にわかりやすい

先ほどの注文テーブル（statusカラムを入れた状態）を例にとる。

CREATE TABLE orders (
    order_id INT PRIMARY KEY,
    customer_id INT NOT NULL, --顧客ID
    order_date DATE NOT NULL, --注文日
    total_amount DECIMAL(10, 2) NOT NULL, --注文金額
    status ENUM('placed', 'shipped', 'delivered') NOT NULL, --ステータス
    delivery_received_date DATE , --受取日
    delivery_date TEXT, --配達日
);

スーパータイプとサブタイプで分ける

共通の要素が多ければこちらの方が合致するかもしれない。

--スーパータイプ
CREATE TABLE orders (
    order_id INT PRIMARY KEY,
    customer_id INT NOT NULL, --顧客ID（各ステータスの共通の情報）
);

--サブタイプ
CREATE TABLE placed_orders (
    order_id INT PRIMARY KEY,
    order_date DATE NOT NULL, --注文日（注文済みステータスの情報）
    total_amount DECIMAL(10, 2) NOT NULL, --注文金額（注文済みステータスの情報）
);

--サブタイプ
CREATE TABLE shipped_orders (
    order_id INT PRIMARY KEY,
);

--サブタイプ
CREATE TABLE delivered_orders (
    order_id INT PRIMARY KEY,
    delivery_received_date DATE,--受取日（配達済みステータスの情報）
    delivery_date TEXT--配達日（配達済みステータスの情報）
);

場合によっては、注文金額と注文日は共通処理で用いるのであればスーパータイプの方で管理しても良いかもしれない。ここら辺は議論のしどころ。

サブタイプのみで分ける

各状態の独立性が高い、つまり各状態に依存する情報や処理が多いのであればこちらの方が合致するかもしれない。

--サブタイプ
CREATE TABLE placed_orders (
    order_id INT PRIMARY KEY,
    customer_id INT NOT NULL, --顧客ID
    order_date DATE NOT NULL, --注文日（注文済みステータスの情報）
    total_amount DECIMAL(10, 2) NOT NULL, --注文金額（注文済みステータスの情報）
);

--サブタイプ
CREATE TABLE shipped_orders (
    order_id INT PRIMARY KEY,
    customer_id INT NOT NULL, --顧客ID
);

--サブタイプ
CREATE TABLE delivered_orders (
    order_id INT PRIMARY KEY,
    customer_id INT NOT NULL, --顧客ID
    delivery_received_date DATE,--受取日（配達済みステータスの情報）
    delivery_date TEXT--配達日（配達済みステータスの情報）
);

最後に

もちろん他にもいい案があるかもしれないし、いろいろな都合が出没するかもしれない。ここで言いたいのはあくまでメリデメしっかり考慮して議論した上で導入しようぜ〜ということである。

他にもstatusカラムの懸念だったり、いい案があったらぜひ教えてください。

2023-09-15

「テーブルを開く」とは何ぞやの話

MySQL

追うぞ

今回は「テーブルを開く」とは何を指す言葉なのかを追う。 fopenとやっていることは同じなら、ハンドラを作成してそのハンドラを使って各種テーブル操作ができる状態にすることである（と思う）。同じ感覚でいいのか確かめていく。

ha_innobase::openまでのバックトレース

全体像を確かめていく。そのためにそれっぽいところまでのバックトレースを眺める。

以下は、FLUSH TABLE＆testというテーブルにinsertを行った際のha_innobase::openまでのバックトレースである。yokuさんの記事に記載があるように、データディクショナリにアクセスするタイミングで1回目のブレークポイントがやってくる。ここではcolumn_statisticsというテーブルをopenしている。

(gdb) bt
#0  ha_innobase::open (this=0x50000ffff, name=0x0, open_flags=0, table_def=0x100000000)
    at /mysql-8.0.28/storage/innobase/handler/ha_innodb.cc:6995
#1  0x000056101dab35b8 in handler::ha_open (this=0x7f2b48ad0110, table_arg=0x7f2b4818d250,
    name=0x7f2b48ac0ce0 "./mysql/column_statistics", mode=2, test_if_locked=2, table_def=0x7f2b4813cde8)
    at /mysql-8.0.28/sql/handler.cc:2812
#2  0x000056101d87e8a4 in open_table_from_share (thd=0x7f2b480012d0, share=0x7f2b48ac0930,
    alias=0x7f2bc27d5b40 "column_statistics", db_stat=39, prgflag=8, ha_open_flags=0, outparam=0x7f2b4818d250,
    is_create_table=false, table_def_param=0x7f2b4813cde8) at /mysql-8.0.28/sql/table.cc:3175
#3  0x000056101d5d39f1 in open_table (thd=0x7f2b480012d0, table_list=0x7f2b4819bd18, ot_ctx=0x7f2bbc6f34d0)
    at /mysql-8.0.28/sql/sql_base.cc:3367
#4  0x000056101d5d7732 in open_and_process_table (thd=0x7f2b480012d0, lex=0x7f2b48004320,
    tables=0x7f2b4819bd18, counter=0x7f2bbc6f35b0, prelocking_strategy=0x7f2bbc6f3550,
    has_prelocking_list=false, ot_ctx=0x7f2bbc6f34d0) at /mysql-8.0.28/sql/sql_base.cc:5034
#5  0x000056101d5d92d5 in open_tables (thd=0x7f2b480012d0, start=0x7f2bbc6f35a8, counter=0x7f2bbc6f35b0,
    flags=18434, prelocking_strategy=0x7f2bbc6f3550) at /mysql-8.0.28/sql/sql_base.cc:5842
#6  0x000056101d5e6cb0 in open_tables (thd=0x7f2b480012d0, tables=0x7f2bbc6f35a8, counter=0x7f2bbc6f35b0,
    flags=18434) at /mysql-8.0.28/sql/sql_base.h:455
#7  0x000056101edfd2da in dd::Open_dictionary_tables_ctx::open_tables (this=0x7f2bbc6f3650)
    at /mysql-8.0.28/sql/dd/impl/transaction_impl.cc:107
#8  0x000056101ecf9072 in dd::cache::Storage_adapter::get<dd::Item_name_key, dd::Column_statistics> (
    thd=0x7f2b480012d0, key=..., isolation=ISO_READ_COMMITTED, bypass_core_registry=false,
    object=0x7f2bbc6f3760) at /mysql-8.0.28/sql/dd/impl/cache/storage_adapter.cc:170
#9  0x000056101ecf12a8 in dd::cache::Shared_dictionary_cache::get_uncached<dd::Item_name_key, dd::Column_statistics> (this=0x5610227194c0 <dd::cache::Shared_dictionary_cache::instance()::s_cache>, thd=0x7f2b480012d0,
--Type <RET> for more, q to quit, c to continue without paging--c
    key=..., isolation=ISO_READ_COMMITTED, object=0x7f2bbc6f3760) at /mysql-8.0.28/sql/dd/impl/cache/shared_dictionary_cache.cc:113
#10 0x000056101ecf1051 in dd::cache::Shared_dictionary_cache::get<dd::Item_name_key, dd::Column_statistics> (this=0x5610227194c0 <dd::cache::Shared_dictionary_cache::instance()::s_cache>, thd=0x7f2b480012d0, key=..., element=0x7f2bbc6f37c8) at /mysql-8.0.28/sql/dd/impl/cache/shared_dictionary_cache.cc:98
#11 0x000056101ec101c4 in dd::cache::Dictionary_client::acquire<dd::Item_name_key, dd::Column_statistics> (this=0x7f2b48004ca0, key=..., object=0x7f2bbc6f3848, local_committed=0x7f2bbc6f3845, local_uncommitted=0x7f2bbc6f3846) at /mysql-8.0.28/sql/dd/impl/cache/dictionary_client.cc:909
#12 0x000056101ebed4e6 in dd::cache::Dictionary_client::acquire<dd::Column_statistics> (this=0x7f2b48004ca0, object_name="kubo\037test\037id", object=0x7f2bbc6f3970) at /mysql-8.0.28/sql/dd/impl/cache/dictionary_client.cc:1281
#13 0x000056101dad3d12 in histograms::find_histogram (thd=0x7f2b480012d0, schema_name="kubo", table_name="test", column_name="id", histogram=0x7f2bbc6f3a58) at /mysql-8.0.28/sql/histograms/histogram.cc:1336
#14 0x000056101d5cc5d0 in read_histograms (thd=0x7f2b480012d0, share=0x7f2b48ae8bb0, schema=0x7f2b4810e108, table_def=0x7f2b48116578) at /mysql-8.0.28/sql/sql_base.cc:591
#15 0x000056101d5cd0ff in get_table_share (thd=0x7f2b480012d0, db=0x7f2b48ab8e00 "kubo", table_name=0x7f2b480eadc8 "test", key=0x7f2b48ab8baf "kubo", key_length=10, open_view=true, open_secondary=false) at /mysql-8.0.28/sql/sql_base.cc:813
#16 0x000056101d5cd595 in get_table_share_with_discover (thd=0x7f2b480012d0, table_list=0x7f2b48ab8790, key=0x7f2b48ab8baf "kubo", key_length=10, open_secondary=false, error=0x7f2bbc6f3fac) at /mysql-8.0.28/sql/sql_base.cc:884
#17 0x000056101d5d32b6 in open_table (thd=0x7f2b480012d0, table_list=0x7f2b48ab8790, ot_ctx=0x7f2bbc6f4480) at /mysql-8.0.28/sql/sql_base.cc:3202
#18 0x000056101d5d7732 in open_and_process_table (thd=0x7f2b480012d0, lex=0x7f2b48004320, tables=0x7f2b48ab8790, counter=0x7f2b48004378, prelocking_strategy=0x7f2bbc6f4508, has_prelocking_list=false, ot_ctx=0x7f2bbc6f4480) at /mysql-8.0.28/sql/sql_base.cc:5034
#19 0x000056101d5d92d5 in open_tables (thd=0x7f2b480012d0, start=0x7f2bbc6f44f0, counter=0x7f2b48004378, flags=0, prelocking_strategy=0x7f2bbc6f4508) at /mysql-8.0.28/sql/sql_base.cc:5842
#20 0x000056101d5dae47 in open_tables_for_query (thd=0x7f2b480012d0, tables=0x7f2b48ab8790, flags=0) at /mysql-8.0.28/sql/sql_base.cc:6722
#21 0x000056101d76fb82 in Sql_cmd_dml::prepare (this=0x7f2b48ab90c0, thd=0x7f2b480012d0) at /mysql-8.0.28/sql/sql_select.cc:367
#22 0x000056101d770601 in Sql_cmd_dml::execute (this=0x7f2b48ab90c0, thd=0x7f2b480012d0) at /mysql-8.0.28/sql/sql_select.cc:528
#23 0x000056101d6e5611 in mysql_execute_command (thd=0x7f2b480012d0, first_level=true) at /mysql-8.0.28/sql/sql_parse.cc:3553
#24 0x000056101d6eabba in dispatch_sql_command (thd=0x7f2b480012d0, parser_state=0x7f2bbc6f5bd0) at /mysql-8.0.28/sql/sql_parse.cc:5174
#25 0x000056101d6e098c in dispatch_command (thd=0x7f2b480012d0, com_data=0x7f2bbc6f6bc0, command=COM_QUERY) at /mysql-8.0.28/sql/sql_parse.cc:1938
#26 0x000056101d6deb2b in do_command (thd=0x7f2b480012d0) at /mysql-8.0.28/sql/sql_parse.cc:1352
#27 0x000056101d8e4375 in handle_connection (arg=0x56102549bee0) at /mysql-8.0.28/sql/conn_handler/connection_handler_per_thread.cc:302
#28 0x000056101f75b137 in pfs_spawn_thread (arg=0x561025540740) at /mysql-8.0.28/storage/perfschema/pfs.cc:2947
#29 0x00007f2bd3590609 in start_thread (arg=<optimized out>) at pthread_create.c:477
#30 0x00007f2bd2cde133 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

２回目のブレークポイントでtestに関するopenが為される

(gdb) bt
#0  ha_innobase::open (this=0x0, name=0x0, open_flags=0, table_def=0x100000000)
    at /mysql-8.0.28/storage/innobase/handler/ha_innodb.cc:6995
#1  0x000056101dab35b8 in handler::ha_open (this=0x7f2b48acb5e0, table_arg=0x7f2b480fff10,
    name=0x7f2b48ae8f58 "./kubo/test", mode=2, test_if_locked=2, table_def=0x7f2b48116578)
    at /mysql-8.0.28/sql/handler.cc:2812
#2  0x000056101d87e8a4 in open_table_from_share (thd=0x7f2b480012d0, share=0x7f2b48ae8bb0,
    alias=0x7f2b480ebe18 "test", db_stat=39, prgflag=8, ha_open_flags=0, outparam=0x7f2b480fff10,
    is_create_table=false, table_def_param=0x7f2b48116578) at /mysql-8.0.28/sql/table.cc:3175
#3  0x000056101d5d39f1 in open_table (thd=0x7f2b480012d0, table_list=0x7f2b48ab8790, ot_ctx=0x7f2bbc6f4480)
    at /mysql-8.0.28/sql/sql_base.cc:3367
#4  0x000056101d5d7732 in open_and_process_table (thd=0x7f2b480012d0, lex=0x7f2b48004320,
    tables=0x7f2b48ab8790, counter=0x7f2b48004378, prelocking_strategy=0x7f2bbc6f4508,
    has_prelocking_list=false, ot_ctx=0x7f2bbc6f4480) at /mysql-8.0.28/sql/sql_base.cc:5034
#5  0x000056101d5d92d5 in open_tables (thd=0x7f2b480012d0, start=0x7f2bbc6f44f0, counter=0x7f2b48004378,
    flags=0, prelocking_strategy=0x7f2bbc6f4508) at /mysql-8.0.28/sql/sql_base.cc:5842
#6  0x000056101d5dae47 in open_tables_for_query (thd=0x7f2b480012d0, tables=0x7f2b48ab8790, flags=0)
    at /mysql-8.0.28/sql/sql_base.cc:6722
#7  0x000056101d76fb82 in Sql_cmd_dml::prepare (this=0x7f2b48ab90c0, thd=0x7f2b480012d0)
    at /mysql-8.0.28/sql/sql_select.cc:367
#8  0x000056101d770601 in Sql_cmd_dml::execute (this=0x7f2b48ab90c0, thd=0x7f2b480012d0)
    at /mysql-8.0.28/sql/sql_select.cc:528
#9  0x000056101d6e5611 in mysql_execute_command (thd=0x7f2b480012d0, first_level=true)
    at /mysql-8.0.28/sql/sql_parse.cc:3553
#10 0x000056101d6eabba in dispatch_sql_command (thd=0x7f2b480012d0, parser_state=0x7f2bbc6f5bd0)
--Type <RET> for more, q to quit, c to continue without paging--c
    at /mysql-8.0.28/sql/sql_parse.cc:5174
#11 0x000056101d6e098c in dispatch_command (thd=0x7f2b480012d0, com_data=0x7f2bbc6f6bc0, command=COM_QUERY) at /mysql-8.0.28/sql/sql_parse.cc:1938
#12 0x000056101d6deb2b in do_command (thd=0x7f2b480012d0) at /mysql-8.0.28/sql/sql_parse.cc:1352
#13 0x000056101d8e4375 in handle_connection (arg=0x56102549bee0) at /mysql-8.0.28/sql/conn_handler/connection_handler_per_thread.cc:302
#14 0x000056101f75b137 in pfs_spawn_thread (arg=0x561025540740) at /mysql-8.0.28/storage/perfschema/pfs.cc:2947
#15 0x00007f2bd3590609 in start_thread (arg=<optimized out>) at pthread_create.c:477
#16 0x00007f2bd2cde133 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
(gdb) n
6999      bool cached = false;
(gdb) p name
$7 = 0x7f2b48ae8f58 "./kubo/test"
(gdb)

実際、FLUSH TABLE後にINSERTを実行し、openしているテーブルを確認すると次のようになることが確認できる。

mysql> show open tables;
+----------+-------------------+--------+-------------+
| Database | Table             | In_use | Name_locked |
+----------+-------------------+--------+-------------+
| mysql    | column_statistics |      0 |           0 |
| kubo     | test              |      0 |           0 |
+----------+-------------------+--------+-------------+
2 rows in set (0.00 sec)

TABLE

まずは詳解MySQLを参照してTABLEについて理解を深めつつ、ソースを見て「テーブルを開く」の意味を掴んでいく

www.oreilly.co.jp

TABLE構造体はテーブルディスクリプタを定義する。テーブルは「開いた状態」か、「閉じた状態」の２通りで存在する。「テーブルが開く」たびにテーブルディスクリプタが作成されて、テーブルキャッシュに配置される。

実際、ha_innobase::openの前段のopen_tableとopen_table_from_shareでは以下のようにTABLEを準備している様子が伺える。この少し前段ではTABLE_SHAREをテーブル定義キャッシュから取得している場面もあったりする。

/* make a new table */
    if (!(table = (TABLE *)my_malloc(key_memory_TABLE, sizeof(*table),
                                     MYF(MY_WME))))
      goto err_lock;

    error = open_table_from_share(
        thd, share, alias,
        ((flags & MYSQL_OPEN_NO_NEW_TABLE_IN_SE)
             ? 0
             : ((uint)(HA_OPEN_KEYFILE | HA_OPEN_RNDFILE | HA_GET_INDEX |
                       HA_TRY_READ_ONLY))),
        EXTRA_RECORD, thd->open_options, table, false, table_def);

nt open_table_from_share(THD *thd, TABLE_SHARE *share, const char *alias,
                          uint db_stat, uint prgflag, uint ha_open_flags,
                          TABLE *outparam, bool is_create_table,
                          const dd::Table *table_def_param) {
  ....

  error = 1;
  new (outparam) TABLE();
  outparam->in_use = thd;
  outparam->s = share;
  outparam->db_stat = db_stat;
  outparam->write_row_record = nullptr;

  MEM_ROOT *root;
  if (!internal_tmp) {
    root = &outparam->mem_root;
    init_sql_alloc(key_memory_TABLE, root, TABLE_ALLOC_BLOCK_SIZE, 0);
  } else
    root = &share->mem_root;

  /*
    For internal temporary tables we allocate the 'alias' in the
    TABLE_SHARE's mem_root rather than on the heap as it gives simpler
    freeing.
  */
  outparam->alias = internal_tmp
                        ? strdup_root(root, alias)
                        : my_strdup(key_memory_TABLE, alias, MYF(MY_WME));
  if (!outparam->alias) goto err;

  outparam->quick_keys.init();
  outparam->possible_quick_keys.init();
  outparam->covering_keys.init();
  outparam->merge_keys.init();
  outparam->keys_in_use_for_query.init(); 
  ...

また、open_table内の処理でopen_table_from_shareを呼び出した後すぐに、以下のようにテーブルキャッシュに保存している様子が伺える。

{
    /* Add new TABLE object to table cache for this connection. */
    Table_cache *tc = table_cache_manager.get_cache(thd);

    tc->lock();

    if (tc->add_used_table(thd, table)) {
      tc->unlock();
      goto err_lock;
    }
    tc->unlock();
  }

ここまでを見ると、「テーブルを開く」はテーブルディスクリプタを作成してテーブルキャッシュに載せる事であると言えそうだ。これで解散するのも惜しいのでもう少し続ける。

handler

もう少しopen_table_from_shareの処理の一部に注目する。ここからはInnodb側の処理に寄っていく。

以下のようにTABLEを構成している中で、get_new_handlerでhandlerを作成している。

  
  /* Allocate handler */
  outparam->file = nullptr;
  if (!(prgflag & SKIP_NEW_HANDLER)) {
    if (!(outparam->file = get_new_handler(share, share->m_part_info != nullptr,
                                           root, share->db_type())))
      goto err;
    if (outparam->file->set_ha_share_ref(&share->ha_share)) goto err;
  } else {
    assert(!db_stat);
  }

ここで、また詳解MySQLでhandlerについて参照する。テーブルハンドラはストレージエンジンとオプティマイザの間のインターフェースであり、このインターフェースはhandlerという名前の抽象クラスを通して実装される。この抽象クラスはテーブルオープン・クローズ、レコードのシーケンシャルスキャン・レコードの取得・格納・削除といった基本的なメソッドに対する処理を提供する。個々のストレージエンジンはhandlerのサブクラスを実装する。

実際、作成したhandlerを用いてha_openを呼び出し、InnoDBストレージエンジンで実装されているha_innobase::openが呼び出される。

int ha_err;
    if ((ha_err = (outparam->file->ha_open(
             outparam, share->normalized_path.str,
             (db_stat & HA_READ_ONLY ? O_RDONLY : O_RDWR),
             ((db_stat & HA_OPEN_TEMPORARY
                   ? HA_OPEN_TMP_TABLE
                   : (db_stat & HA_WAIT_IF_LOCKED)
                         ? HA_OPEN_WAIT_IF_LOCKED
                         : (db_stat & (HA_ABORT_IF_LOCKED | HA_GET_INFO))
                               ? HA_OPEN_ABORT_IF_LOCKED
                               : HA_OPEN_IGNORE_IF_LOCKED) |
              ha_open_flags),
             table_def)))) {
      /* Set a flag if the table is crashed and it can be auto. repaired */
      share->crashed = ((ha_err == HA_ERR_CRASHED_ON_USAGE) &&
                        outparam->file->auto_repair() &&
                        !(ha_open_flags & HA_OPEN_FOR_REPAIR));

      switch (ha_err) {
        case HA_ERR_TABLESPACE_MISSING:
          /*
            In case of Innodb table space header may be corrupted or
            ibd file might be missing
          */
          error = 1;
          assert(my_errno() == HA_ERR_TABLESPACE_MISSING);
          break;
        ...
        default:
          outparam->file->print_error(ha_err, MYF(0));
          error_reported = true;
          if (ha_err == HA_ERR_TABLE_DEF_CHANGED)
            error = 7;
          else if (ha_err == HA_ERR_ROW_FORMAT_CHANGED)
            error = 8;
          break;
      }
      goto err; /* purecov: inspected */
    }

/ha_innodb.cc
/** Open an InnoDB table.
@param[in]      name            table name
@param[in]      open_flags      flags for opening table from SQL-layer.
@param[in]      table_def       dd::Table object describing table to be opened
@retval 1 if error
@retval 0 if success */
int ha_innobase::open(const char *name, int, uint open_flags,
                      const dd::Table *table_def) {

ここまでを見ると、ストレージエンジン側でもopenの概念があり、open_table_from_share内ではhandlerを通してその操作が為されると分かる。

ha_innobase::openの実装

さて、ha_innobase::openを要所要所覗き見していく。

/* Get pointer to a table object in InnoDB dictionary cache.
  For intrinsic table, get it from session private data */
  ib_table = thd_to_innodb_session(thd)->lookup_table_handler(norm_name);

まずはこの部分。コメント内では、以降ディクショナリーキャッシュからテーブルオブジェクトのポインターを手に入れるぞいと宣言しているのが確認できる。ここでテーブルオブジェクトとは上記のib_tableの型でもあるdict_table_tを指す。

また、コメントから「intrinsic tableの場合」はセッションプライベートデータから読み込むとの内容が確認できる。

実際、thd_to_innodb_sessionとlookup_table_handlerのFile Referenceを読むと InnoDB セッション固有のプライベートハンドラーを取得し、そのハンドラを使ってテーブル名と一致するテーブルオブジェクトを取得している処理と確認できる。

intrinsic tableとは何か。また別の機会に調べてみよう。 ※5.7ではあるがintrinsic tableに関するブログアーカイブは見つけた

if (ib_table == nullptr) {
    dict_sys_mutex_enter();
    ib_table = dict_table_check_if_in_cache_low(norm_name);
    if (ib_table != nullptr) {
      if (ib_table->is_corrupted()) {
          /* Optionally remove this corrupted table from cache now
        if no other thread is still using it. If not, the corrupted bit
        will keep it from being used.*/
        if (ib_table->get_ref_count() == 0) {
          dict_table_remove_from_cache(ib_table);
        }
        ib_table = nullptr;
        cached = true;
        
      ...
      
      } else if (ib_table->discard_after_ddl) {
      reload:
        btr_drop_ahi_for_table(ib_table);
        dict_table_remove_from_cache(ib_table);
        ib_table = nullptr;
      } else {
        cached = true;
        if (!dd_table_match(ib_table, table_def)) {
          dict_set_corrupted(ib_table->first_index());
          dict_table_remove_from_cache(ib_table);
          ib_table = nullptr;
        } else {
          ib_table->acquire_with_lock();
        }
      }
      }

次にセッションプライベートデータからオブジェクトを取得できなかった場合の処理の一部。この場合、mutexを取得してディクショナリーキャッシュ内を探索する。キャッシュされていた場合、そのテーブルが破損してないか・DDL後でないかをチェックする。該当すれば、キャッシュから取得したオブジェクトを使わないことにしたりキャッシュから削除したりする。

※refresh_fkのチェックもあったが、何のチェックをしているかイマイチまだ分かっていない

if (!cached) {
      dd::cache::Dictionary_client *client = dd::get_dd_client(thd);
      dd::cache::Dictionary_client::Auto_releaser releaser(client);

      ib_table = dd_open_table(client, table, norm_name, table_def, thd);
      if (!ib_table) {
        set_my_errno(ENOENT);
        return HA_ERR_NO_SUCH_TABLE;
      }
    }
  } else {
    ib_table->acquire();
    ut_ad(ib_table->is_intrinsic());
  }

そしてキャッシュされていない場合、データディクショナリーからテーブルオブジェクトを取得する。

以降はテーブルオブジェクトの細やかなチェックと設定がおこなれていく。

if (ib_table != nullptr) {
    /* Make sure table->is_dd_table is set */
    std::string db_str;
    std::string tbl_str;
    dict_name::get_table(ib_table->name.m_name, db_str, tbl_str);

    ib_table->is_dd_table =
        dd::get_dictionary()->is_dd_table_name(db_str.c_str(), tbl_str.c_str());
  }

例えば、取得したテーブルオブジェクトについて、テーブルがデータディクショナリテーブルであるかどうかのチェック。

/* m_share might hold pointers to dict table indexes without any pin.
  We must always allocate m_share after opening the dict_table_t object
  and free it before de-allocating dict_table_t to avoid race. */
  if (ib_table != nullptr) {
    m_share = get_share(name);
    if (m_share == nullptr) {
      dict_table_close(ib_table, FALSE, FALSE);
      return HA_ERR_SE_OUT_OF_MEMORY;
    }
  }

他には、テーブルオブジェクトの競合を避けるための処理。コメントから察するに、dict_table_tを取得した後にm_share（INNOBASE_SHARE）を割り当てて、これを解放してからdict_table_tを閉じるという順番とのこと。

if (ib_table != nullptr &&
      ((!DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID) &&
        table->s->fields != dict_table_get_n_tot_u_cols(ib_table)) ||
       (DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID) &&
        (table->s->fields != dict_table_get_n_tot_u_cols(ib_table) - 1)))) {
    ib::warn(ER_IB_MSG_556)
        << "Table " << norm_name << " contains " << ib_table->get_n_user_cols()
        << " user"
           " defined columns in InnoDB, but "
        << table->s->fields
        << " columns in MySQL. Please check"
           " INFORMATION_SCHEMA.INNODB_COLUMNS and " REFMAN
           "innodb-troubleshooting.html for how to resolve the"
           " issue.";

    /* Mark this table as corrupted, so the drop table
    or force recovery can still use it, but not others. */
    ib_table->first_index()->type |= DICT_CORRUPT;
    free_share(m_share);
    dict_table_close(ib_table, FALSE, FALSE);
    ib_table = nullptr;
  }

そして、tableとib_tableのカラム数の情報との乖離がないかチェックをしている様子。つまり、データディクショナリーやキャッシュから引っ張ってきた情報と違いがないかを確認している。以降も様々なチェックをしている（途中で力尽きた）

ここまでで、InnoDB層での「テーブルを開く」はディクショナリーキャッシュからテーブルオブジェクトを取得し、もしなければデータディクショナリから引っ張ってくる事と言えそうだ。

要するに

さて、順番に箇条書きにしてまとめると、以下がInnoDB層の挙動も含めた「テーブルを開く」であると思われる

テーブルディスクリプタ（TABLE）を作成する
テーブルハンドラ（handler）を作成する
InnoDBハンドラ（ha_innobase）を通してディクショナリキャッシュもしくはデータディクショナリからテーブルオブジェクトを取得する
テーブルキャッシュにテーブルディスクリプタを保存する

まだモヤモヤする部分がポツポツあったりするがが、勉強していけばその内ハッとくる時が来るであろう。

2023-09-07

EXPLAIN FORMAT=JSONに表示されるコストを理解する旅に出た話

MySQL

環境

mysql> select version();
+--------------+
| version()    |
+--------------+
| 8.0.28-debug |
+--------------+
1 row in set (0.00 sec)

mysql> show create table test\G
*************************** 1. row ***************************
       Table: test
Create Table: CREATE TABLE `test` (
  `id` int NOT NULL,
  `name` varchar(100) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci
1 row in set (0.03 sec)

mysql> select count(*) from test;
+----------+
| count(*) |
+----------+
|  1048584 |
+----------+
1 row in set (0.84 sec)

mysql> select * from test limit 5;
+----+-------+
| id | name  |
+----+-------+
|  1 | kubo1 |
|  2 | kubo2 |
|  3 | kubo3 |
|  4 | kubo4 |
|  5 | kubo5 |
+----+-------+
5 rows in set (0.00 sec)

EXPLAIN FORMAT=JSONでコスト表示

mysql> select * from test where name = 'kubo3'\G
*************************** 1. row ***************************
  id: 3
name: kubo3
1 row in set (5.11 sec)

mysql> explain format=json select * from test where name = 'kubo3'\G
*************************** 1. row ***************************
EXPLAIN: {
  "query_block": {
    "select_id": 1,
    "cost_info": {
      "query_cost": "100730.00"
    },
    "table": {
      "table_name": "test",
      "access_type": "ALL",
      "rows_examined_per_scan": 984540,
      "rows_produced_per_join": 98454,
      "filtered": "10.00",
      "cost_info": {
        "read_cost": "90884.60",
        "eval_cost": "9845.40",
        "prefix_cost": "100730.00",
        "data_read_per_join": "38M"
      },
      "used_columns": [
        "id",
        "name"
      ],
      "attached_condition": "(`kubo`.`test`.`name` = 'kubo3')"
    }
  }
}

オプティマイザトレースと同様に、EXPLAIN FORMAT=JSONで実行計画を表示するとどれだけのコストがかかったかを参照できる。ただ、ふむふむと見てみても正直なんとなくしか分からない。

FORMAT=JSONの場合のEXPLAIN特有の情報を掴めるようになりたい。

そこで今回は以下のコストの意味をざっくり理解していくことにする。

"rows_examined_per_scan": 984540,
      "rows_produced_per_join": 98454,
      "filtered": "10.00",
      "cost_info": {
        "read_cost": "90884.60",
        "eval_cost": "9845.40",
        "prefix_cost": "100730.00"

これらの意味を理解していくためにソースを読んで出処を追う旅に出ることにした。

※なお、記事の最後では諸々の数値をざっくり説明してますが、eval_costだけは計算式のみでどういうコストか言語化できていないです。

Explain_format_JSON

read_costやeval_costなどでgrepすると以下のクラスExplain_format_JSONがヒットする。

/**
  Formatter class for EXPLAIN FORMAT=JSON output
*/

class Explain_format_JSON : public Explain_format {
 private:
  opt_explain_json_namespace::context *current_context;  ///< current tree node

 public:
  Explain_format_JSON() : current_context(nullptr) {}

  bool is_hierarchical() const override { return true; }
  bool send_headers(Query_result *result) override;
  bool begin_context(enum_parsing_context context, Query_expression *subquery,
                     const Explain_format_flags *flags) override;
  bool end_context(enum_parsing_context context) override;
  bool flush_entry() override { return false; }
  qep_row *entry() override;
};

以下のように、先ほどのread_costやeval_costなどの文字列を表す定数が作成されている。ではこの定数はどこで使用されているのだろうか

// JSON key names
...
static const char K_ROWS[] = "rows_examined_per_scan";
static const char K_PREFIX_ROWS[] = "rows_produced_per_join";

static const char K_COST_INFO[] = "cost_info";
static const char K_READ_TIME[] = "read_cost";
static const char K_PREFIX_COST[] = "prefix_cost";
static const char K_COND_COST[] = "eval_cost";
...
static const char K_QUERY_COST[] = "query_cost";
static const char K_DATA_SIZE_QUERY[] = "data_read_per_join";

これら定数は以下の同クラスのメソッド（format_body）で利用されている。

cost_info内のコストに対応する部分について説明する（cost_info外のコストに対応する部分もほぼ同じ）。

まず、Opt_trace_object オブジェクトに json と K_COST_INFO というキーで初期化している（explain結果のcost_infoの部分に対応。中身はまだない）。そして、32バイトの文字配列 bufにcol_read_costやcol_prefix_costなどの浮動小数点を文字列にキャストして詰め込んでいる。最後に、そのbufと共にread _costやeval_cost,prefix_costを表していた定数をcost_infoのオブジェクトに詰め込まれている（explain結果のcost_infoの部分が完成）

bool table_base_ctx::format_body(Opt_trace_context *json,
                                 Opt_trace_object *obj) {
  StringBuffer<64> buff;

  ...

  if (!col_rows.is_empty()) obj->add(K_ROWS, col_rows.value);
  if (!col_prefix_rows.is_empty())
    obj->add(K_PREFIX_ROWS, col_prefix_rows.value);

  if (!col_filtered.is_empty()) {
    char buf[32];  // 32 is enough for digits of a double
    print_filtered(buf, sizeof(buf), col_filtered.value);
    obj->add_utf8(K_FILTERED, buf);
  }

  format_extra(obj);

  if (!col_read_cost.is_empty()) {
    Opt_trace_object cost_info(json, K_COST_INFO);
    char buf[32];  // 32 is enough for digits of a double

    print_cost(buf, sizeof(buf), col_read_cost.value);
    cost_info.add_utf8(K_READ_TIME, buf);

    if (!col_cond_cost.is_empty()) {
      print_cost(buf, sizeof(buf), col_cond_cost.value);
      cost_info.add_utf8(K_COND_COST, buf);
    }
    if (!col_prefix_cost.is_empty()) {
      print_cost(buf, sizeof(buf), col_prefix_cost.value);
      cost_info.add_utf8(K_PREFIX_COST, buf);
    }
    if (!col_data_size_query.is_empty())
      cost_info.add_utf8(K_DATA_SIZE_QUERY, col_data_size_query.str);
  }

  ...

  return format_derived(json) || format_query_expression(json);
}

したがって、各コストの値はcol_hogehogeのvalueに対応していそうな事がわかる. ここまでで以下のような対応が整理できる。

"rows_examined_per_scan" → col_rows
"rows_produced_per_join" → col_prefix_rows
"read_cost" → col_read_cost
"eval_cost" → col_cond_cost
"prefix_cost" → col_prefix_cost

さて、このcol_hogehoge達は一体何者なのだろうか。

qep_row

これらcol_hogehoge達はqep_rowというクラスのpublic memberである。qep_rowはテーブルに関する各種情報をキャッシュする役割を担っている（通常のEXPLAINの場合は各出力行の情報が保存され、jsonやtree形式だと各コンテキストの情報が保存される）。

これらには以下でcol_hogehogeの各値がsetされている。なお、fmtはexplain_thd->lex->explain_formatであり、この場合だとEXPLAIN_FORMAT_JSONである。entryメソッドによってJSONフォーマットの現在のコンテキストの情報（col_hogehogeのようなコスト情報）を抜き出している。

bool Explain_join::explain_rows_and_filtered() {
  if (!tab || tab->table_ref->schema_table) return false;

  POSITION *const pos = tab->position();

  if (explain_thd->lex->sql_command == SQLCOM_EXPLAIN_OTHER &&
      skip_records_in_range) {
    // Skipping col_rows, col_filtered, col_prefix_rows will set them to NULL.
    fmt->entry()->col_cond_cost.set(0);
    fmt->entry()->col_read_cost.set(0.0);
    fmt->entry()->col_prefix_cost.set(0);
    fmt->entry()->col_data_size_query.set("0");
  } else {
    fmt->entry()->col_rows.set(static_cast<ulonglong>(pos->rows_fetched));
    fmt->entry()->col_filtered.set(
        pos->rows_fetched
            ? static_cast<float>(100.0 * tab->position()->filter_effect)
            : 0.0f);

    // Print cost-related info
    double prefix_rows = pos->prefix_rowcount;
    ulonglong prefix_rows_ull =
        static_cast<ulonglong>(std::min(prefix_rows, ULLONG_MAX_DOUBLE));
    fmt->entry()->col_prefix_rows.set(prefix_rows_ull);
    double const cond_cost = join->cost_model()->row_evaluate_cost(prefix_rows);
    fmt->entry()->col_cond_cost.set(cond_cost < 0 ? 0 : cond_cost);
    fmt->entry()->col_read_cost.set(pos->read_cost < 0.0 ? 0.0
                                                         : pos->read_cost);
    fmt->entry()->col_prefix_cost.set(pos->prefix_cost);
    // Calculate amount of data from this table per query
    char data_size_str[32];
    double data_size = prefix_rows * tab->table()->s->rec_buff_length;
    human_readable_num_bytes(data_size_str, sizeof(data_size_str), data_size);
    fmt->entry()->col_data_size_query.set(data_size_str);
  }

  return false;
}

ここまでで以下のような対応が整理できる。

"rows_examined_per_scan" → col_rows 
                         → static_cast<ulonglong>(pos->rows_fetched)
                         
"rows_produced_per_join" → col_prefix_rows 
                         → static_cast<ulonglong>(std::min(prefix_rows, ULLONG_MAX_DOUBLE))
                         
"read_cost"              → col_read_cost 
                         → pos->read_cost < 0.0 ? 0.0: pos->read_cost
                         
"eval_cost"              → col_cond_cost 
                         →　cond_cost < 0 ? 0 : cond_cost
                         
"prefix_cost"            → col_prefix_cost 
                         → pos->prefix_cost
                         
※　double prefix_rows = pos->prefix_rowcount;
※　double const cond_cost = join->cost_model()->row_evaluate_cost(prefix_rows);

いくつかif分岐や大小比較があるが、col_hogehogeの値は大体pos→hugehugeの値が入っていると見て良いだろう。因みにjoin->cost_model()->row_evaluate_cost()は0.1倍する処理である（サーバーコストパラメータの一種。MySQL8.0ではデフォルト0.1, MySQL5.7ではデフォルト0.2）

ではposとはなんだろうか。

先程のソースの冒頭では以下のように記述されており、POSITIONオブジェクトへのポインタであると分かる。ではPOSITIONとは何だろうか。

POSITION *const pos = tab->position();

POSITION

POSITIONクラスは以下のような情報を含む。大雑把に言うと、クエリ内のテーブルに関する結合順序や結合方法についてまとめられている。結合クエリ内でどのような位置/立場(Position)にあるテーブルかを表す、と無理やり解釈するとクラス名にも合点がいく。

One POSITION element contains information about: ・Which table is accessed ・Which access method was chosen = Its cost and #of output records ・Semi-join strategy choice. Note that there are two different representation formats: The one used during join optimization The one used at plan refinement/code generation stage. We call fix_semijoin_strategies_for_picked_join_order() to switch between #1 and #2. See that function's comment for more details. ・Semi-join optimization state. When we're running join optimization, we main a state for every semi-join strategy which are various variables that tell us if/at which point we could consider applying the strategy. The variables are really a function of join prefix but they are too expensive to re-caclulate for every join prefix we consider, so we maintain current state in join->positions[#tables_in_prefix]. See advance_sj_state() for details. https://dev.mysql.com/doc/dev/mysql-server/latest/structPOSITION.html#details

このクラスの中のPublic Attributesとしてprefix_rowcountやprefix_cost、rows_fetched、read_costが存在し、以下のようにprefix_cost以外は何者か解説されている。

struct POSITION {
  /**
    The number of rows that will be fetched by the chosen access
    method per each row combination of previous tables. That is:

      rows_fetched = selectivity(access_condition) * cardinality(table)

    where 'access_condition' is whatever condition was chosen for
    index access, depending on the access method ('ref', 'range',
    etc.)

    @note For index/table scans, rows_fetched may be less than
    the number of rows in the table because the cost of evaluating
    constant conditions is included in the scan cost, and the number
    of rows produced by these scans is the estimated number of rows
    that pass the constant conditions. @see
    Optimize_table_order::calculate_scan_cost() . But this is only during
    planning; make_join_readinfo() simplifies it for EXPLAIN.
  */
  double rows_fetched;

  /**
    Cost of accessing the table in course of the entire complete join
    execution, i.e. cost of one access method use (e.g. 'range' or
    'ref' scan ) multiplied by estimated number of rows from tables
    earlier in the join sequence.

    read_cost does NOT include cost of processing rows within the
    executor (row_evaluate_cost).
  */
  double read_cost;

  /**
    The fraction of the 'rows_fetched' rows that will pass the table
    conditions that were NOT used by the access method. If, e.g.,

      "SELECT ... WHERE t1.colx = 4 and t1.coly @> 5"

    is resolved by ref access on t1.colx, filter_effect will be the
    fraction of rows that will pass the "t1.coly @> 5" predicate. The
    valid range is 0..1, where 0.0 means that no rows will pass the
    table conditions and 1.0 means that all rows will pass.

    It is used to calculate how many row combinations will be joined
    with the next table, @see prefix_rowcount below.

    @note With condition filtering enabled, it is possible to get
    a fanout = rows_fetched * filter_effect that is less than 1.0.
    Consider, e.g., a join between t1 and t2:

       "SELECT ... WHERE t1.col1=t2.colx and t2.coly OP @<something@>"

    where t1 is a prefix table and the optimizer currently calculates
    the cost of adding t2 to the join. Assume that the chosen access
    method on t2 is a 'ref' access on 'colx' that is estimated to
    produce 2 rows per row from t1 (i.e., rows_fetched = 2). It will
    in this case be perfectly fine to calculate a filtering effect
    @<0.5 (resulting in "rows_fetched * filter_effect @< 1.0") from the
    predicate "t2.coly OP @<something@>". If so, the number of row
    combinations from (t1,t2) is lower than the prefix_rowcount of t1.

    The above is just an example of how the fanout of a table can
    become less than one. It can happen for any access method.
  */
  float filter_effect;

  /**
    prefix_rowcount and prefix_cost form a stack of partial join
    order costs and output sizes

    prefix_rowcount: The number of row combinations that will be
    joined to the next table in the join sequence.

    For a joined table it is calculated as
      prefix_rowcount =
          last_table.prefix_rowcount * rows_fetched * filter_effect

    @see filter_effect

    For a semijoined table it may be less than this formula due to
    duplicate elimination.
  */
  double prefix_rowcount;
  double prefix_cost;

rows_fetchedはその名の通りであるが、選択されているアクセス方法によってフェッチされる行数である。全検索であれば全行になるだろうし、ユニークインデックスでの定数検索なら1行になる。この値はselectivity(access_condition) * cardinality(table)という式で計算されるみたいだ。

read_costは選択されたアクセス方法にかかるコスト（恐らく１行あたり）と見積もり行数（結合順でいう前のテーブルとの結合結果の見積もり行数）の乗算で表されるコストらしい。その名の通り読み取りのコストと呼ぶにふさわしい気がする。Executorでのフィルタリングやソートのコストは含まないとのことなので、選択されているアクセス方法によってフェッチされるコストと呼べば良さそう。

filter_effectはrows_fetched（選択されているアクセス方法によってフェッチされる行数）に対して何割がフィルタリングされるかを表す割合である。

prefix_rowcountは次のテーブルに結合される行の組み合わせの数。すなわち、選択されているアクセス方法によってフェッチされて、フィルタリングも終わった後の行数である。これはrow_fetched と filter_effectの乗算になるということである（結合順でいう前のテーブルがある場合はlast_table.prefix_rowcount、つまり前のテーブルから渡された行数の分の乗算をさらに考慮する必要がある）

なお、適当にステップ実行してたら以下のようなメソッドを発見した。確かにコメントの通りの計算方法が確認できる。

/**
    Set complete estimated cost and produced rowcount for the prefix of tables
    up to and including this table, calculated from the cost of the previous
    stage, the fanout of the current stage and the cost to process a row at
    the current stage.

    @param idx      Index of position object within array, if zero there is no
                    "previous" stage that can be added.
    @param cm       Cost model that provides the actual calculation
  */
  void set_prefix_join_cost(uint idx, const Cost_model_server *cm) {
    if (idx == 0) {
      prefix_rowcount = rows_fetched;
      prefix_cost = read_cost + cm->row_evaluate_cost(prefix_rowcount);
    } else {
      prefix_rowcount = (this - 1)->prefix_rowcount * rows_fetched;
      prefix_cost = (this - 1)->prefix_cost + read_cost +
                    cm->row_evaluate_cost(prefix_rowcount);
    }
    prefix_rowcount *= filter_effect;
  }

prefix_costについてはコメントで説明がなかったが、上記のメソッドを見る限りではread_costと0.1*prefix_rowcount（eval_cost）の和で表されるコストな様子。結合順でいう前のテーブルがある場合は、前のテーブルのprefix_costも足す様子。つまり最後の結合テーブルのprefix_costを見ると、全てのテーブルのread_costとeval_costの総和が分かるみたい。

ここまでで以下のような対応が整理できる。

"rows_examined_per_scan" → col_rows 
                         → static_cast<ulonglong>(pos->rows_fetched)
                         → 選択されているアクセス方法によってフェッチされる行数
                         
"rows_produced_per_join" → col_prefix_rows 
                         → static_cast<ulonglong>(std::min(prefix_rows, ULLONG_MAX_DOUBLE))
                         →　次のテーブルに結合される行の組み合わせの数
                         
"read_cost"              → col_read_cost 
                         → pos->read_cost < 0.0 ? 0.0: pos->read_cost
                         →　選択されているアクセス方法によって見積もり行数をフェッチするコスト
                         
"eval_cost"              → col_cond_cost 
                         →　cond_cost < 0 ? 0 : cond_cost
                         → 0.1 * rows_produced_per_join
                         
"prefix_cost"            → col_prefix_cost 
                         → pos->prefix_cost
                         → read_cost　＋　eval_cost　（＋前のテーブルのprefix_cost）
                         
※　double prefix_rows = pos->prefix_rowcount;
※　double const cond_cost = join->cost_model()->row_evaluate_cost(prefix_rows);

eval_costは結局何のコストと評すれば良いかまだ分からない。cost_modelというのも何だろう。シンプルケースの場合でもう少し深く見積もり過程も追ってみたい。オプティマイザとレースも比較したい。

まだまだ旅をしなければ...