Commit 5ba03ca

Bug #32089028 CONCURRENTLY UPDATING MANY JSON DOCUMENTS STEADILY INCREASES IBD FILE SIZE
Background: Currently the purge system groups undo records by table_id and then distributes the groups to different purge threads. This way, all undo records belonging to a single table are handled by the same purge thread. The grouping is done to avoid contention between purge threads. Our documentation says: "If DML action is concentrated on a single table or a few tables, keep the setting low so that the threads do not contend with each other for access to the tables."

Problem: What if all the DMLs happen on only one table? Grouping by table_id then makes the distribution of work between purge threads lopsided. For each purge batch, only one thread works on the undo records of that single table, so purge is slower and the purge lag grows. Increasing the number of purge threads does not help to purge faster.

Solution: Auto-tune the distribution of undo records across the purge threads.

Overall idea:
1. Create only as many groups as there are purge threads.
2. Distribute the undo records between these groups based on table_id.
3. Check whether the undo records are uniformly distributed.
4. If they are not uniformly distributed, redistribute them.

Maximum (MAX) records per purge group:
1. Let T be the total number of undo records in a purge batch.
2. Let M be the total number of purge threads.
3. The maximum number of records per group is (T + M)/M.

Minimum (MIN) records per purge group:
1. Let MAX be the maximum number of records per purge group.
2. Let M be the total number of purge threads.
3. If MAX > M, then MIN = MAX - M.
4. If MAX <= M, then MIN = 0.

Redistribution:
1. Let i be the first group.
2. If the i-th group has more than MAX records, move the extra records to the next group.
3. If it is the last group, move the extra records to the first group.
4. Increment i and go back to step 2.
5. A single pass is sufficient, but if extra records were moved from the last group to the first group, one more pass is needed. In the second pass, stop at the first group having fewer than MAX records.

What uniform distribution means in this context:
1. All groups have a record count between MIN and MAX.

rb#26108 approved by Debarun Banerjee <[email protected]>
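The following is a small, self-contained C++ sketch of the arithmetic described above, intended only as an illustration of the commit message; it is not the code added by this commit, which lives in the InnoDB purge subsystem. Groups are reduced to plain record counts, and all names (max_per_group, min_per_group, redistribute) are hypothetical.

#include <cstddef>
#include <iostream>
#include <vector>

// MAX records per group for a batch of n_total records and n_threads purge
// threads: (T + M) / M.
static std::size_t max_per_group(std::size_t n_total, std::size_t n_threads) {
  return (n_total + n_threads) / n_threads;
}

// MIN records per group, derived from MAX: MAX - M when MAX > M, else 0.
static std::size_t min_per_group(std::size_t max_rec, std::size_t n_threads) {
  return (max_rec > n_threads) ? (max_rec - n_threads) : 0;
}

// Redistribution pass: walk the groups once, pushing any records above
// max_rec into the next group (the last group pushes into the first). If the
// last group overflowed into the first, run one more pass that stops at the
// first group holding fewer than max_rec records.
static void redistribute(std::vector<std::size_t> &groups, std::size_t max_rec) {
  const std::size_t n = groups.size();
  bool wrapped = false;
  for (std::size_t i = 0; i < n; ++i) {
    if (groups[i] > max_rec) {
      const std::size_t extra = groups[i] - max_rec;
      groups[i] = max_rec;
      const std::size_t next = (i + 1 == n) ? 0 : i + 1;
      groups[next] += extra;
      wrapped = wrapped || (next == 0);
    }
  }
  if (wrapped) {
    for (std::size_t i = 0; i < n && groups[i] >= max_rec; ++i) {
      if (groups[i] > max_rec) {
        const std::size_t extra = groups[i] - max_rec;
        groups[i] = max_rec;
        groups[(i + 1 == n) ? 0 : i + 1] += extra;
      }
    }
  }
}

int main() {
  // Worst case from the bug report: all DML hits one table, so every undo
  // record of the batch initially lands in the first group.
  const std::size_t n_threads = 4;
  std::vector<std::size_t> groups = {300, 0, 0, 0};

  const std::size_t max_rec = max_per_group(300, n_threads);     // (300 + 4) / 4 = 76
  const std::size_t min_rec = min_per_group(max_rec, n_threads); // 76 - 4 = 72

  redistribute(groups, max_rec);
  for (std::size_t g : groups) std::cout << g << ' ';  // 76 76 76 72
  std::cout << "MIN=" << min_rec << " MAX=" << max_rec << '\n';
  return 0;
}

With four purge threads and a 300-record batch that all targets one table, the sketch spreads the counts to 76, 76, 76 and 72, i.e. every group ends up between MIN (72) and MAX (76).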
1 parent b81014d commit 5ba03ca

24 files changed (+558, -176 lines)
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+SET GLOBAL innodb_fast_shutdown=0;
+# restart: --debug=d,innodb_purge_sleep_12
+CREATE TABLE t1 (f1 INT PRIMARY KEY, f2 JSON);
+SET @data_1 = REPEAT('Ax', 16384);
+SET @data_2 = REPEAT('By', 16384);
+SET @data_3 = REPEAT('Cz', 16384);
+SET @data_4 = REPEAT('Pn', 16384);
+SET @json_doc_1 = CONCAT('["', @data_1, '", "', @data_2, '", "', @data_3, '", "', @data_4, '"]');
+INSERT INTO t1 (f1, f2) VALUES (1, @json_doc_1);
+start transaction;
+SELECT f1 FROM t1;
+f1
+1
+SET @e1 = REPEAT('Do', 16384);
+SET @e2 = REPEAT('Em', 16384);
+SET @e3 = REPEAT('Fi', 16384);
+SET @e4 = REPEAT('Gj', 16384);
+SET @json_doc_2 = CONCAT('["', @e1, '", "', @e2, '", "', @e3, '", "', @e4, '"]');
+UPDATE t1 SET f2 = @json_doc2;
+SET @e4 = REPEAT('Hu', 16384);
+SET @elem4 = CONCAT('"', @e4, '"');
+UPDATE t1 SET f2 = JSON_SET(f2, '$[3]', @elem4) WHERE f1 = 1;
+DELETE FROM t1;
+commit;
+DROP TABLE t1;
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+--max_allowed_packet=500M
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+--source include/count_sessions.inc
+--source include/have_debug.inc
+
+SET GLOBAL innodb_fast_shutdown=0;
+let $restart_parameters = restart: --debug=d,innodb_purge_sleep_12;
+--source include/restart_mysqld.inc
+
+CREATE TABLE t1 (f1 INT PRIMARY KEY, f2 JSON);
+SET @data_1 = REPEAT('Ax', 16384);
+SET @data_2 = REPEAT('By', 16384);
+SET @data_3 = REPEAT('Cz', 16384);
+SET @data_4 = REPEAT('Pn', 16384);
+SET @json_doc_1 = CONCAT('["', @data_1, '", "', @data_2, '", "', @data_3, '", "', @data_4, '"]');
+INSERT INTO t1 (f1, f2) VALUES (1, @json_doc_1);
+
+connect (con1,localhost,root,,);
+connection con1;
+start transaction;
+SELECT f1 FROM t1;
+
+connection default;
+SET @e1 = REPEAT('Do', 16384);
+SET @e2 = REPEAT('Em', 16384);
+SET @e3 = REPEAT('Fi', 16384);
+SET @e4 = REPEAT('Gj', 16384);
+SET @json_doc_2 = CONCAT('["', @e1, '", "', @e2, '", "', @e3, '", "', @e4, '"]');
+UPDATE t1 SET f2 = @json_doc2;
+
+SET @e4 = REPEAT('Hu', 16384);
+SET @elem4 = CONCAT('"', @e4, '"');
+UPDATE t1 SET f2 = JSON_SET(f2, '$[3]', @elem4) WHERE f1 = 1;
+DELETE FROM t1;
+
+connection con1;
+commit;
+connection default;
+disconnect con1;
+--source include/wait_innodb_all_purged.inc
+--source include/wait_until_count_sessions.inc
+
+DROP TABLE t1;

storage/innobase/btr/btr0btr.cc

Lines changed: 8 additions & 5 deletions
@@ -2269,7 +2269,8 @@ static rec_t *btr_insert_into_right_sibling(uint32_t flags, btr_cur_t *cursor,
 /* We have to change the parent node pointer */
 
 compressed = btr_cur_pessimistic_delete(&err, TRUE, &next_father_cursor,
-BTR_CREATE_FLAG, false, 0, 0, 0, mtr);
+BTR_CREATE_FLAG, false, 0, 0, 0, mtr,
+nullptr, nullptr);
 
 ut_a(err == DB_SUCCESS);
 
@@ -2826,8 +2827,9 @@ void btr_node_ptr_delete(dict_index_t *index, buf_block_t *block, mtr_t *mtr) {
 /* Delete node pointer on father page */
 btr_page_get_father(index, block, mtr, &cursor);
 
-compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor, BTR_CREATE_FLAG,
-false, 0, 0, 0, mtr);
+compressed =
+btr_cur_pessimistic_delete(&err, TRUE, &cursor, BTR_CREATE_FLAG, false, 0,
+0, 0, mtr, nullptr, nullptr);
 ut_a(err == DB_SUCCESS);
 
 if (!compressed) {
@@ -3374,8 +3376,9 @@ ibool btr_compress(
 lock_prdt_page_free_from_discard(block, lock_sys->prdt_page_hash);
 lock_rec_free_all_from_discard_page(block);
 } else {
-compressed = btr_cur_pessimistic_delete(
-&err, TRUE, &cursor2, BTR_CREATE_FLAG, false, 0, 0, 0, mtr);
+compressed =
+btr_cur_pessimistic_delete(&err, TRUE, &cursor2, BTR_CREATE_FLAG,
+false, 0, 0, 0, mtr, nullptr, nullptr);
 ut_a(err == DB_SUCCESS);
 
 if (!compressed) {

storage/innobase/btr/btr0cur.cc

Lines changed: 3 additions & 2 deletions
@@ -4625,7 +4625,7 @@ ibool btr_cur_pessimistic_delete(dberr_t *err, ibool has_reserved_extents,
 btr_cur_t *cursor, uint32_t flags,
 bool rollback, trx_id_t trx_id,
 undo_no_t undo_no, ulint rec_type, mtr_t *mtr,
-btr_pcur_t *pcur) {
+btr_pcur_t *pcur, purge_node_t *node) {
 DBUG_TRACE;
 
 buf_block_t *block;
@@ -4670,7 +4670,8 @@ ibool btr_cur_pessimistic_delete(dberr_t *err, ibool has_reserved_extents,
 
 /* The following call will restart the btr_mtr, which could change the
 cursor position. */
-btr_ctx.free_externally_stored_fields(trx_id, undo_no, rollback, rec_type);
+btr_ctx.free_externally_stored_fields(trx_id, undo_no, rollback, rec_type,
+node);
 
 /* The cursor position could have changed now. */
 if (pcur != nullptr) {

storage/innobase/dict/dict0dict.cc

Lines changed: 2 additions & 1 deletion
@@ -5368,7 +5368,8 @@ dberr_t DDTableBuffer::remove(table_id_t id) {
 DEBUG_SYNC_C("delete_metadata_before");
 
 btr_cur_pessimistic_delete(&error, false, btr_pcur_get_btr_cur(&pcur),
-BTR_CREATE_FLAG, false, 0, 0, 0, &mtr);
+BTR_CREATE_FLAG, false, 0, 0, 0, &mtr, nullptr,
+nullptr);
 ut_ad(error == DB_SUCCESS);
 }
 
storage/innobase/fsp/fsp0fsp.cc

Lines changed: 6 additions & 2 deletions
@@ -2187,10 +2187,13 @@ page_no_t fseg_get_nth_frag_page_no(
 mtr_t *mtr MY_ATTRIBUTE((unused)))
 /*!< in/out: mini-transaction */
 {
+#ifdef UNIV_DEBUG
+const std::size_t n_slots = FSEG_FRAG_ARR_N_SLOTS;
 ut_ad(inode && mtr);
-ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
+ut_ad(n < n_slots);
 ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_SX_FIX));
 ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+#endif /* UNIV_DEBUG */
 return (mach_read_from_4(inode + FSEG_FRAG_ARR + n * FSEG_FRAG_SLOT_SIZE));
 }
 
@@ -3436,7 +3439,8 @@ static void fseg_free_page_low(fseg_inode_t *seg_inode,
 /* The page is in the fragment pages of the segment */
 
 for (i = 0;; i++) {
-if (fseg_get_nth_frag_page_no(seg_inode, i, mtr) == page_id.page_no()) {
+const page_no_t page_no = fseg_get_nth_frag_page_no(seg_inode, i, mtr);
+if (page_no == page_id.page_no()) {
 fseg_set_nth_frag_page_no(seg_inode, i, FIL_NULL, mtr);
 break;
 }

storage/innobase/gis/gis0rtree.cc

Lines changed: 3 additions & 2 deletions
@@ -1633,8 +1633,9 @@ void rtr_node_ptr_delete(dict_index_t *index, btr_cur_t *sea_cur,
 ibool compressed;
 dberr_t err;
 
-compressed = btr_cur_pessimistic_delete(&err, TRUE, sea_cur, BTR_CREATE_FLAG,
-false, 0, 0, 0, mtr);
+compressed =
+btr_cur_pessimistic_delete(&err, TRUE, sea_cur, BTR_CREATE_FLAG, false, 0,
+0, 0, mtr, nullptr, nullptr);
 ut_a(err == DB_SUCCESS);
 
 if (!compressed) {

storage/innobase/ibuf/ibuf0ibuf.cc

Lines changed: 1 addition & 1 deletion
@@ -4014,7 +4014,7 @@ static MY_ATTRIBUTE((warn_unused_result)) ibool ibuf_delete_rec(
 root = ibuf_tree_root_get(mtr);
 
 btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur), 0, false,
-0, 0, 0, mtr);
+0, 0, 0, mtr, nullptr, nullptr);
 ut_a(err == DB_SUCCESS);
 
 #ifdef UNIV_IBUF_COUNT_DEBUG

storage/innobase/include/btr0cur.h

Lines changed: 9 additions & 8 deletions
@@ -512,19 +512,20 @@ ibool btr_cur_optimistic_delete_func(
 occur, the cursor stays valid: it points to successor of
 deleted record on function exit
 @param[in] flags BTR_CREATE_FLAG or 0
-@param[in] rollback True if performing rollback, false otherwise.
-@param[in] trx_id The current transaction id.
-@param[in] undo_no Undo number of the transaction. This is needed for
-rollback to savepoint of partially updated LOB.
-@param[in] rec_type Undo record type.
-@param[in] mtr Mini-transaction
-@param[in] pcur Persistent cursor on the record to delete.
+@param[in] rollback True if performing rollback, false otherwise.
+@param[in] trx_id The current transaction id.
+@param[in] undo_no Undo number of the transaction. This is needed for
+rollback to savepoint of partially updated LOB.
+@param[in] rec_type Undo record type.
+@param[in] mtr The mini transaction
+@param[in] pcur Persistent cursor on the record to delete.
+@param[in,out] node purge node or nullptr
 @return true if compression occurred */
 ibool btr_cur_pessimistic_delete(dberr_t *err, ibool has_reserved_extents,
 btr_cur_t *cursor, uint32_t flags,
 bool rollback, trx_id_t trx_id,
 undo_no_t undo_no, ulint rec_type, mtr_t *mtr,
-btr_pcur_t *pcur = nullptr);
+btr_pcur_t *pcur, purge_node_t *node);
 
 /** Parses a redo log record of updating a record in-place.
 @return end of log record or NULL */
