Bug#34986611 Test Ndb MTA unordered commits with log

blaudden · blaudden · commit 20384f7bfa72 · 2023-06-07T11:51:48.000+02:00
Multi-threaded-applier work for Ndb uncovered "bug#34229520 : Ndb MTA
unordered commits with log", which was fixed in 8.0.33 by commit
2510e0d6a972140476a17e0283572d36eaefb00a.

No testcase was added as the problem only showed up with a storage
engine which handles Binlogging itself (e.g. Ndb).

This patch adds the testcase.

Change-Id: Ib833e37ac5003a3fd4ca4c0cf8a6b6fafc41c1ba
diff --git a/mysql-test/suite/ndb_rpl/r/mta_commit_sync2.result b/mysql-test/suite/ndb_rpl/r/mta_commit_sync2.result
@@ -0,0 +1,140 @@
+include/master-slave.inc
+Warnings:
+Note	####	Sending passwords in plain text without SSL/TLS is extremely insecure.
+Note	####	Storing MySQL user name or password information in the connection metadata repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START REPLICA; see the 'START REPLICA Syntax' in the MySQL Manual for more information.
+[connection master]
+# Generate epoch transactions without dependencies by using WRITESET
+# dependency tracking
+show variables like 'binlog_transaction_dependency_tracking';
+Variable_name	Value
+binlog_transaction_dependency_tracking	WRITESET
+show variables like 'ndb_log_transaction_dependency';
+Variable_name	Value
+ndb_log_transaction_dependency	ON
+create table t1 (
+a int primary key,
+b varchar(100),
+c int) engine=ndb;
+insert into t1 values
+(1, "A", 0),
+(2, "A", 0),
+(3, "A", 0);
+# Valid state after epoch 0
+select * from t1 order by a;
+a	b	c
+1	A	0
+2	A	0
+3	A	0
+[connection slave]
+# Relevant replica variables
+show variables like 'replica_parallel_workers';
+Variable_name	Value
+replica_parallel_workers	10
+show variables like 'log_bin';
+Variable_name	Value
+log_bin	ON
+show variables like 'log_replica_updates';
+Variable_name	Value
+log_replica_updates	ON
+show variables like 'replica_preserve_commit_order';
+Variable_name	Value
+replica_preserve_commit_order	ON
+# Supress MTA errors generated by test
+call mtr.add_suppression(".*Could not execute Write_rows event.*");
+call mtr.add_suppression(".*possibly leaving data in inconsistent state.*");
+call mtr.add_suppression(".*worker has stopped after at least one previous worker encountered an error when replica-preserve-commit-order was enabled.*");
+include/start_slave.inc
+[connection master]
+include/sync_slave_sql_with_master.inc
+include/stop_slave.inc
+[connection master]
+# Epoch 1
+update t1 set b="B", c=1 where a=1;
+# Wait for ndb_binlog thread...
+# Valid state after epoch 1
+select * from t1 order by a;
+a	b	c
+1	B	1
+2	A	0
+3	A	0
+# Epoch 2
+begin;
+update t1 set b="C", c=2 where a=2;
+commit;
+# Wait for ndb_binlog thread...
+# Valid state after epoch 2
+select * from t1 order by a;
+a	b	c
+1	B	1
+2	C	2
+3	A	0
+# Epoch 3
+update t1 set b="D", c=3 where a=3;
+# Wait for ndb_binlog thread...
+# Valid state after epoch 3
+select * from t1 order by a;
+a	b	c
+1	B	1
+2	C	2
+3	D	3
+# Take a row lock on row 2 on one connection to the replica
+# This stops epoch 2 from finishing its prepare, and
+# therefore it will be unable to commit.
+# With commit ordering on, epoch 3 will not be able to commit
+# and will not become visible.
+[connection slave1]
+begin;
+select * from t1 where a=2 for update;
+a	b	c
+2	A	0
+# Start the replica from the other replica connection
+[connection slave]
+include/start_slave.inc
+# Row lock should stall epoch 2, and hence also
+# the independent epoch 3.
+# Check that no sign of epoch 3 can be seen while the
+# row lock is held.
+# This also stall the replica long enough to fail the transaction
+# being applied and thus replication will stop.
+# Expect that only epoch 1 successfully applied.
+select * from test.t1 order by a;
+a	b	c
+1	B	1
+2	A	0
+3	A	0
+# Now cleanup
+[connection slave1]
+# Release lock
+commit;
+[connection slave]
+# Wait for replication error
+include/wait_for_slave_sql_error.inc [errno=3030]
+# Show the error number and message after failure
+SELECT WORKER_ID, LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE
+FROM performance_schema.replication_applier_status_by_worker
+WHERE LAST_ERROR_NUMBER != 0;
+WORKER_ID	1
+LAST_ERROR_NUMBER	1180
+LAST_ERROR_MESSAGE	Worker 1 failed executing transaction 'ANONYMOUS' at source log master-bin.000001, end_log_pos NNN; Could not execute Write_rows event on table test.t1; Got error -1 - 'Unknown error -1' during COMMIT, Error_code: 1180; the event's source log master-bin.000001, end_log_pos NNN
+WORKER_ID	2
+LAST_ERROR_NUMBER	3030
+LAST_ERROR_MESSAGE	Worker 2 failed executing transaction 'ANONYMOUS' at source log master-bin.000001, end_log_pos NNN; Error 'Replica worker has stopped after at least one previous worker encountered an error when replica-preserve-commit-order was enabled. To preserve commit order, the last transaction executed by this thread has not been committed. When restarting the replica after fixing any failed threads, you should fix this worker as well.' on query. Default database: ''. Query: 'COMMIT'
+# Show the coordinator error number and message
+SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE
+FROM performance_schema.replication_applier_status_by_coordinator;
+LAST_ERROR_NUMBER	3030
+LAST_ERROR_MESSAGE	Coordinator stopped because there were error(s) in the worker(s). The most recent failure being: Worker 2 failed executing transaction 'ANONYMOUS' at source log master-bin.000001, end_log_pos NNN. See error log and/or performance_schema.replication_applier_status_by_worker table for more details about this failure or others, if any.
+# Start replication again
+include/start_slave.inc
+# Resync
+[connection master]
+# Show resynced
+select * from test.t1 order by a;
+a	b	c
+1	B	1
+2	C	2
+3	D	3
+[connection master]
+# Drop table
+drop table test.t1;
+include/rpl_end.inc
diff --git a/mysql-test/suite/ndb_rpl/t/mta_commit_sync2.cnf b/mysql-test/suite/ndb_rpl/t/mta_commit_sync2.cnf
@@ -0,0 +1,12 @@
+!include suite/ndb_rpl/my.cnf
+
+[mysqld]
+binlog-transaction-dependency-tracking=WRITESET
+ndb-log-transaction-dependency=ON
+replica-parallel-workers=10
+
+[mysqld.1.slave]
+
+# disable-log-bin must stay unset: the test expects log_bin=ON on the replica
+replica-preserve-commit-order=1
+log-replica-updates=1
diff --git a/mysql-test/suite/ndb_rpl/t/mta_commit_sync2.test b/mysql-test/suite/ndb_rpl/t/mta_commit_sync2.test
@@ -0,0 +1,155 @@
+--source include/have_ndb.inc
+--source include/have_binlog_format_mixed_or_row.inc
+# Checks that the replica does not make epoch 3 visible while epoch 2 is blocked by a row lock.
+# Configure replication, but do not start the replica yet (it is started explicitly below)
+--let $rpl_skip_start_slave= 1
+--source suite/ndb_rpl/ndb_master-slave.inc
+
+--echo # Generate epoch transactions without dependencies by using WRITESET
+--echo # dependency tracking
+show variables like 'binlog_transaction_dependency_tracking';
+show variables like 'ndb_log_transaction_dependency';
+
+create table t1 (
+  a int primary key,
+  b varchar(100),
+  c int) engine=ndb;
+
+# Initial content: three rows, all with b set to A and c set to 0
+insert into t1 values
+  (1, "A", 0),
+  (2, "A", 0),
+  (3, "A", 0);
+
+--echo # Valid state after epoch 0
+select * from t1 order by a;
+
+--source include/rpl_connection_slave.inc
+--echo # Relevant replica variables
+show variables like 'replica_parallel_workers';
+show variables like 'log_bin';
+show variables like 'log_replica_updates';
+show variables like 'replica_preserve_commit_order';
+
+--echo # Supress MTA errors generated by test
+call mtr.add_suppression(".*Could not execute Write_rows event.*");
+call mtr.add_suppression(".*possibly leaving data in inconsistent state.*");
+call mtr.add_suppression(".*worker has stopped after at least one previous worker encountered an error when replica-preserve-commit-order was enabled.*");
+
+--source include/start_slave.inc
+
+--source include/rpl_connection_master.inc
+--source include/sync_slave_sql_with_master.inc
+
+--source include/stop_slave.inc
+
+# Source and replica now hold the same 3 initial rows and the replica is stopped.
+# Next, run three transactions on the source that each update a different row,
+# so they can be applied in parallel but still have to commit in order.
+
+--source include/rpl_connection_master.inc
+
+--echo # Epoch 1
+update t1 set b="B", c=1 where a=1;
+--source suite/ndb/include/ndb_binlog_wait_own_changes.inc
+
+--echo # Valid state after epoch 1
+select * from t1 order by a;
+
+--echo # Epoch 2
+begin;
+update t1 set b="C", c=2 where a=2;
+commit;
+
+--source suite/ndb/include/ndb_binlog_wait_own_changes.inc
+
+--echo # Valid state after epoch 2
+select * from t1 order by a;
+
+--echo # Epoch 3
+update t1 set b="D", c=3 where a=3;
+
+--source suite/ndb/include/ndb_binlog_wait_own_changes.inc
+
+--echo # Valid state after epoch 3
+select * from t1 order by a;
+
+# Debugging aid, left disabled: show binlog events;
+
+--echo # Take a row lock on row 2 on one connection to the replica
+--echo # This stops epoch 2 from finishing its prepare, and
+--echo # therefore it will be unable to commit.
+--echo # With commit ordering on, epoch 3 will not be able to commit
+--echo # and will not become visible.
+
+--source include/rpl_connection_slave1.inc
+
+begin;
+select * from t1 where a=2 for update;
+
+# No commit here: the row lock on a=2 stays held until the cleanup further down
+--echo # Start the replica from the other replica connection
+--source include/rpl_connection_slave.inc
+--source include/start_slave.inc
+
+--echo # Row lock should stall epoch 2, and hence also
+--echo # the independent epoch 3.
+--echo # Check that no sign of epoch 3 can be seen while the
+--echo # row lock is held.
+--echo # This also stall the replica long enough to fail the transaction
+--echo # being applied and thus replication will stop.
+let $ct=20;
+while ($ct)
+{
+  if (`select count(1) as c from test.t1 where c=3`)
+  {
+    --echo FAIL : Epoch 3 committed while Epoch 2 not applicable!
+    select * from test.t1 order by a;
+    # The FAIL output above is not in the recorded result, so the test fails on mismatch
+  }
+  sleep 0.5;
+  dec $ct;
+}
+
+--echo # Expect that only epoch 1 successfully applied.
+select * from test.t1 order by a;
+
+--echo # Now cleanup
+
+--source include/rpl_connection_slave1.inc
+--echo # Release lock
+commit;
+
+--source include/rpl_connection_slave.inc
+
+--echo # Wait for replication error
+let $slave_sql_errno= 3030;
+--source include/wait_for_slave_sql_error.inc
+
+--echo # Show the error number and message after failure
+--replace_regex /end_log_pos [0-9]*/end_log_pos NNN/
+query_vertical
+  SELECT WORKER_ID, LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE
+    FROM performance_schema.replication_applier_status_by_worker
+    WHERE LAST_ERROR_NUMBER != 0;
+--echo # Show the coordinator error number and message
+--replace_regex /end_log_pos [0-9]*/end_log_pos NNN/
+query_vertical
+  SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE
+    FROM performance_schema.replication_applier_status_by_coordinator;
+
+--echo # Start replication again
+--source include/start_slave.inc
+
+--echo # Resync
+--source include/rpl_connection_master.inc
+--sync_slave_with_master
+
+--echo # Show resynced
+select * from test.t1 order by a;
+
+--source include/rpl_connection_master.inc
+--echo # Drop table
+drop table test.t1;
+
+--source include/rpl_end.inc