Skip to content

Commit 20384f7

Browse files
committed
Bug#34986611 Test Ndb MTA unordered commits with log
Multi-threaded applier (MTA) work for Ndb uncovered "bug#34229520 : Ndb MTA unordered commits with log", which was fixed in 8.0.33 by commit 2510e0d6a972140476a17e0283572d36eaefb00a. No testcase was added at the time, as the problem only shows up with a storage engine that handles binlogging itself (e.g. Ndb). This patch adds the testcase. Change-Id: Ib833e37ac5003a3fd4ca4c0cf8a6b6fafc41c1ba
1 parent 14a23e9 commit 20384f7

File tree

3 files changed

+307
-0
lines changed

3 files changed

+307
-0
lines changed
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
include/master-slave.inc
2+
Warnings:
3+
Note #### Sending passwords in plain text without SSL/TLS is extremely insecure.
4+
Note #### Storing MySQL user name or password information in the connection metadata repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START REPLICA; see the 'START REPLICA Syntax' in the MySQL Manual for more information.
5+
[connection master]
6+
# Generate epoch transactions without dependencies by using WRITESET
7+
# dependency tracking
8+
show variables like 'binlog_transaction_dependency_tracking';
9+
Variable_name Value
10+
binlog_transaction_dependency_tracking WRITESET
11+
show variables like 'ndb_log_transaction_dependency';
12+
Variable_name Value
13+
ndb_log_transaction_dependency ON
14+
create table t1 (
15+
a int primary key,
16+
b varchar(100),
17+
c int) engine=ndb;
18+
insert into t1 values
19+
(1, "A", 0),
20+
(2, "A", 0),
21+
(3, "A", 0);
22+
# Valid state after epoch 0
23+
select * from t1 order by a;
24+
a b c
25+
1 A 0
26+
2 A 0
27+
3 A 0
28+
[connection slave]
29+
# Relevant replica variables
30+
show variables like 'replica_parallel_workers';
31+
Variable_name Value
32+
replica_parallel_workers 10
33+
show variables like 'log_bin';
34+
Variable_name Value
35+
log_bin ON
36+
show variables like 'log_replica_updates';
37+
Variable_name Value
38+
log_replica_updates ON
39+
show variables like 'replica_preserve_commit_order';
40+
Variable_name Value
41+
replica_preserve_commit_order ON
42+
# Supress MTA errors generated by test
43+
call mtr.add_suppression(".*Could not execute Write_rows event.*");
44+
call mtr.add_suppression(".*possibly leaving data in inconsistent state.*");
45+
call mtr.add_suppression(".*worker has stopped after at least one previous worker encountered an error when replica-preserve-commit-order was enabled.*");
46+
include/start_slave.inc
47+
[connection master]
48+
include/sync_slave_sql_with_master.inc
49+
include/stop_slave.inc
50+
[connection master]
51+
# Epoch 1
52+
update t1 set b="B", c=1 where a=1;
53+
# Wait for ndb_binlog thread...
54+
# Valid state after epoch 1
55+
select * from t1 order by a;
56+
a b c
57+
1 B 1
58+
2 A 0
59+
3 A 0
60+
# Epoch 2
61+
begin;
62+
update t1 set b="C", c=2 where a=2;
63+
commit;
64+
# Wait for ndb_binlog thread...
65+
# Valid state after epoch 2
66+
select * from t1 order by a;
67+
a b c
68+
1 B 1
69+
2 C 2
70+
3 A 0
71+
# Epoch 3
72+
update t1 set b="D", c=3 where a=3;
73+
# Wait for ndb_binlog thread...
74+
# Valid state after epoch 3
75+
select * from t1 order by a;
76+
a b c
77+
1 B 1
78+
2 C 2
79+
3 D 3
80+
# Take a row lock on row 2 on one connection to the replica
81+
# This stops epoch 2 from finishing its prepare, and
82+
# therefore it will be unable to commit.
83+
# With commit ordering on, epoch 3 will not be able to commit
84+
# and will not become visible.
85+
[connection slave1]
86+
begin;
87+
select * from t1 where a=2 for update;
88+
a b c
89+
2 A 0
90+
# Start the replica from the other replica connection
91+
[connection slave]
92+
include/start_slave.inc
93+
# Row lock should stall epoch 2, and hence also
94+
# the independent epoch 3.
95+
# Check that no sign of epoch 3 can be seen while the
96+
# row lock is held.
97+
# This also stall the replica long enough to fail the transaction
98+
# being applied and thus replication will stop.
99+
# Expect that only epoch 1 successfully applied.
100+
select * from test.t1 order by a;
101+
a b c
102+
1 B 1
103+
2 A 0
104+
3 A 0
105+
# Now cleanup
106+
[connection slave1]
107+
# Release lock
108+
commit;
109+
[connection slave]
110+
# Wait for replication error
111+
include/wait_for_slave_sql_error.inc [errno=3030]
112+
# Show the error number and message after failure
113+
SELECT WORKER_ID, LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE
114+
FROM performance_schema.replication_applier_status_by_worker
115+
WHERE LAST_ERROR_NUMBER != 0;
116+
WORKER_ID 1
117+
LAST_ERROR_NUMBER 1180
118+
LAST_ERROR_MESSAGE Worker 1 failed executing transaction 'ANONYMOUS' at source log master-bin.000001, end_log_pos NNN; Could not execute Write_rows event on table test.t1; Got error -1 - 'Unknown error -1' during COMMIT, Error_code: 1180; the event's source log master-bin.000001, end_log_pos NNN
119+
WORKER_ID 2
120+
LAST_ERROR_NUMBER 3030
121+
LAST_ERROR_MESSAGE Worker 2 failed executing transaction 'ANONYMOUS' at source log master-bin.000001, end_log_pos NNN; Error 'Replica worker has stopped after at least one previous worker encountered an error when replica-preserve-commit-order was enabled. To preserve commit order, the last transaction executed by this thread has not been committed. When restarting the replica after fixing any failed threads, you should fix this worker as well.' on query. Default database: ''. Query: 'COMMIT'
122+
# Show the coordinator error number and message
123+
SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE
124+
FROM performance_schema.replication_applier_status_by_coordinator;
125+
LAST_ERROR_NUMBER 3030
126+
LAST_ERROR_MESSAGE Coordinator stopped because there were error(s) in the worker(s). The most recent failure being: Worker 2 failed executing transaction 'ANONYMOUS' at source log master-bin.000001, end_log_pos NNN. See error log and/or performance_schema.replication_applier_status_by_worker table for more details about this failure or others, if any.
127+
# Start replication again
128+
include/start_slave.inc
129+
# Resync
130+
[connection master]
131+
# Show resynced
132+
select * from test.t1 order by a;
133+
a b c
134+
1 B 1
135+
2 C 2
136+
3 D 3
137+
[connection master]
138+
# Drop table
139+
drop table test.t1;
140+
include/rpl_end.inc
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
!include suite/ndb_rpl/my.cnf
2+
3+
[mysqld]
4+
binlog-transaction-dependency-tracking=WRITESET
5+
ndb-log-transaction-dependency=ON
6+
replica-parallel-workers=10
7+
8+
[mysqld.1.slave]
9+
10+
#disable-log-bin
11+
replica-preserve-commit-order=1
12+
log-replica-updates=1
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
--source include/have_ndb.inc
2+
--source include/have_binlog_format_mixed_or_row.inc
3+
4+
# Configure replication, don't start applier
5+
--let $rpl_skip_start_slave= 1
6+
--source suite/ndb_rpl/ndb_master-slave.inc
7+
8+
--echo # Generate epoch transactions without dependencies by using WRITESET
9+
--echo # dependency tracking
10+
show variables like 'binlog_transaction_dependency_tracking';
11+
show variables like 'ndb_log_transaction_dependency';
12+
13+
create table t1 (
14+
a int primary key,
15+
b varchar(100),
16+
c int) engine=ndb;
17+
18+
# Initial row content
19+
insert into t1 values
20+
(1, "A", 0),
21+
(2, "A", 0),
22+
(3, "A", 0);
23+
24+
--echo # Valid state after epoch 0
25+
select * from t1 order by a;
26+
27+
--source include/rpl_connection_slave.inc
28+
--echo # Relevant replica variables
29+
show variables like 'replica_parallel_workers';
30+
show variables like 'log_bin';
31+
show variables like 'log_replica_updates';
32+
show variables like 'replica_preserve_commit_order';
33+
34+
--echo # Supress MTA errors generated by test
35+
call mtr.add_suppression(".*Could not execute Write_rows event.*");
36+
call mtr.add_suppression(".*possibly leaving data in inconsistent state.*");
37+
call mtr.add_suppression(".*worker has stopped after at least one previous worker encountered an error when replica-preserve-commit-order was enabled.*");
38+
39+
--source include/start_slave.inc
40+
41+
--source include/rpl_connection_master.inc
42+
--source include/sync_slave_sql_with_master.inc
43+
44+
--source include/stop_slave.inc
45+
46+
# Both clusters have the same initial 3 rows
47+
# Now define some independent transactions, allowing parallel apply
48+
# but still requiring commit order if specified.
49+
50+
--source include/rpl_connection_master.inc
51+
52+
--echo # Epoch 1
53+
update t1 set b="B", c=1 where a=1;
54+
--source suite/ndb/include/ndb_binlog_wait_own_changes.inc
55+
56+
--echo # Valid state after epoch 1
57+
select * from t1 order by a;
58+
59+
--echo # Epoch 2
60+
begin;
61+
update t1 set b="C", c=2 where a=2;
62+
commit;
63+
64+
--source suite/ndb/include/ndb_binlog_wait_own_changes.inc
65+
66+
--echo # Valid state after epoch 2
67+
select * from t1 order by a;
68+
69+
--echo # Epoch 3
70+
update t1 set b="D", c=3 where a=3;
71+
72+
--source suite/ndb/include/ndb_binlog_wait_own_changes.inc
73+
74+
--echo # Valid state after epoch 3
75+
select * from t1 order by a;
76+
77+
#show binlog events;
78+
79+
--echo # Take a row lock on row 2 on one connection to the replica
80+
--echo # This stops epoch 2 from finishing its prepare, and
81+
--echo # therefore it will be unable to commit.
82+
--echo # With commit ordering on, epoch 3 will not be able to commit
83+
--echo # and will not become visible.
84+
85+
--source include/rpl_connection_slave1.inc
86+
87+
begin;
88+
select * from t1 where a=2 for update;
89+
90+
91+
--echo # Start the replica from the other replica connection
92+
--source include/rpl_connection_slave.inc
93+
--source include/start_slave.inc
94+
95+
--echo # Row lock should stall epoch 2, and hence also
96+
--echo # the independent epoch 3.
97+
--echo # Check that no sign of epoch 3 can be seen while the
98+
--echo # row lock is held.
99+
--echo # This also stall the replica long enough to fail the transaction
100+
--echo # being applied and thus replication will stop.
101+
# Poll the replica up to 20 times, sleeping 0.5s between checks (about 10s
# in total), looking for any row with c=3, i.e. the value written by the
# epoch 3 update. While the row lock on a=2 is held, no such row should
# become visible.
let $ct=20;
102+
while ($ct)
103+
{
104+
if (`select count(1) as c from test.t1 where c=3`)
105+
{
106+
--echo FAIL : Epoch 3 committed while Epoch 2 not applicable!
107+
select * from test.t1 order by a;
108+
# Will cause result-content mismatch: the FAIL line and the extra
# SELECT output are not part of the recorded result.
109+
}
110+
sleep 0.5;
111+
dec $ct;
112+
}
113+
114+
--echo # Expect that only epoch 1 successfully applied.
115+
select * from test.t1 order by a;
116+
117+
--echo # Now cleanup
118+
119+
--source include/rpl_connection_slave1.inc
120+
--echo # Release lock
121+
commit;
122+
123+
--source include/rpl_connection_slave.inc
124+
125+
--echo # Wait for replication error
126+
let $slave_sql_errno= 3030;
127+
--source include/wait_for_slave_sql_error.inc
128+
129+
--echo # Show the error number and message after failure
130+
--replace_regex /end_log_pos [0-9]*/end_log_pos NNN/
131+
query_vertical
132+
SELECT WORKER_ID, LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE
133+
FROM performance_schema.replication_applier_status_by_worker
134+
WHERE LAST_ERROR_NUMBER != 0;
135+
--echo # Show the coordinator error number and message
136+
--replace_regex /end_log_pos [0-9]*/end_log_pos NNN/
137+
query_vertical
138+
SELECT LAST_ERROR_NUMBER, LAST_ERROR_MESSAGE
139+
FROM performance_schema.replication_applier_status_by_coordinator;
140+
141+
--echo # Start replication again
142+
--source include/start_slave.inc
143+
144+
--echo # Resync
145+
--source include/rpl_connection_master.inc
146+
--sync_slave_with_master
147+
148+
--echo # Show resynced
149+
select * from test.t1 order by a;
150+
151+
--source include/rpl_connection_master.inc
152+
--echo # Drop table
153+
drop table test.t1;
154+
155+
--source include/rpl_end.inc

0 commit comments

Comments
 (0)