Skip to content

Commit f94c382

Browse files
committed
Bug#31341888 RECURRING TEMPORARY ERRORS STALL NDB METADATA SYNC
Problem:
--------
Objects whose auto sync fails due to temporary errors, such as failed MDL acquisitions, are simply removed from the list of detected objects. Such objects are then eligible for detection in future cycles, after which the synchronization is retried and should hopefully succeed. This best-effort approach is suitable for the default auto sync behaviour but isn't ideal when the ndb_metadata_sync system variable is used. The ndb_metadata_sync variable triggers synchronization of all metadata; when synchronization is complete, the variable is automatically set to false to indicate that this has been done. If the temporary error persists for a sizable amount of time, the metadata sync could take longer than expected and in extreme cases could hang indefinitely pending user action. One such case is when the ndb_restore tool is used to restore metadata with the --disable-indexes option. The sync will enter a vicious cycle of detection and failed sync attempts due to the missing indexes until the indexes are rebuilt using ndb_restore.

Fix:
----
An object is blacklisted after its sync has failed on 10 occasions with temporary errors, which ensures that the sync isn't stalled. Objects whose sync has failed with temporary errors are maintained in a list along with a count of the number of retries. The list is validated during change detection in a similar manner to the blacklist.

Change-Id: I5ec9dcbfe5610a7887966ae6d29bbcc1dbfe4da4
1 parent 24d0c41 commit f94c382

10 files changed

+545
-107
lines changed

mysql-test/suite/ndb/t/ndb_metadata_upgrade_advanced.test

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ DROP TABLE t3;
154154
SET @old_ndb_metadata_check = @@global.ndb_metadata_check;
155155
SET @old_ndb_metadata_check_interval = @@global.ndb_metadata_check_interval;
156156
SET GLOBAL ndb_metadata_check = true;
157-
SET GLOBAL ndb_metadata_check_interval = 0;
157+
SET GLOBAL ndb_metadata_check_interval = 5;
158158
--enable_query_log
159159

160160
# Wait until the following 3 object changes are detected:
@@ -179,10 +179,8 @@ SET GLOBAL ndb_metadata_check_interval = @old_ndb_metadata_check_interval;
179179

180180
--disable_query_log
181181
# Set low interval so changes are detected quickly
182-
SET @old_ndb_metadata_check = @@global.ndb_metadata_check;
183-
SET @old_ndb_metadata_check_interval = @@global.ndb_metadata_check_interval;
184182
SET GLOBAL ndb_metadata_check = true;
185-
SET GLOBAL ndb_metadata_check_interval = 0;
183+
SET GLOBAL ndb_metadata_check_interval = 5;
186184
--enable_query_log
187185

188186
# Wait until the following 3 object changes are detected:
@@ -193,6 +191,12 @@ SET GLOBAL ndb_metadata_check_interval = 0;
193191
--let $max_wait = 30
194192
--source wait_metadata_changes_detected.inc
195193

194+
--disable_query_log
195+
# Changes have been detected, reset values
196+
SET GLOBAL ndb_metadata_check = @old_ndb_metadata_check;
197+
SET GLOBAL ndb_metadata_check_interval = @old_ndb_metadata_check_interval;
198+
--enable_query_log
199+
196200
--echo Confirm that the tables haven't been synced
197201
SHOW TABLES;
198202

@@ -201,15 +205,30 @@ SHOW TABLES;
201205

202206
# Indexes have been rebuilt which means that the tables are now
203207
# ready to be synced
208+
--let $initial_detected_count = query_get_value(SHOW STATUS LIKE 'Ndb_metadata_detected_count', Value, 1)
209+
--disable_query_log
210+
# Set low interval so changes are detected quickly
211+
SET GLOBAL ndb_metadata_check = true;
212+
SET GLOBAL ndb_metadata_check_interval = 0;
213+
--enable_query_log
214+
215+
# Wait until the following 3 object changes are detected:
216+
# Table 'test.t1'
217+
# Table 'test.t2'
218+
# Table 'test.t3'
219+
--let $expected_changes = 3
204220
--let $max_wait = 30
205-
--source wait_metadata_synced.inc
221+
--source wait_metadata_changes_detected.inc
206222

207223
--disable_query_log
208224
# Changes have been detected, reset values
209225
SET GLOBAL ndb_metadata_check = @old_ndb_metadata_check;
210226
SET GLOBAL ndb_metadata_check_interval = @old_ndb_metadata_check_interval;
211227
--enable_query_log
212228

229+
--let $max_wait = 30
230+
--source wait_metadata_synced.inc
231+
213232
--echo Confirm that the tables have been synced
214233
SHOW TABLES;
215234
--sorted_result

mysql-test/suite/ndb_ddl/metadata_immediate_sync.result

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,3 +265,55 @@ DROP DATABASE db1;
265265
DROP DATABASE db2;
266266
DROP DATABASE db3;
267267
DROP DATABASE db4;
268+
USE test;
269+
CREATE TABLE t1 (
270+
a INT PRIMARY KEY,
271+
b INT NOT NULL,
272+
c INT NOT NULL,
273+
UNIQUE ib(b),
274+
INDEX ic(c)
275+
) ENGINE NDB;
276+
INSERT INTO t1 VALUES (1,2,3),(2,3,5),(3,4,6),(4,5,8),(5,6,2),(6,7,2);
277+
CREATE TABLE parent (
278+
c1 INT PRIMARY KEY
279+
) ENGINE NDB;
280+
INSERT INTO parent VALUES (1),(2);
281+
CREATE TABLE child (
282+
c1 INT PRIMARY KEY,
283+
c2 INT,
284+
INDEX xc2(c2),
285+
FOREIGN KEY(c2)
286+
REFERENCES parent(c1)
287+
ON DELETE CASCADE
288+
) ENGINE NDB;
289+
INSERT INTO child VALUES (1,1),(2,2);
290+
Backup
291+
DROP TABLE t1,child,parent;
292+
Restore with --disable-indexes
293+
SET GLOBAL ndb_metadata_sync = true;
294+
SELECT schema_name, name, type FROM performance_schema.ndb_sync_excluded_objects;
295+
schema_name name type
296+
test child TABLE
297+
test parent TABLE
298+
test t1 TABLE
299+
Rebuild indexes in NDB
300+
SELECT * FROM t1;
301+
a b c
302+
1 2 3
303+
2 3 5
304+
3 4 6
305+
4 5 8
306+
5 6 2
307+
6 7 2
308+
SELECT * FROM parent;
309+
c1
310+
1
311+
2
312+
SELECT * FROM child;
313+
c1 c2
314+
1 1
315+
2 2
316+
SELECT COUNT(*) FROM performance_schema.ndb_sync_excluded_objects;
317+
COUNT(*)
318+
0
319+
DROP TABLE t1,child,parent;

mysql-test/suite/ndb_ddl/metadata_immediate_sync.test

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,5 +282,96 @@ DROP DATABASE db1;
282282
DROP DATABASE db2;
283283
DROP DATABASE db3;
284284
DROP DATABASE db4;
285+
286+
#
287+
# Case 3: Automatic synchronization of tables restored
288+
# with the --disable-indexes option.
289+
# This is a variant of the metadata_sync_restore_disable_index test
290+
# - Cause mismatch by restoring tables with --disable-indexes
291+
# - Wait until the tables are blacklisted after 10 unsuccessful tries
292+
# - Rebuild indexes and discover tables
293+
# - Clear the blacklist
294+
#
295+
296+
# Create tables with various indexes
297+
USE test;
298+
CREATE TABLE t1 (
299+
a INT PRIMARY KEY,
300+
b INT NOT NULL,
301+
c INT NOT NULL,
302+
UNIQUE ib(b),
303+
INDEX ic(c)
304+
) ENGINE NDB;
305+
306+
INSERT INTO t1 VALUES (1,2,3),(2,3,5),(3,4,6),(4,5,8),(5,6,2),(6,7,2);
307+
308+
CREATE TABLE parent (
309+
c1 INT PRIMARY KEY
310+
) ENGINE NDB;
311+
312+
INSERT INTO parent VALUES (1),(2);
313+
314+
CREATE TABLE child (
315+
c1 INT PRIMARY KEY,
316+
c2 INT,
317+
INDEX xc2(c2),
318+
FOREIGN KEY(c2)
319+
REFERENCES parent(c1)
320+
ON DELETE CASCADE
321+
) ENGINE NDB;
322+
323+
INSERT INTO child VALUES (1,1),(2,2);
324+
325+
--echo Backup
326+
--source include/ndb_backup.inc
327+
328+
DROP TABLE t1,child,parent;
329+
330+
--echo Restore with --disable-indexes
331+
--exec $NDB_RESTORE -b $the_backup_id -n 1 -m -r --disable-indexes $NDB_BACKUPS-$the_backup_id >> $NDB_TOOLS_OUTPUT
332+
--exec $NDB_RESTORE -b $the_backup_id -n 2 -r $NDB_BACKUPS-$the_backup_id >> $NDB_TOOLS_OUTPUT
333+
334+
# Wait until the changes have been synced
335+
--let $max_wait = 60
336+
--source wait_immediate_metadata_sync.inc
337+
338+
--sorted_result
339+
SELECT schema_name, name, type FROM performance_schema.ndb_sync_excluded_objects;
340+
341+
--echo Rebuild indexes in NDB
342+
--exec $NDB_RESTORE -b $the_backup_id -n 1 --rebuild-indexes $NDB_BACKUPS-$the_backup_id >> $NDB_TOOLS_OUTPUT
343+
344+
# Manually sync the tables through "discovery".
345+
# This will result in the tables being removed from the
346+
# blacklist during the next detection cycle
347+
--sorted_result
348+
SELECT * FROM t1;
349+
--sorted_result
350+
SELECT * FROM parent;
351+
--sorted_result
352+
SELECT * FROM child;
353+
354+
--disable_query_log
355+
# Enable metadata check with no interval to clear the blacklist quickly
356+
SET @old_ndb_metadata_check = @@global.ndb_metadata_check;
357+
SET @old_ndb_metadata_check_interval = @@global.ndb_metadata_check_interval;
358+
SET GLOBAL ndb_metadata_check_interval = 0;
359+
SET GLOBAL ndb_metadata_check = true;
360+
--enable_query_log
361+
362+
# Wait until the blacklist is empty
363+
--let $max_wait = 30
364+
--source wait_metadata_sync_blacklist_empty.inc
365+
366+
--disable_query_log
367+
# Reset values
368+
SET GLOBAL ndb_metadata_check = @old_ndb_metadata_check;
369+
SET GLOBAL ndb_metadata_check_interval = @old_ndb_metadata_check_interval;
370+
--enable_query_log
371+
372+
SELECT COUNT(*) FROM performance_schema.ndb_sync_excluded_objects;
373+
374+
# Clean-up
375+
DROP TABLE t1,child,parent;
285376
--source suite/ndb/include/backup_restore_cleanup.inc
286377
--remove_file $NDB_TOOLS_OUTPUT

storage/ndb/plugin/ha_ndbcluster_binlog.cc

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2006, 2020, Oracle and/or its affiliates. All rights reserved.
2+
Copyright (c) 2006, 2020, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -5555,6 +5555,10 @@ void ndbcluster_binlog_validate_sync_blacklist(THD *thd) {
55555555
ndb_binlog_thread.validate_sync_blacklist(thd);
55565556
}
55575557

5558+
void ndbcluster_binlog_validate_sync_retry_list(THD *thd) {
5559+
ndb_binlog_thread.validate_sync_retry_list(thd);
5560+
}
5561+
55585562
bool ndbcluster_binlog_check_table_async(const std::string &db_name,
55595563
const std::string &table_name) {
55605564
if (db_name.empty()) {

storage/ndb/plugin/ha_ndbcluster_binlog.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2000, 2020, Oracle and/or its affiliates. All rights reserved.
2+
Copyright (c) 2000, 2020, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -75,6 +75,12 @@ int show_ndb_status_injector(THD *, SHOW_VAR *var, char *);
7575
*/
7676
void ndbcluster_binlog_validate_sync_blacklist(THD *thd);
7777

78+
/**
79+
@brief Validate the list of objects whose synchronization has been retried
80+
@param thd Thread handle
81+
*/
82+
void ndbcluster_binlog_validate_sync_retry_list(THD *thd);
83+
7884
/**
7985
@brief Queue up tables which the ndb binlog thread needs to check for changes
8086
@param db_name The name of database the table belongs to

storage/ndb/plugin/ndb_binlog_thread.cc

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2014, 2020, Oracle and/or its affiliates. All rights reserved.
2+
Copyright (c) 2014, 2020, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -76,6 +76,10 @@ void Ndb_binlog_thread::validate_sync_blacklist(THD *thd) {
7676
metadata_sync.validate_blacklist(thd);
7777
}
7878

79+
void Ndb_binlog_thread::validate_sync_retry_list(THD *thd) {
80+
metadata_sync.validate_retry_list(thd);
81+
}
82+
7983
bool Ndb_binlog_thread::add_logfile_group_to_check(
8084
const std::string &lfg_name) {
8185
return metadata_sync.add_logfile_group(lfg_name);
@@ -160,10 +164,16 @@ void Ndb_binlog_thread::synchronize_detected_object(THD *thd) {
160164
object_name.c_str());
161165
increment_metadata_synced_count();
162166
} else if (temp_error) {
163-
log_info(
164-
"Failed to synchronize logfile group '%s' due to a temporary "
165-
"error",
166-
object_name.c_str());
167+
if (metadata_sync.retry_limit_exceeded(schema_name, object_name,
168+
object_type)) {
169+
metadata_sync.add_object_to_blacklist(schema_name, object_name,
170+
object_type, error_msg);
171+
} else {
172+
log_info(
173+
"Failed to synchronize logfile group '%s' due to a temporary "
174+
"error",
175+
object_name.c_str());
176+
}
167177
} else {
168178
log_error("Failed to synchronize logfile group '%s'",
169179
object_name.c_str());
@@ -182,10 +192,15 @@ void Ndb_binlog_thread::synchronize_detected_object(THD *thd) {
182192
object_name.c_str());
183193
increment_metadata_synced_count();
184194
} else if (temp_error) {
185-
log_info(
186-
"Failed to synchronize tablespace '%s' due to a temporary "
187-
"error",
188-
object_name.c_str());
195+
if (metadata_sync.retry_limit_exceeded(schema_name, object_name,
196+
object_type)) {
197+
metadata_sync.add_object_to_blacklist(schema_name, object_name,
198+
object_type, error_msg);
199+
} else {
200+
log_info(
201+
"Failed to synchronize tablespace '%s' due to a temporary error",
202+
object_name.c_str());
203+
}
189204
} else {
190205
log_error("Failed to synchronize tablespace '%s'", object_name.c_str());
191206
metadata_sync.add_object_to_blacklist(schema_name, object_name,
@@ -201,8 +216,14 @@ void Ndb_binlog_thread::synchronize_detected_object(THD *thd) {
201216
log_info("Schema '%s' successfully synchronized", schema_name.c_str());
202217
increment_metadata_synced_count();
203218
} else if (temp_error) {
204-
log_info("Failed to synchronize schema '%s' due to a temporary error",
205-
schema_name.c_str());
219+
if (metadata_sync.retry_limit_exceeded(schema_name, object_name,
220+
object_type)) {
221+
metadata_sync.add_object_to_blacklist(schema_name, object_name,
222+
object_type, error_msg);
223+
} else {
224+
log_info("Failed to synchronize schema '%s' due to a temporary error",
225+
schema_name.c_str());
226+
}
206227
} else {
207228
log_error("Failed to synchronize schema '%s'", schema_name.c_str());
208229
metadata_sync.add_object_to_blacklist(schema_name, object_name,
@@ -220,8 +241,15 @@ void Ndb_binlog_thread::synchronize_detected_object(THD *thd) {
220241
object_name.c_str());
221242
increment_metadata_synced_count();
222243
} else if (temp_error) {
223-
log_info("Failed to synchronize table '%s.%s' due to a temporary error",
224-
schema_name.c_str(), object_name.c_str());
244+
if (metadata_sync.retry_limit_exceeded(schema_name, object_name,
245+
object_type)) {
246+
metadata_sync.add_object_to_blacklist(schema_name, object_name,
247+
object_type, error_msg);
248+
} else {
249+
log_info(
250+
"Failed to synchronize table '%s.%s' due to a temporary error",
251+
schema_name.c_str(), object_name.c_str());
252+
}
225253
} else {
226254
log_error("Failed to synchronize table '%s.%s'", schema_name.c_str(),
227255
object_name.c_str());

storage/ndb/plugin/ndb_binlog_thread.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (c) 2014, 2020, Oracle and/or its affiliates. All rights reserved.
2+
Copyright (c) 2014, 2020, Oracle and/or its affiliates.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License, version 2.0,
@@ -68,6 +68,17 @@ class Ndb_binlog_thread : public Ndb_component {
6868
*/
6969
void validate_sync_blacklist(THD *thd);
7070

71+
/*
72+
@brief Iterate through the retry list of objects and check the present
73+
status of the objects. The object is removed if the mismatch no
74+
longer exists or if the object has been blacklisted
75+
76+
@param thd Thread handle
77+
78+
@return void
79+
*/
80+
void validate_sync_retry_list(THD *thd);
81+
7182
/*
7283
@brief Pass the logfile group object detected to the internal implementation
7384
that shall eventually synchronize the object

0 commit comments

Comments
 (0)