Skip to content

Commit 9b721e8

Browse files
author
Ragasudha Chillara
committed
WL#12229 ATRT SUPPORTS TEST FAILURE RETRIES
Adds support for ATRT to re-run test cases when they fail. Modifies ATRT to accept a configuration file in which the user can specify the number of retries if a test case fails. If a test case fails, all the processes will be restarted and the test will be retried up to the number of times specified in the config file. Setting a retry value in the config file is optional; by default the retry value is '0', whereas a retry value less than zero is treated as an error and a retry value greater than five issues a warning. Updated the general algorithm in storage/ndb/test/run-test/README to include the retry algorithm. Changes were made in storage/ndb/test/run-test/check-tests.sh to accept the configuration when the "max-retries" parameter is specified.
1 parent 4dfae18 commit 9b721e8

File tree

4 files changed

+172
-125
lines changed

4 files changed

+172
-125
lines changed

storage/ndb/test/run-test/README

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,19 @@
1+
# Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
2+
#
3+
# This program is free software; you can redistribute it and/or modify
4+
# it under the terms of the GNU General Public License as published by
5+
# the Free Software Foundation; version 2 of the License.
6+
#
7+
# This program is distributed in the hope that it will be useful,
8+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
9+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10+
# GNU General Public License for more details.
11+
#
12+
# You should have received a copy of the GNU General Public License
13+
# along with this program; if not, write to the Free Software
14+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
15+
16+
117
run-test/README
218

319
This document describes how atrt works and how to use it.
@@ -16,6 +32,7 @@ atrt has the following main loop:
1632
start each ndb_mgmd
1733
connect to each ndb_mgmd
1834
for each read(test case)
35+
for (run = 1; run <= 1 + test_max_retries; run++)
1936
do
2037
if previous test failed (or is first test)
2138
stop each ndbd

storage/ndb/test/run-test/atrt.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ struct atrt_testcase {
121121
time_t m_max_time;
122122
BaseString m_name;
123123
BaseString m_mysqld_options;
124+
int m_max_retries;
124125

125126
struct Command {
126127
atrt_process::Type m_cmd_type;

storage/ndb/test/run-test/check-tests.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/bin/sh
22

3-
# Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
3+
# Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
44
#
55
# This program is free software; you can redistribute it and/or modify
66
# it under the terms of the GNU General Public License as published by
@@ -59,6 +59,8 @@ check_file(){
5959
testcase=$(expr $testcase + 4);;
6060
^type:*)
6161
;;
62+
^max-retries:*)
63+
;;
6264
^$)
6365
if [ $testcase -ne 7 ]
6466
then

storage/ndb/test/run-test/main.cpp

Lines changed: 151 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include <NdbSleep.h>
3030

3131
#define PATH_SEPARATOR DIR_SEPARATOR
32+
#define TESTCASE_RETRIES_THRESHOLD_WARNING 5
3233

3334
/** Global variables */
3435
static const char progname[] = "ndb_atrt";
@@ -325,161 +326,174 @@ int main(int argc, char **argv) {
325326
*/
326327
g_logger.debug("Entering main loop");
327328
while (!feof(g_test_case_file)) {
328-
/**
329-
* Do we need to restart ndb
330-
*/
331-
if (restart) {
332-
restart = false;
333-
g_logger.info("(Re)starting server processes...");
329+
atrt_testcase test_case;
330+
const int num_element_lines =
331+
read_test_case(g_test_case_file, test_case, lineno);
332+
if (num_element_lines == 0) {
333+
// Should be at end of file. Let while condition catch that.
334+
continue;
335+
}
336+
if (num_element_lines < 0) {
337+
g_logger.critical("Corrupt testcase at line %d (error %d)", lineno,
338+
num_element_lines);
339+
goto cleanup;
340+
}
341+
g_logger.info("#%d - %s", test_no, test_case.m_name.c_str());
334342

335-
if (!stop_processes(g_config, ~0)) {
336-
g_logger.critical("Failed to stop all processes");
337-
goto end;
343+
int result = 0;
344+
time_t elapsed;
345+
int testruns;
346+
int total_runs = 1 + test_case.m_max_retries;
347+
for (testruns = 1; testruns <= total_runs; testruns++) {
348+
if (testruns > 1) {
349+
g_logger.info("Retrying test #%d - '%s', attempt (%d/%d)", test_no,
350+
test_case.m_name.c_str(), testruns - 1,
351+
test_case.m_max_retries);
338352
}
339353

340-
g_logger.info("Waiting for all processes to stop...");
341-
if (!wait_for_processes_to_stop(g_config, ~0)) {
342-
g_logger.critical("Fail to stop all processes");
343-
goto end;
344-
}
354+
/**
355+
* Do we need to restart ndb
356+
*/
357+
if (restart) {
358+
restart = false;
359+
g_logger.info("(Re)starting server processes...");
360+
361+
if (!stop_processes(g_config, ~0)) {
362+
g_logger.critical("Failed to stop all processes");
363+
goto end;
364+
}
345365

346-
if (!setup_directories(g_config, 2)) {
347-
g_logger.critical("Failed to setup directories");
348-
goto end;
349-
}
366+
g_logger.info("Waiting for all processes to stop...");
367+
if (!wait_for_processes_to_stop(g_config, ~0)) {
368+
g_logger.critical("Fail to stop all processes");
369+
goto end;
370+
}
350371

351-
if (!setup_files(g_config, 2, 1)) {
352-
g_logger.critical("Failed to setup files");
353-
goto end;
354-
}
372+
if (!setup_directories(g_config, 2)) {
373+
g_logger.critical("Failed to setup directories");
374+
goto end;
375+
}
355376

356-
if (!setup_hosts(g_config)) {
357-
g_logger.critical("Failed to setup hosts");
358-
goto end;
359-
}
377+
if (!setup_files(g_config, 2, 1)) {
378+
g_logger.critical("Failed to setup files");
379+
goto end;
380+
}
360381

361-
g_logger.debug("Setup complete, starting servers");
362-
if (!start(g_config, p_ndb | p_servers)) {
363-
g_logger.critical("Failed to start server processes");
364-
g_logger.info("Gathering logs and saving them as test %u", test_no);
382+
if (!setup_hosts(g_config)) {
383+
g_logger.critical("Failed to setup hosts");
384+
goto end;
385+
}
386+
387+
g_logger.debug("Setup complete, starting servers");
388+
if (!start(g_config, p_ndb | p_servers)) {
389+
g_logger.critical("Failed to start server processes");
390+
g_logger.info("Gathering logs and saving them as test %u", test_no);
365391

366-
int tmp;
367-
if (!gather_result(g_config, &tmp)) {
368-
g_logger.critical("Failed to gather results");
392+
int tmp;
393+
if (!gather_result(g_config, &tmp)) {
394+
g_logger.critical("Failed to gather results");
395+
goto cleanup;
396+
}
397+
398+
if (g_report_file != 0) {
399+
fprintf(g_report_file, "%s ; %d ; %d ; %d ; %d\n", "start servers",
400+
test_no, ERR_FAILED_TO_START, 0, 0);
401+
fflush(g_report_file);
402+
}
403+
404+
BaseString resdir;
405+
resdir.assfmt("result.%d", test_no);
406+
remove_dir(resdir.c_str(), true);
407+
408+
if (rename("result", resdir.c_str()) != 0) {
409+
g_logger.critical("Failed to rename %s as %s", "result",
410+
resdir.c_str());
411+
goto cleanup;
412+
}
369413
goto cleanup;
370414
}
371415

372-
if (g_report_file != 0) {
373-
fprintf(g_report_file, "%s ; %d ; %d ; %d\n", "start servers",
374-
test_no, ERR_FAILED_TO_START, 0);
375-
fflush(g_report_file);
416+
if (!setup_db(g_config)) {
417+
g_logger.critical("Failed to setup database");
418+
goto cleanup;
376419
}
377420

378-
BaseString resdir;
379-
resdir.assfmt("result.%d", test_no);
380-
remove_dir(resdir.c_str(), true);
421+
g_logger.info("All servers start completed");
422+
}
381423

382-
if (rename("result", resdir.c_str()) != 0) {
383-
g_logger.critical("Failed to rename %s as %s", "result",
384-
resdir.c_str());
385-
goto cleanup;
386-
}
424+
// Assign processes to programs
425+
if (!setup_test_case(g_config, test_case)) {
426+
g_logger.critical("Failed to setup test case");
387427
goto cleanup;
388428
}
389429

390-
if (!setup_db(g_config)) {
391-
g_logger.critical("Failed to setup database");
430+
if (!start_processes(g_config, p_clients)) {
431+
g_logger.critical("Failed to start client processes");
392432
goto cleanup;
393433
}
394434

395-
g_logger.info("All servers start completed");
396-
}
435+
const time_t start = time(0);
436+
time_t now = start;
437+
do {
438+
if (!update_status(g_config, atrt_process::AP_ALL)) {
439+
g_logger.critical("Failed to get updated status for all processes");
440+
goto cleanup;
441+
}
397442

398-
// const int start_line = lineno;
399-
atrt_testcase test_case;
400-
const int num_element_lines =
401-
read_test_case(g_test_case_file, test_case, lineno);
402-
if (num_element_lines == 0) {
403-
// Should be at end of file. Let while condition catch that.
404-
continue;
405-
}
406-
if (num_element_lines < 0) {
407-
g_logger.critical("Corrupt testcase at line %d (error %d)", lineno,
408-
num_element_lines);
409-
goto cleanup;
410-
}
411-
g_logger.info("#%d - %s", test_no, test_case.m_name.c_str());
443+
if ((result = check_ndb_or_servers_failures(g_config))) {
444+
break;
445+
}
412446

413-
// Assign processes to programs
414-
if (!setup_test_case(g_config, test_case)) {
415-
g_logger.critical("Failed to setup test case");
416-
goto cleanup;
417-
}
447+
if (!is_client_running(g_config)) {
448+
break;
449+
}
418450

419-
if (!start_processes(g_config, p_clients)) {
420-
g_logger.critical("Failed to start client processes");
421-
goto cleanup;
422-
}
451+
if (!do_command(g_config)) {
452+
result = ERR_COMMAND_FAILED;
453+
g_logger.critical("Failure on client command execution");
454+
break;
455+
}
423456

424-
int result = 0;
457+
now = time(0);
458+
if (now > (start + test_case.m_max_time)) {
459+
g_logger.debug("Timed out");
460+
result = ERR_MAX_TIME_ELAPSED;
461+
g_logger.info("Timeout '%s' after %ld seconds",
462+
test_case.m_name.c_str(), test_case.m_max_time);
463+
break;
464+
}
465+
NdbSleep_SecSleep(1);
466+
} while (true);
425467

426-
const time_t start = time(0);
427-
time_t now = start;
428-
do {
429-
if (!update_status(g_config, atrt_process::AP_ALL)) {
430-
g_logger.critical("Failed to get updated status for all processes");
468+
elapsed = time(0) - start;
469+
if (!stop_processes(g_config, p_clients)) {
470+
g_logger.critical("Failed to stop client processes");
431471
goto cleanup;
432472
}
433473

434-
if ((result = check_ndb_or_servers_failures(g_config))) {
435-
break;
436-
}
437-
438-
if (!is_client_running(g_config)) {
439-
break;
474+
if (!wait_for_processes_to_stop(g_config, p_clients)) {
475+
g_logger.critical("Failed to stop client processes");
476+
goto cleanup;
440477
}
441478

442-
if (!do_command(g_config)) {
443-
result = ERR_COMMAND_FAILED;
444-
g_logger.critical("Failure on client command execution");
445-
break;
479+
int tmp, *rp = result ? &tmp : &result;
480+
if (!gather_result(g_config, rp)) {
481+
g_logger.critical("Failed to gather result after test run");
482+
goto end;
446483
}
447484

448-
now = time(0);
449-
if (now > (start + test_case.m_max_time)) {
450-
g_logger.debug("Timed out");
451-
result = ERR_MAX_TIME_ELAPSED;
452-
g_logger.info("Timeout '%s' after %ld seconds",
453-
test_case.m_name.c_str(), test_case.m_max_time);
485+
g_logger.info("#%d %s(%d)", test_no, (result == 0 ? "OK" : "FAILED"),
486+
result);
487+
if (result == 0) {
454488
break;
489+
} else {
490+
restart = true;
455491
}
456-
NdbSleep_SecSleep(1);
457-
} while (true);
458-
459-
const time_t elapsed = time(0) - start;
460-
461-
if (!stop_processes(g_config, p_clients)) {
462-
g_logger.critical("Failed to stop client processes");
463-
goto cleanup;
464-
}
465-
466-
if (!wait_for_processes_to_stop(g_config, p_clients)) {
467-
g_logger.critical("Failed to stop client processes");
468-
goto cleanup;
469492
}
470493

471-
int tmp, *rp = result ? &tmp : &result;
472-
if (!gather_result(g_config, rp)) {
473-
g_logger.critical("Failed to gather result after test run");
474-
goto end;
475-
}
476-
477-
g_logger.info("#%d %s(%d)", test_no, (result == 0 ? "OK" : "FAILED"),
478-
result);
479-
480494
if (g_report_file != 0) {
481-
fprintf(g_report_file, "%s ; %d ; %d ; %ld\n", test_case.m_name.c_str(),
482-
test_no, result, elapsed);
495+
fprintf(g_report_file, "%s ; %d ; %d ; %ld ; %d\n",
496+
test_case.m_name.c_str(), test_no, result, elapsed, testruns);
483497
fflush(g_report_file);
484498
}
485499

@@ -510,10 +524,6 @@ int main(int argc, char **argv) {
510524
if (reset_config(g_config)) {
511525
restart = true;
512526
}
513-
514-
if (result != 0) {
515-
restart = true;
516-
}
517527
test_no++;
518528
}
519529
return_code = 0;
@@ -526,8 +536,8 @@ int main(int argc, char **argv) {
526536
end:
527537
g_logger.info("Finishing, result: %d", return_code);
528538
if (return_code != 0 && g_report_file != 0) {
529-
fprintf(g_report_file, "%s ; %d ; %d ; %d\n", "critical error", test_no,
530-
ERR_FAILED_TO_START, 0);
539+
fprintf(g_report_file, "%s ; %d ; %d ; %d ; %d\n", "critical error",
540+
test_no, ERR_FAILED_TO_START, 0, 0);
531541
fflush(g_report_file);
532542
}
533543
if (g_report_file != 0) {
@@ -1379,6 +1389,23 @@ int read_test_case(FILE *file, atrt_testcase &tc, int &line) {
13791389
used_elements++;
13801390
}
13811391

1392+
tc.m_max_retries = 0;
1393+
if (p.get("max-retries", &mt)) {
1394+
tc.m_max_retries = atoi(mt);
1395+
used_elements++;
1396+
}
1397+
1398+
if (tc.m_max_retries < 0) {
1399+
g_logger.error("No of retries must not be less than zero for test '%s'",
1400+
tc.m_name.c_str());
1401+
return -4;
1402+
}
1403+
1404+
if (tc.m_max_retries > TESTCASE_RETRIES_THRESHOLD_WARNING)
1405+
g_logger.warning(
1406+
"No of retries should be less than or equal to %d for test '%s'",
1407+
TESTCASE_RETRIES_THRESHOLD_WARNING, tc.m_name.c_str());
1408+
13821409
if (used_elements != elements) {
13831410
g_logger.critical(
13841411
"Invalid test file: unknown properties in test case above line: %d",

0 commit comments

Comments
 (0)