Skip to content

Commit cc775ab

Browse files
authored
[generate dump] Move the Core/Log collection to the End of process Execution and removed default timeout (sonic-net#2209)
Recently, an issue was seen during the test_max_limit[core] sonic-mgmt test (a test for auto-techsupport). A diff was taken between two techsupport process runs: in the first run the core file to save was too large (almost 1G), so saving it took up to 10 sec, and all the commands that followed, which append to the tar file, showed increased execution time. Finally, after 30 mins, the execution timed out. Besides, it's better to collect the logs at the end, since we can collect more info, and core files are mostly static, so it shouldn't matter much even if we collect them late. Thus, the core/log collection was moved to the end. But there is a catch with the above change. For example, if the system is in an unstable state and most of the individual commands start to time out, the techsupport dump eventually times out at 30 mins (because of the global timeout), and the dump is then pretty useless, since it might not contain any useful information at all. Thus, I've removed the default global timeout. Clients can/should knowingly provide a value using the -g option if the execution time has to be capped. A global timeout of 60 mins is used for the auto-techsupport invocation. The fix related to the since argument was overwritten by the latest commit related to auto-techsupport on master, which is the reason for issues like "Auto-Techsupport collect logs beyond since value" (sonic-buildimage#11208); made changes here to fix that issue. Signed-off-by: Vivek Reddy Karri <[email protected]>
1 parent 6dbb4bd commit cc775ab

File tree

5 files changed

+40
-18
lines changed

5 files changed

+40
-18
lines changed

scripts/generate_dump

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1287,11 +1287,6 @@ main() {
12871287
end_t=$(date +%s%3N)
12881288
echo "[ Capture Proc State ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO
12891289

1290-
# Save logs and cores early
1291-
save_log_files
1292-
save_crash_files
1293-
save_warmboot_files
1294-
12951290
# Save all the processes within each docker
12961291
save_cmd "show services" services.summary
12971292

@@ -1426,6 +1421,10 @@ main() {
14261421
end_t=$(date +%s%3N)
14271422
echo "[ TAR /etc Files ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO
14281423

1424+
save_log_files
1425+
save_crash_files
1426+
save_warmboot_files
1427+
14291428
finalize
14301429
}
14311430

show/main.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1266,7 +1266,7 @@ def users(verbose):
12661266

12671267
@cli.command()
12681268
@click.option('--since', required=False, help="Collect logs and core files since given date")
1269-
@click.option('-g', '--global-timeout', default=30, type=int, help="Global timeout in minutes. Default 30 mins")
1269+
@click.option('-g', '--global-timeout', required=False, type=int, help="Global timeout in minutes. WARN: Dump might be incomplete if enforced")
12701270
@click.option('-c', '--cmd-timeout', default=5, type=int, help="Individual command timeout in minutes. Default 5 mins")
12711271
@click.option('--verbose', is_flag=True, help="Enable verbose output")
12721272
@click.option('--allow-process-stop', is_flag=True, help="Dump additional data which may require system interruption")
@@ -1275,7 +1275,10 @@ def users(verbose):
12751275
@click.option('--redirect-stderr', '-r', is_flag=True, help="Redirect an intermediate errors to STDERR")
12761276
def techsupport(since, global_timeout, cmd_timeout, verbose, allow_process_stop, silent, debug_dump, redirect_stderr):
12771277
"""Gather information for troubleshooting"""
1278-
cmd = "sudo timeout --kill-after={}s -s SIGTERM --foreground {}m".format(COMMAND_TIMEOUT, global_timeout)
1278+
cmd = "sudo"
1279+
1280+
if global_timeout:
1281+
cmd += " timeout --kill-after={}s -s SIGTERM --foreground {}m".format(COMMAND_TIMEOUT, global_timeout)
12791282

12801283
if allow_process_stop:
12811284
cmd += " -a"

tests/coredump_gen_handler_test.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
/tmp/saisdkdump
2222
"""
2323

24+
TS_DEFAULT_CMD = "show techsupport --silent --global-timeout 60 --since 2 days ago"
25+
2426
def signal_handler(signum, frame):
2527
raise Exception("Timed out!")
2628

@@ -270,7 +272,7 @@ def test_since_argument(self):
270272
def mock_cmd(cmd, env):
271273
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
272274
cmd_str = " ".join(cmd)
273-
if "--since '4 days ago'" in cmd_str:
275+
if "--since 4 days ago" in cmd_str:
274276
patcher.fs.create_file(ts_dump)
275277
return 0, AUTO_TS_STDOUT + ts_dump, ""
276278
elif "date --date=4 days ago" in cmd_str:
@@ -336,7 +338,7 @@ def test_invalid_since_argument(self):
336338
def mock_cmd(cmd, env):
337339
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
338340
cmd_str = " ".join(cmd)
339-
if "--since '2 days ago'" in cmd_str:
341+
if "--since 2 days ago" in cmd_str:
340342
patcher.fs.create_file(ts_dump)
341343
print(AUTO_TS_STDOUT + ts_dump)
342344
return 0, AUTO_TS_STDOUT + ts_dump, ""
@@ -429,3 +431,21 @@ def mock_cmd(cmd, env):
429431
finally:
430432
signal.alarm(0)
431433

434+
def test_auto_ts_options(self):
435+
"""
436+
Scenario: Check if the techsupport is called as expected
437+
"""
438+
db_wrap = Db()
439+
redis_mock = db_wrap.db
440+
set_auto_ts_cfg(redis_mock, state="enabled", since_cfg="2 days ago")
441+
set_feature_table_cfg(redis_mock, state="enabled")
442+
with Patcher() as patcher:
443+
def mock_cmd(cmd, env):
444+
cmd_str = " ".join(cmd)
445+
if "show techsupport" in cmd_str and cmd_str != TS_DEFAULT_CMD:
446+
assert False, "Expected TS_CMD: {}, Recieved: {}".format(TS_DEFAULT_CMD, cmd_str)
447+
return 0, AUTO_TS_STDOUT, ""
448+
ts_helper.subprocess_exec = mock_cmd
449+
patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz")
450+
cls = cdump_mod.CriticalProcCoreDumpHandle("orchagent.12345.123.core.gz", "swss", redis_mock)
451+
cls.handle_core_dump_creation_event()

tests/techsupport_test.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,18 @@
33
from unittest.mock import patch, Mock
44
from click.testing import CliRunner
55

6-
EXPECTED_BASE_COMMAND = 'sudo timeout --kill-after=300s -s SIGTERM --foreground '
6+
EXPECTED_BASE_COMMAND = 'sudo '
77

88
@patch("show.main.run_command")
99
@pytest.mark.parametrize(
1010
"cli_arguments,expected",
1111
[
12-
([], '30m generate_dump -v -t 5'),
13-
(['--since', '2 days ago'], "30m generate_dump -v -s '2 days ago' -t 5"),
14-
(['-g', '50'], '50m generate_dump -v -t 5'),
15-
(['--allow-process-stop'], '30m -a generate_dump -v -t 5'),
16-
(['--silent'], '30m generate_dump -t 5'),
17-
(['--debug-dump', '--redirect-stderr'], '30m generate_dump -v -d -t 5 -r'),
12+
([], 'generate_dump -v -t 5'),
13+
(['--since', '2 days ago'], "generate_dump -v -s '2 days ago' -t 5"),
14+
(['-g', '50'], 'timeout --kill-after=300s -s SIGTERM --foreground 50m generate_dump -v -t 5'),
15+
(['--allow-process-stop'], '-a generate_dump -v -t 5'),
16+
(['--silent'], 'generate_dump -t 5'),
17+
(['--debug-dump', '--redirect-stderr'], 'generate_dump -v -d -t 5 -r'),
1818
]
1919
)
2020
def test_techsupport(run_command, cli_arguments, expected):

utilities_common/auto_techsupport_helper.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868

6969
TIME_BUF = 20
7070
SINCE_DEFAULT = "2 days ago"
71+
TS_GLOBAL_TIMEOUT = "60"
7172

7273
# Explicitly pass this to the subprocess invoking techsupport
7374
ENV_VAR = os.environ
@@ -229,8 +230,7 @@ def parse_ts_dump_name(ts_stdout):
229230
def invoke_ts_cmd(db, num_retry=0):
230231
"""Invoke techsupport generation command"""
231232
since_cfg = get_since_arg(db)
232-
since_cfg = "'" + since_cfg + "'"
233-
cmd_opts = ["show", "techsupport", "--silent", "--since", since_cfg]
233+
cmd_opts = ["show", "techsupport", "--silent", "--global-timeout", TS_GLOBAL_TIMEOUT, "--since", since_cfg]
234234
cmd = " ".join(cmd_opts)
235235
rc, stdout, stderr = subprocess_exec(cmd_opts, env=ENV_VAR)
236236
new_dump = ""

0 commit comments

Comments
 (0)