Skip to content

Commit f70dc27

Browse files
authored
[techsupport] Handle minor fixes of TS Lock and update auto-TS (sonic-net#2114)
1. Print the techsupport dump name as the last statement, as some automation processes might depend on parsing the last line to infer the dump path. Previously: handle_exit Removing lock. Exit: 0 removed '/tmp/techsupport-lock/PID' removed directory '/tmp/techsupport-lock' Updated: handle_exit Removing lock. Exit: 0 removed '/tmp/techsupport-lock/PID' removed directory '/tmp/techsupport-lock' /var/dump/sonic_dump_r-bulldog-03_20220324_195553.tar.gz 2. Don't acquire the lock when running in NOOP mode. 3. Set the set -v option just before running main so that it won't print the generate_dump code to stdout. 4. Update the auto-techsupport script to handle the EXT_RETRY and EXT_LOCKFAIL exit codes returned by the show techsupport command. 5. Fix the minor error in the since argument for auto-techsupport. Signed-off-by: Vivek Keddy Karri <[email protected]>
1 parent 51d3550 commit f70dc27

File tree

4 files changed

+66
-16
lines changed

4 files changed

+66
-16
lines changed

scripts/coredump_gen_handler.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -110,18 +110,26 @@ def parse_ts_dump_name(self, ts_stdout):
110110
syslog.syslog(syslog.LOG_ERR, "stdout of the 'show techsupport' cmd doesn't have the dump name")
111111
return ""
112112

113-
def invoke_ts_cmd(self, since_cfg):
114-
since_cfg = "'" + since_cfg + "'"
113+
def invoke_ts_cmd(self, since_cfg, num_retry=0):
115114
cmd_opts = ["show", "techsupport", "--silent", "--since", since_cfg]
116115
cmd = " ".join(cmd_opts)
117116
rc, stdout, stderr = subprocess_exec(cmd_opts, env=ENV_VAR)
118-
if rc:
117+
new_dump = ""
118+
if rc == EXT_LOCKFAIL:
119+
syslog.syslog(syslog.LOG_NOTICE, "Another instance of techsupport running, aborting this. stderr: {}".format(stderr))
120+
elif rc == EXT_RETRY:
121+
if num_retry <= MAX_RETRY_LIMIT:
122+
return self.invoke_ts_cmd(since_cfg, num_retry+1)
123+
else:
124+
syslog.syslog(syslog.LOG_ERR, "MAX_RETRY_LIMIT for show techsupport invocation exceeded, stderr: {}".format(stderr))
125+
elif rc != EXT_SUCCESS:
119126
syslog.syslog(syslog.LOG_ERR, "show techsupport failed with exit code {}, stderr: {}".format(rc, stderr))
120-
new_dump = self.parse_ts_dump_name(stdout)
121-
if not new_dump:
122-
syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd))
123-
else:
124-
syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump))
127+
else: # EXT_SUCCESS
128+
new_dump = self.parse_ts_dump_name(stdout) # Parse the dump name
129+
if not new_dump:
130+
syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd))
131+
else:
132+
syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump))
125133
return new_dump
126134

127135
def verify_rate_limit_intervals(self, global_cooloff, container_cooloff):

scripts/generate_dump

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,10 @@ handle_exit()
6363
ECODE=$?
6464
echo "Removing lock. Exit: $ECODE" >&2
6565
$RM $V -rf ${LOCKDIR}
66+
# Echo the filename as the last statement if the generation succeeds
67+
if [[ -f $TARFILE && ($ECODE == $EXT_SUCCESS || $ECODE == $RETURN_CODE) ]]; then
68+
echo $TARFILE
69+
fi
6670
}
6771

6872
handle_signal()
@@ -1360,8 +1364,6 @@ main() {
13601364

13611365
# Invoke the TechSupport Cleanup Hook
13621366
setsid python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &> /tmp/techsupport_cleanup.log &
1363-
1364-
echo ${TARFILE}
13651367

13661368
if ! $SAVE_STDERR
13671369
then
@@ -1494,7 +1496,6 @@ while getopts ":xnvhzas:t:r:d" opt; do
14941496
;;
14951497
v)
14961498
# echo commands about to be run to stderr
1497-
set -v
14981499
V="-v"
14991500
;;
15001501
n)
@@ -1547,14 +1548,17 @@ fi
15471548
## Attempt Locking
15481549
##
15491550

1550-
if mkdir "${LOCKDIR}" &>/dev/null; then
1551+
if $MKDIR "${LOCKDIR}" &>/dev/null; then
15511552
trap 'handle_exit' EXIT
15521553
echo "$$" > "${PIDFILE}"
15531554
# This handler will exit the script upon receiving these interrupts
15541555
# Trap configured on EXIT will be triggered by the exit from handle_signal function
15551556
trap 'handle_signal' SIGINT SIGHUP SIGQUIT SIGTERM
15561557
echo "Lock succesfully accquired and installed signal handlers"
15571558
# Proceed with the actual code
1559+
if [[ ! -z "${V}" ]]; then
1560+
set -v
1561+
fi
15581562
main
15591563
else
15601564
# lock failed, check if the other PID is alive

tests/coredump_gen_handler_test.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@
33
import sys
44
import pyfakefs
55
import unittest
6+
import signal
67
from pyfakefs.fake_filesystem_unittest import Patcher
78
from swsscommon import swsscommon
89
from utilities_common.general import load_module_from_source
910
from utilities_common.db import Db
11+
from utilities_common.auto_techsupport_helper import EXT_RETRY
1012
from .mock_tables import dbconnector
1113

1214
sys.path.append("scripts")
@@ -18,6 +20,9 @@
1820
/tmp/saisdkdump
1921
"""
2022

23+
def signal_handler(signum, frame):
24+
raise Exception("Timed out!")
25+
2126
def set_auto_ts_cfg(redis_mock, state="disabled",
2227
rate_limit_interval="0",
2328
max_core_size="0.0",
@@ -264,7 +269,7 @@ def test_since_argument(self):
264269
def mock_cmd(cmd, env):
265270
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
266271
cmd_str = " ".join(cmd)
267-
if "--since '4 days ago'" in cmd_str:
272+
if "--since 4 days ago" in cmd_str:
268273
patcher.fs.create_file(ts_dump)
269274
return 0, AUTO_TS_STDOUT + ts_dump, ""
270275
elif "date --date=4 days ago" in cmd_str:
@@ -330,7 +335,7 @@ def test_invalid_since_argument(self):
330335
def mock_cmd(cmd, env):
331336
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
332337
cmd_str = " ".join(cmd)
333-
if "--since '2 days ago'" in cmd_str:
338+
if "--since 2 days ago" in cmd_str:
334339
patcher.fs.create_file(ts_dump)
335340
print(AUTO_TS_STDOUT + ts_dump)
336341
return 0, AUTO_TS_STDOUT + ts_dump, ""
@@ -396,3 +401,30 @@ def mock_cmd(cmd, env):
396401
assert "orchagent.12345.123.core.gz" in current_fs
397402
assert "lldpmgrd.12345.22.core.gz" in current_fs
398403
assert "python3.12345.21.core.gz" in current_fs
404+
405+
def test_max_retry_ts_failure(self):
406+
"""
407+
Scenario: TS subprocess is continuously returning EXT_RETRY
408+
Make sure auto-ts is not exceeding the limit
409+
"""
410+
db_wrap = Db()
411+
redis_mock = db_wrap.db
412+
set_auto_ts_cfg(redis_mock, state="enabled")
413+
set_feature_table_cfg(redis_mock, state="enabled")
414+
with Patcher() as patcher:
415+
def mock_cmd(cmd, env):
416+
return EXT_RETRY, "", ""
417+
418+
cdump_mod.subprocess_exec = mock_cmd
419+
patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz")
420+
cls = cdump_mod.CriticalProcCoreDumpHandle("orchagent.12345.123.core.gz", "swss", redis_mock)
421+
422+
signal.signal(signal.SIGALRM, signal_handler)
423+
signal.alarm(5) # 5 seconds
424+
try:
425+
cls.handle_core_dump_creation_event()
426+
except Exception:
427+
assert False, "Method should not time out"
428+
finally:
429+
signal.alarm(0)
430+

utilities_common/auto_techsupport_helper.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@
1111
"CORE_DUMP_DIR", "CORE_DUMP_PTRN", "TS_DIR", "TS_PTRN",
1212
"CFG_DB", "AUTO_TS", "CFG_STATE", "CFG_MAX_TS", "COOLOFF",
1313
"CFG_CORE_USAGE", "CFG_SINCE", "FEATURE", "STATE_DB",
14-
"TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER",
15-
"TIME_BUF", "SINCE_DEFAULT", "TS_PTRN_GLOB"
14+
"TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER", "TIME_BUF",
15+
"SINCE_DEFAULT", "TS_PTRN_GLOB", "EXT_LOCKFAIL", "EXT_RETRY",
16+
"EXT_SUCCESS", "MAX_RETRY_LIMIT"
1617
] + [ # Methods
1718
"verify_recent_file_creation",
1819
"get_ts_dumps",
@@ -60,6 +61,11 @@
6061
TIME_BUF = 20
6162
SINCE_DEFAULT = "2 days ago"
6263

64+
# Techsupport Exit Codes
65+
EXT_LOCKFAIL = 2
66+
EXT_RETRY = 4
67+
EXT_SUCCESS = 0
68+
MAX_RETRY_LIMIT = 2
6369

6470
# Helper methods
6571
def subprocess_exec(cmd, env=None):

0 commit comments

Comments
 (0)