Skip to content

Commit ca9a020

Browse files
authored
[generate_dump] [Mellanox] Fix the duplicate dfw dump collection problem by adding symlinks (sonic-net#2536)
- What I did Currently the dfw dumps, which are usually saved under /var/log/mellanox/sdk-dumps, are collected twice in the techsupport: once under log/ and once under the sai_sdk_dump/ folder. Fixed this by creating a symbolic link sai_sdk_dump/sai-dfw-xxxxxxxxx.tar.gz -> ../log/sai-dfw-xxxxxxxxx.tar.gz - How I did it dfw dumps are currently copied from syncd; the logic is updated to collect the files from the host if SAI_DUMP_STORE_PATH is mounted on the host. Fixed the duplicate dfw dump collection problem by adding a relative symbolic link from the sai_sdk_dump/ folder to the log/ folder. dfw dump collection is moved to a new function, collect_mellanox_dfw_dumps, which is run at the end, i.e. after the files under /var/log are saved - How to verify it root@switch:/home/admin# show techsupport --verbose root@switch:/home/admin/sonic_dump_r-lionfish-13_20221202_081958/log# ls -Al | grep dfw -rw-r--r-- 1 root root 1841061 Dec 2 08:21 sai-dfw-1669685690.tar.gz root@switch:/home/admin/sonic_dump_r-lionfish-13_20221202_081958/sai_sdk_dump# ls -Al Signed-off-by: Vivek Reddy Karri <[email protected]>
1 parent 92c7001 commit ca9a020

File tree

1 file changed

+94
-10
lines changed

1 file changed

+94
-10
lines changed

scripts/generate_dump

Lines changed: 94 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -993,6 +993,49 @@ enable_logrotate() {
993993
sed -i '/\/usr\/sbin\/logrotate/s/^#*//g' /etc/cron.d/logrotate
994994
}
995995

996+
###############################################################################
997+
# Create a relative symbolic link of an existing file
998+
# Globals:
999+
# BASE
1000+
# MKDIR
1001+
# TAR
1002+
# TARFILE
1003+
# DUMPDIR
1004+
# V
1005+
# RM
1006+
# NOOP
1007+
# Arguments:
1008+
# filename: the full path of the file
1009+
# dest_dir: destination dir where the link is created
1010+
# src_dir: directory under $TARDIR where the actual file exists
1011+
# Returns:
1012+
# None
1013+
###############################################################################
1014+
save_symlink() {
    # Create the relative symlink
    #   $TARDIR/<dest_dir>/<basename of filename> -> ../<src_dir>/<basename>
    # and, unless the optional 4th argument is "false", append the link to
    # $TARFILE and remove it from the staging directory afterwards.
    # The elapsed time is appended to $TECHSUPPORT_TIME_INFO.
    trap 'handle_error $? $LINENO' ERR
    local start_t=$(date +%s%3N)
    local end_t=0
    local filename=$1
    local dest_dir=$2
    local src_dir=$3
    local do_tar_append=${4:-true}
    # Quoted so that a path containing whitespace is not split into
    # "name" + "suffix" arguments for basename.
    local file_basename=$(basename -- "$filename")
    local tar_path="$BASE/$dest_dir/$file_basename"

    # $MKDIR, $TAR, $RM, $V and $CMD_PREFIX are left unquoted on purpose:
    # they may hold a command plus options (or be empty).
    $MKDIR $V -p "$TARDIR/$dest_dir"

    # A symlink target is resolved relative to the directory that holds the
    # link, so the link is created by explicit path. This avoids changing the
    # working directory (a failed pushd would otherwise drop the link into
    # whatever directory the script happened to be in).
    ${CMD_PREFIX}ln -s "../$src_dir/$file_basename" "$TARDIR/$dest_dir/$file_basename"

    if $do_tar_append; then
        # NOTE(review): the exit code used here is EXT_PROCFS_SAVE_FAILED even
        # though the failing operation is a tar append - confirm it is intended.
        ($TAR $V -rf "$TARFILE" -C "$DUMPDIR" "$tar_path" \
            || abort "${EXT_PROCFS_SAVE_FAILED}" "tar append operation failed. Aborting to prevent data loss.") \
            && $RM $V -f "$DUMPDIR/$tar_path"
    fi
    end_t=$(date +%s%3N)
    echo "[ save_symlink:$filename] : $(($end_t-$start_t)) msec" >> "$TECHSUPPORT_TIME_INFO"
}
9961039

9971040
###############################################################################
9981041
# Collect Mellanox specific information
@@ -1025,16 +1068,6 @@ collect_mellanox() {
10251068
${CMD_PREFIX}rm -rf $sai_dump_folder
10261069
${CMD_PREFIX}docker exec syncd rm -rf $sai_dump_folder
10271070

1028-
# Save SDK error dumps
1029-
local sdk_dump_path=`${CMD_PREFIX}docker exec syncd cat /tmp/sai.profile|grep "SAI_DUMP_STORE_PATH"|cut -d = -f2`
1030-
if [[ -d $sdk_dump_path ]]; then
1031-
copy_from_docker syncd $sdk_dump_path /tmp/sdk-dumps
1032-
for file in $(find /tmp/sdk-dumps -type f); do
1033-
save_file ${file} sai_sdk_dump false
1034-
done
1035-
rm -rf /tmp/sdk-dumps
1036-
fi
1037-
10381071
# run 'hw-management-generate-dump.sh' script and save the result file
10391072
HW_DUMP_FILE=/usr/bin/hw-management-generate-dump.sh
10401073
if [ -f "$HW_DUMP_FILE" ]; then
@@ -1056,6 +1089,53 @@ collect_mellanox() {
10561089

10571090
}
10581091

1092+
###############################################################################
1093+
# Collect dfw dumps if any. Applies to only MLNX platform
1094+
# Globals:
1095+
# CMD_PREFIX
1096+
# Arguments:
1097+
# None
1098+
# Returns:
1099+
# None
1100+
###############################################################################
1101+
collect_mellanox_dfw_dumps() {
    # Collect the dfw dumps found under SAI_DUMP_STORE_PATH (read from the
    # host copy of sai.profile for the current platform/hwsku).
    # A dump whose name is already present under $BASE/log/ in $TARFILE is
    # represented by a relative symlink in sai_sdk_dump/; any other dump is
    # stored under sai_sdk_dump/ through save_file.
    trap 'handle_error $? $LINENO' ERR
    local platform=$(python3 -c "from sonic_py_common import device_info; print(device_info.get_platform())")
    local hwsku=$(python3 -c "from sonic_py_common import device_info; print(device_info.get_hwsku())")
    local sai_profile="/usr/share/sonic/device/${platform}/${hwsku}/sai.profile"
    local sdk_dump_path=$(grep "SAI_DUMP_STORE_PATH" "$sai_profile" | cut -d = -f2)
    local tmp_dump_dir="/tmp/dfw-sdk-dumps"
    local copied_from_container=false
    local archive_listing=""
    local file

    if [[ -z "$sdk_dump_path" ]]; then
        # Nothing to look for; continuing would call copy_from_docker with a
        # missing source argument.
        echo "ERROR: SAI_DUMP_STORE_PATH not found in $sai_profile, dfw dumps cannot be collected"
        return 0
    fi

    if [[ ! -d "$sdk_dump_path" ]]; then
        # This would mean the SAI_DUMP_STORE_PATH is not mounted on the host and is only accessible through the container.
        # This is a bad design and not recommended, but nothing restricts against it, hence the special handling.
        if [[ "$( docker container inspect -f '{{.State.Running}}' syncd )" == "true" ]]; then
            $RM $V -rf "$tmp_dump_dir"
            $MKDIR $V -p "$tmp_dump_dir"
            copy_from_docker syncd "$sdk_dump_path" "$tmp_dump_dir"
            copied_from_container=true
        else
            # Do not fall through: the temporary directory may be missing or
            # may still hold files from an earlier run.
            echo "ERROR: dfw dumps cannot be collected"
            return 0
        fi
        sdk_dump_path="$tmp_dump_dir"
    fi

    # List the archive once. The loop below only appends entries under
    # sai_sdk_dump/, so the set of log/ entries does not change meanwhile.
    # A failed listing is treated as "nothing collected yet", matching the
    # previous per-file check which simply did not match in that case.
    archive_listing=$($TAR -tf "$TARFILE") || archive_listing=""

    # NOTE(review): find_files is defined elsewhere; its output is consumed via
    # word splitting exactly as before, in case it emits a space-separated list.
    for file in $(find_files "$sdk_dump_path"); do
        # Fixed-string, quiet match: file names are not regular expressions and
        # the matching archive entries should not leak to stdout.
        if grep -qF -- "$BASE/log/${file##*/}" <<< "$archive_listing"; then
            # If this path sits under "/var/log/" dir, the files
            # would've already been collected and thus just add a sym link
            if [[ "$file" != *.gz ]]; then
                # files saved under log/ are zipped with gz
                file="$file.gz"
            fi
            ${CMD_PREFIX}save_symlink "$file" sai_sdk_dump log
        elif [[ "$file" != *.gz ]]; then
            # not compressed yet: ask save_file to gzip it
            ${CMD_PREFIX}save_file "$file" sai_sdk_dump true
        else
            ${CMD_PREFIX}save_file "$file" sai_sdk_dump false
        fi
    done

    if $copied_from_container; then
        # Remove the temporary copy pulled out of the syncd container.
        $RM $V -rf "$tmp_dump_dir"
    fi
}
1138+
10591139
###############################################################################
10601140
# Collect Broadcom specific information
10611141
# Globals:
@@ -1626,6 +1706,10 @@ main() {
16261706
save_crash_files
16271707
save_warmboot_files
16281708

1709+
if [[ "$asic" = "mellanox" ]]; then
1710+
collect_mellanox_dfw_dumps
1711+
fi
1712+
16291713
finalize
16301714
}
16311715

0 commit comments

Comments
 (0)