Skip to content

Commit fd021be

Browse files
[Post Mortem] Log number of errors in detail log (mlcommons#2164)
* Log number of errors in detail log
* [Automated Commit] Format Codebase

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 5e90395 commit fd021be

File tree

4 files changed

+15
-3
lines changed

4 files changed

+15
-3
lines changed

loadgen/logging.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,7 @@ class AsyncLog {
314314
QuerySampleLatency GetMaxLatencySoFar();
315315
void SetUseTokens(bool use_tokens);
316316
void SetNeedsFirstToken(bool needs_first_token);
317+
size_t GetErrorCount() { return log_error_count_; };
317318

318319
private:
319320
void WriteAccuracyHeaderLocked();

loadgen/results.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -848,8 +848,9 @@ void PerformanceSummary::LogDetail(AsyncDetail& detail) {
848848
break;
849849
}
850850
}
851-
#endif
852851
}
852+
MLPERF_LOG(detail, "num_errors", detail.async_log().GetErrorCount());
853+
#endif
853854
}
854855
} // namespace loadgen
855856
} // namespace mlperf

tools/submission/log_parser.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,8 @@ def num_messages(self):
128128

129129
def num_errors(self):
130130
"""Get number of errors in the log."""
131+
if "num_errors" in self.keys:
132+
return self.__getitem__("num_errors")
131133
count = 0
132134
for message in self.messages:
133135
if message["metadata"]["is_error"]:

tools/submission/submission_checker.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1302,6 +1302,9 @@ def check_accuracy_dir(config, model, path, verbose):
13021302
fname = os.path.join(path, "mlperf_log_detail.txt")
13031303
if not find_error_in_detail_log(config, fname):
13041304
is_valid = False
1305+
log.error(
1306+
"%s has loadgen errors, number of errors: %s", path, mlperf_log.num_errors()
1307+
)
13051308

13061309
return is_valid, result_acc
13071310

@@ -1434,6 +1437,9 @@ def check_performance_dir(
14341437
fname = os.path.join(path, "mlperf_log_detail.txt")
14351438
if not find_error_in_detail_log(config, fname):
14361439
is_valid = False
1440+
log.error(
1441+
"%s has loadgen errors, number of errors: %s", path, mlperf_log.num_errors()
1442+
)
14371443

14381444
required_performance_sample_count = config.get_performance_sample_count(
14391445
model)
@@ -1702,7 +1708,7 @@ def get_power_metric(config, scenario_fixed, log_path, is_valid, res):
17021708
samples_per_query = 8
17031709

17041710
if (scenario_fixed in ["MultiStream"]
1705-
) and scenario in ["SingleStream"]:
1711+
) and scenario in ["SingleStream"]:
17061712
power_metric = (
17071713
avg_power * power_duration * samples_per_query * 1000 / num_queries
17081714
)
@@ -1965,7 +1971,9 @@ def log_result(
19651971
if config.version == "v4.0":
19661972
unit = unit_dict[scenario_fixed]
19671973
else:
1968-
unit = special_unit_dict.get(mlperf_model, unit_dict).get(scenario_fixed, unit_dict[scenario_fixed])
1974+
unit = special_unit_dict.get(
1975+
mlperf_model, unit_dict).get(
1976+
scenario_fixed, unit_dict[scenario_fixed])
19691977
power_unit = power_unit_dict[scenario_fixed]
19701978

19711979
if (power_metric <= 0) or (

0 commit comments

Comments
 (0)