-
Notifications
You must be signed in to change notification settings - Fork 6.8k
[data] reset cpu + gpu metrics on executor shutdown and updating task submission/block generation metrics #57246
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
4362718
c0d459b
780dee1
1216e54
0d4339f
c71130e
b0672a8
ada3c90
8f12426
a95b892
de78ced
0aa2049
9a25054
5838314
24f87eb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -451,7 +451,7 @@ | |
unit="seconds", | ||
targets=[ | ||
Target( | ||
expr='sum(ray_data_block_generation_time{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', | ||
expr='increase(ray_data_block_generation_time{{{global_filters}, operator=~"$Operator"}}[5m]) / increase(ray_data_num_task_outputs_generated{{{global_filters}, operator=~"$Operator"}}[5m])', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
legend="Block Generation Time: {{dataset}}, {{operator}}", | ||
) | ||
], | ||
|
@@ -466,7 +466,7 @@ | |
unit="seconds", | ||
targets=[ | ||
Target( | ||
expr='sum(ray_data_task_submission_backpressure_time{{{global_filters}, operator=~"$Operator"}}) by (dataset, operator)', | ||
expr='increase(ray_data_task_submission_backpressure_time{{{global_filters}, operator=~"$Operator"}}[5m]) / increase(ray_data_num_tasks_submitted{{{global_filters}, operator=~"$Operator"}}[5m])', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. W/O PR: shows total sum of submitted tasks (could be meaningful) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: Prometheus Query Division by Zero IssueThe Prometheus query for the "Task Submission Backpressure Time" panel can result in division by zero. If no tasks are submitted within a 5-minute window, the denominator |
||
legend="Backpressure Time: {{dataset}}, {{operator}}", | ||
) | ||
], | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -258,8 +258,9 @@ def shutdown(self, force: bool, exception: Optional[Exception] = None): | |
stats_summary_string = self._final_stats.to_summary().to_string( | ||
include_parent=False | ||
) | ||
# Reset the scheduling loop duration gauge. | ||
self._sched_loop_duration_s.set(0, tags={"dataset": self._dataset_id}) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this meant to be nuked? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, the update_metrics calls it |
||
# Reset the scheduling loop duration gauge + resource manager budgets/usages. | ||
self._resource_manager.update_usages() | ||
self.update_metrics(0) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
if self._data_context.enable_auto_log_stats: | ||
logger.info(stats_summary_string) | ||
# Close the progress bars from top to bottom to avoid them jumping | ||
|
@@ -364,7 +365,12 @@ def update_metrics(self, sched_loop_duration: int): | |
|
||
def _update_budget_metrics(self, op: PhysicalOperator, tags: Dict[str, str]): | ||
budget = self._resource_manager.get_budget(op) | ||
if budget is not None: | ||
if budget is None: | ||
cpu_budget = 0 | ||
gpu_budget = 0 | ||
memory_budget = 0 | ||
object_store_memory_budget = 0 | ||
else: | ||
# Convert inf to -1 to represent unlimited budget in metrics | ||
cpu_budget = -1 if math.isinf(budget.cpu) else budget.cpu | ||
gpu_budget = -1 if math.isinf(budget.gpu) else budget.gpu | ||
|
@@ -374,10 +380,11 @@ def _update_budget_metrics(self, op: PhysicalOperator, tags: Dict[str, str]): | |
if math.isinf(budget.object_store_memory) | ||
else budget.object_store_memory | ||
) | ||
self._cpu_budget_gauge.set(cpu_budget, tags=tags) | ||
self._gpu_budget_gauge.set(gpu_budget, tags=tags) | ||
self._memory_budget_gauge.set(memory_budget, tags=tags) | ||
self._osm_budget_gauge.set(object_store_memory_budget, tags=tags) | ||
|
||
self._cpu_budget_gauge.set(cpu_budget, tags=tags) | ||
self._gpu_budget_gauge.set(gpu_budget, tags=tags) | ||
self._memory_budget_gauge.set(memory_budget, tags=tags) | ||
self._osm_budget_gauge.set(object_store_memory_budget, tags=tags) | ||
|
||
def _update_max_bytes_to_read_metric( | ||
self, op: PhysicalOperator, tags: Dict[str, str] | ||
|
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
W/O PR: shows total sum of block generation time (meaningless)
W/ PR: shows average block generation time over 5min period