Skip to content

[core][stab/01] script to find and list tests by flakiness level #52955

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
[core][tests/1] cli for finding flaky tests
Signed-off-by: can <[email protected]>
  • Loading branch information
can-anyscale committed May 13, 2025
commit 9636c6064649aeb712f7bbe7b69c9b271158cdc7
11 changes: 11 additions & 0 deletions ci/ray_ci/automation/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,17 @@ py_binary(
],
)

# CLI binary that finds and ranks a team's tests by flakiness level.
# Implemented in find_tests.py; depends on the shared CI library and the
# release tooling for test history / state-machine logic.
py_binary(
name = "find_tests",
srcs = ["find_tests.py"],
exec_compatible_with = ["//:hermetic_python"],
deps = [
ci_require("click"),
"//ci/ray_ci:ray_ci_lib",
"//release:ray_release",
],
)

py_library(
name = "update_version_lib",
srcs = ["update_version_lib.py"],
Expand Down
88 changes: 88 additions & 0 deletions ci/ray_ci/automation/find_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import click
from typing import List, Set, Dict
from enum import Enum

from ci.ray_ci.utils import logger, ci_init
from ray_release.configs.global_config import get_global_config
from ray_release.test import Test
from ray_release.result import ResultStatus
from ray_release.test_automation.ci_state_machine import CITestStateMachine

# The s3 prefix for the tests that run on Linux. It comes from the bazel prefix rule
# linux:// with the character "/" replaced by "_" for s3 compatibility
LINUX_TEST_PREFIX = "linux:__"

class TestStatistics:
    """
    Per-test quality statistics computed from a test's recent result history.

    Use the ``analyze`` classmethod to construct an instance with its
    statistics populated; the plain constructor leaves them at zero.
    """

    def __init__(self, test: "Test") -> None:
        self.test = test
        # Percentage of fail -> pass transitions in the result history;
        # stays 0 until `analyze` populates it.
        self.flaky_percentage = 0
        self._result_histories = []

    @classmethod
    def analyze(cls, test: "Test", test_history_length: int) -> "TestStatistics":
        """
        Construct a TestStatistics object with its statistics computed.

        Args:
            test: the test whose result history should be analyzed.
            test_history_length: how many recent results to fetch.
        """
        stat = cls(test)
        stat._result_histories = test.get_test_results(
            limit=test_history_length,
            use_async=True,
        )
        # Bug fix: the helper takes no extra argument, and its return value
        # must be stored -- previously it was called with the wrong arity
        # (TypeError) and its result was discarded, leaving the percentage
        # permanently at 0.
        stat.flaky_percentage = stat._compute_flaky_percentage()

        return stat

    def get_flaky_percentage(self) -> float:
        """Return the flaky percentage of the test."""
        return self.flaky_percentage

    def _compute_flaky_percentage(self) -> float:
        # Delegate the flakiness computation to the CI state machine so the
        # CLI and the state machine always agree on the definition.
        return CITestStateMachine.get_flaky_percentage(self._result_histories)

    def __str__(self) -> str:
        """Human-readable one-line summary for logging."""
        return f"Test: {self.test.get_name()}, Flaky Percentage: {self.flaky_percentage}"

class OrderBy(str, Enum):
    """Valid sort keys accepted by the ``--order-by`` CLI option."""

    # Rank tests by their flip-flop (flaky) percentage.
    FLAKY_PERCENTAGE = "flaky_percentage"


@click.command()
@click.argument("team", required=True, type=str)
@click.option("--test-history-length", default=100, type=int)
@click.option("--test-prefix", default=LINUX_TEST_PREFIX, type=str)
@click.option(
    "--order-by",
    default=OrderBy.FLAKY_PERCENTAGE,
    type=click.Choice([OrderBy.FLAKY_PERCENTAGE]),
)
def main(
    team: str,
    test_history_length: int,
    test_prefix: str,
    order_by: str,
) -> None:
    """
    Find tests that are low quality based on certain criteria (flakiness,
    slowness, etc.) and log them sorted by that criterion, worst first.

    Args:
        team: only tests whose oncall matches this team are considered.
        test_history_length: number of recent results analyzed per test.
        test_prefix: s3 prefix selecting which tests to scan.
        order_by: statistic used to rank the tests (currently flakiness only).
    """
    ci_init()
    tests = [
        test for test in Test.gen_from_s3(test_prefix) if test.get_oncall() == team
    ]
    logger.info(f"Analyzing {len(tests)} tests for team {team}")
    # Bug fix: list.sort() sorts in place and returns None, so the previous
    # `[...].sort(...)` assigned None to test_stats and the loop below
    # crashed. sorted() returns the new, ordered list.
    test_stats = sorted(
        (TestStatistics.analyze(test, test_history_length) for test in tests),
        key=lambda stat: stat.get_flaky_percentage(),
        reverse=True,
    )
    logger.info(f"Tests sorted by {order_by}:")
    for test_stat in test_stats:
        logger.info(test_stat)


if __name__ == "__main__":
    main()
19 changes: 15 additions & 4 deletions release/ray_release/test_automation/ci_state_machine.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,14 +112,25 @@ def _passing_to_flaky(self) -> bool:
return self.is_flaky_result_history(self.test_results)

@staticmethod
def is_flaky_result_history(results: "List[TestResult]") -> bool:
    """
    Return True when the flakiness score of a test's result history meets
    or exceeds FLAKY_PERCENTAGE_THRESHOLD.

    Note: the diff artifact that showed both the old (unannotated) and new
    (annotated) signature lines has been collapsed to the new version.
    """
    return (
        CITestStateMachine.get_flaky_percentage(results)
        >= FLAKY_PERCENTAGE_THRESHOLD
    )

# NOTE(review): in the enclosing class this should be decorated
# @staticmethod like is_flaky_result_history (it is called as
# CITestStateMachine.get_flaky_percentage(results)) -- confirm upstream.
def get_flaky_percentage(
    results: "List[TestResult]",
) -> float:
    """
    Return the flakiness score of a test history as a percentage.

    The score is the number of fail -> pass transitions between consecutive
    results, divided by the history length, times 100. Pass -> fail
    transitions are deliberately NOT counted, so a perfectly alternating
    history scores at most ~50%: this is a relative flip-flop measure used
    for comparison against FLAKY_PERCENTAGE_THRESHOLD, not an absolute
    probability of flakiness.

    Args:
        results: the test's result history, oldest-to-newest or
            newest-to-oldest (the transition count is the same either way
            up to direction of the counted edge).

    Returns:
        0.0 for an empty history, otherwise transitions * 100 / len(results).
    """
    if not results:
        return 0.0
    transition = 0
    for i in range(len(results) - 1):
        # Count only fail -> pass edges between adjacent results.
        if results[i].is_failing() and results[i + 1].is_passing():
            transition += 1
    return transition * 100 / len(results)

def _consistently_failing_to_flaky(self) -> bool:
return len(self.test_results) >= CONTINUOUS_FAILURE_TO_FLAKY and all(
Expand Down