Skip to content

[core][stab/01] script to find and list tests by flakiness level #52955

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
[core][tests/1] cli for finding flaky tests
Signed-off-by: can <[email protected]>
  • Loading branch information
can-anyscale committed May 13, 2025
commit 9636c6064649aeb712f7bbe7b69c9b271158cdc7
11 changes: 11 additions & 0 deletions ci/ray_ci/automation/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,17 @@ py_binary(
],
)

# CLI binary that finds and ranks a team's tests by flakiness level.
# Implemented in find_tests.py; depends on the shared CI library and the
# release tooling for test history / state-machine logic.
py_binary(
name = "find_tests",
srcs = ["find_tests.py"],
exec_compatible_with = ["//:hermetic_python"],
deps = [
ci_require("click"),
"//ci/ray_ci:ray_ci_lib",
"//release:ray_release",
],
)

py_library(
name = "update_version_lib",
srcs = ["update_version_lib.py"],
Expand Down
88 changes: 88 additions & 0 deletions ci/ray_ci/automation/find_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import click
from typing import List, Set, Dict
from enum import Enum

from ci.ray_ci.utils import logger, ci_init
from ray_release.configs.global_config import get_global_config
from ray_release.test import Test
from ray_release.result import ResultStatus
from ray_release.test_automation.ci_state_machine import CITestStateMachine

# The s3 prefix for the tests that run on Linux. It comes from the bazel prefix rule
# linux:// with the character "/" replaced by "_" for s3 compatibility
LINUX_TEST_PREFIX = "linux:__"

class TestStatistics:
    """
    Per-test quality statistics computed from a test's recent result history.

    Use the ``analyze`` classmethod to construct an instance with its
    statistics populated; the plain constructor leaves them at zero.
    """

    def __init__(self, test: "Test") -> None:
        self.test = test
        # Percentage of fail -> pass transitions in the result history;
        # stays 0 until `analyze` populates it.
        self.flaky_percentage = 0
        self._result_histories = []

    @classmethod
    def analyze(cls, test: "Test", test_history_length: int) -> "TestStatistics":
        """
        Construct a TestStatistics object with its statistics computed.

        Args:
            test: the test whose result history should be analyzed.
            test_history_length: how many recent results to fetch.
        """
        stat = cls(test)
        stat._result_histories = test.get_test_results(
            limit=test_history_length,
            use_async=True,
        )
        # Bug fix: the helper takes no extra argument, and its return value
        # must be stored -- previously it was called with the wrong arity
        # (TypeError) and its result was discarded, leaving the percentage
        # permanently at 0.
        stat.flaky_percentage = stat._compute_flaky_percentage()

        return stat

    def get_flaky_percentage(self) -> float:
        """Return the flaky percentage of the test."""
        return self.flaky_percentage

    def _compute_flaky_percentage(self) -> float:
        # Delegate the flakiness computation to the CI state machine so the
        # CLI and the state machine always agree on the definition.
        return CITestStateMachine.get_flaky_percentage(self._result_histories)

    def __str__(self) -> str:
        """Human-readable one-line summary for logging."""
        return f"Test: {self.test.get_name()}, Flaky Percentage: {self.flaky_percentage}"

class OrderBy(str, Enum):
    """Valid sort keys accepted by the ``--order-by`` CLI option."""

    # Rank tests by their flip-flop (flaky) percentage.
    FLAKY_PERCENTAGE = "flaky_percentage"


@click.command()
@click.argument("team", required=True, type=str)
@click.option("--test-history-length", default=100, type=int)
@click.option("--test-prefix", default=LINUX_TEST_PREFIX, type=str)
@click.option(
    "--order-by",
    default=OrderBy.FLAKY_PERCENTAGE,
    type=click.Choice([OrderBy.FLAKY_PERCENTAGE]),
)
def main(
    team: str,
    test_history_length: int,
    test_prefix: str,
    order_by: str,
) -> None:
    """
    Find tests that are low quality based on certain criteria (flakiness,
    slowness, etc.) and log them sorted by that criterion, worst first.

    Args:
        team: only tests whose oncall matches this team are considered.
        test_history_length: number of recent results analyzed per test.
        test_prefix: s3 prefix selecting which tests to scan.
        order_by: statistic used to rank the tests (currently flakiness only).
    """
    ci_init()
    tests = [
        test for test in Test.gen_from_s3(test_prefix) if test.get_oncall() == team
    ]
    logger.info(f"Analyzing {len(tests)} tests for team {team}")
    # Bug fix: list.sort() sorts in place and returns None, so the previous
    # `[...].sort(...)` assigned None to test_stats and the loop below
    # crashed. sorted() returns the new, ordered list.
    test_stats = sorted(
        (TestStatistics.analyze(test, test_history_length) for test in tests),
        key=lambda stat: stat.get_flaky_percentage(),
        reverse=True,
    )
    logger.info(f"Tests sorted by {order_by}:")
    for test_stat in test_stats:
        logger.info(test_stat)


if __name__ == "__main__":
    main()
19 changes: 15 additions & 4 deletions release/ray_release/test_automation/ci_state_machine.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,14 +112,25 @@ def _passing_to_flaky(self) -> bool:
return self.is_flaky_result_history(self.test_results)

@staticmethod
def is_flaky_result_history(results: "List[TestResult]") -> bool:
    """
    Return True when the flakiness score of a test's result history meets
    or exceeds FLAKY_PERCENTAGE_THRESHOLD.

    Note: the diff artifact that showed both the old (unannotated) and new
    (annotated) signature lines has been collapsed to the new version.
    """
    return (
        CITestStateMachine.get_flaky_percentage(results)
        >= FLAKY_PERCENTAGE_THRESHOLD
    )

# NOTE(review): in the enclosing class this should be decorated
# @staticmethod like is_flaky_result_history (it is called as
# CITestStateMachine.get_flaky_percentage(results)) -- confirm upstream.
def get_flaky_percentage(
    results: "List[TestResult]",
) -> float:
    """
    Return the flakiness score of a test history as a percentage.

    The score is the number of fail -> pass transitions between consecutive
    results, divided by the history length, times 100. Pass -> fail
    transitions are deliberately NOT counted, so a perfectly alternating
    history scores at most ~50%: this is a relative flip-flop measure used
    for comparison against FLAKY_PERCENTAGE_THRESHOLD, not an absolute
    probability of flakiness.

    Args:
        results: the test's result history, oldest-to-newest or
            newest-to-oldest (the transition count is the same either way
            up to direction of the counted edge).

    Returns:
        0.0 for an empty history, otherwise transitions * 100 / len(results).
    """
    if not results:
        return 0.0
    transition = 0
    for i in range(len(results) - 1):
        # Count only fail -> pass edges between adjacent results.
        if results[i].is_failing() and results[i + 1].is_passing():
            transition += 1
    return transition * 100 / len(results)

def _consistently_failing_to_flaky(self) -> bool:
return len(self.test_results) >= CONTINUOUS_FAILURE_TO_FLAKY and all(
Expand Down