-
Notifications
You must be signed in to change notification settings - Fork 6.6k
[core][stab/01] script to find and list tests by flakiness level #52955
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
Signed-off-by: can <[email protected]>
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import click | ||
from typing import List, Set, Dict | ||
from enum import Enum | ||
|
||
from ci.ray_ci.utils import logger, ci_init | ||
from ray_release.configs.global_config import get_global_config | ||
from ray_release.test import Test | ||
from ray_release.result import ResultStatus | ||
from ray_release.test_automation.ci_state_machine import CITestStateMachine | ||
|
||
# The s3 prefix for the tests that run on Linux. It comes from the bazel prefix rule | ||
# linux:// with the character "/" replaced by "_" for s3 compatibility | ||
LINUX_TEST_PREFIX = "linux:__" | ||
|
||
class TestStatistics: | ||
@classmethod | ||
def analyze(cls, test: Test, test_history_length: int) -> "TestStatistics": | ||
""" | ||
Construct a TestStatistic object with statistic computed | ||
""" | ||
stat = cls(test) | ||
stat._result_histories = test.get_test_results( | ||
limit=test_history_length, | ||
use_async=True, | ||
) | ||
stat._compute_flaky_percentage() | ||
|
||
return stat | ||
|
||
def get_flaky_percentage(self) -> float: | ||
""" | ||
Get the flaky percentage of the test | ||
""" | ||
return self.flaky_percentage | ||
|
||
def __str__(self) -> str: | ||
""" | ||
String representation of the TestStatistics object | ||
""" | ||
return f"Test: {self.test.get_name()}, Flaky Percentage: {self.flaky_percentage}" | ||
|
||
def __init__(self, test: Test) -> None: | ||
self.test = test | ||
self.flaky_percentage = 0 | ||
self._result_histories = [] | ||
|
||
def _compute_flaky_percentage(self, test_history_length: int) -> float: | ||
return CITestStateMachine.get_flaky_percentage(self._result_histories) | ||
|
||
class OrderBy(str, Enum): | ||
""" | ||
Enum for the order by options | ||
""" | ||
FLAKY_PERCENTAGE = "flaky_percentage" | ||
|
||
|
||
@click.command() | ||
@click.argument("team", required=True, type=str) | ||
@click.option("--test-history-length", default=100, type=int) | ||
@click.option("--test-prefix", default=LINUX_TEST_PREFIX, type=str) | ||
@click.option("--order-by", default=OrderBy.FLAKY_PERCENTAGE, type=click.Choice([OrderBy.FLAKY_PERCENTAGE])) | ||
def main( | ||
team: str, | ||
test_history_length: int, | ||
test_prefix: str, | ||
order_by: str, | ||
) -> None: | ||
""" | ||
This script finds tests that are low quality based on certain criteria (flakiness, | ||
slowness, etc.) | ||
""" | ||
ci_init() | ||
tests = [ | ||
test for test in Test.gen_from_s3(test_prefix) if test.get_oncall() == team | ||
] | ||
logger.info(f"Analyzing {len(tests)} tests for team {team}") | ||
test_stats = ([ | ||
TestStatistics.analyze(test, test_history_length) for test in tests | ||
]).sort( | ||
key=lambda x: x.get_flaky_percentage(), reverse=True | ||
) | ||
logger.info(f"Tests sorted by {order_by}:") | ||
for test_stat in test_stats: | ||
logger.info(test_stat) | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -112,14 +112,25 @@ def _passing_to_flaky(self) -> bool: | |
return self.is_flaky_result_history(self.test_results) | ||
|
||
@staticmethod | ||
def is_flaky_result_history(results: List[TestResult]): | ||
def is_flaky_result_history(results: List[TestResult]) -> bool: | ||
return ( | ||
CITestStateMachine.get_flaky_percentage(results) | ||
>= FLAKY_PERCENTAGE_THRESHOLD | ||
) | ||
|
||
def get_flaky_percentage( | ||
results: List[TestResult], | ||
) -> float: | ||
""" | ||
Get the percentage of flaky tests in the test history | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function does the actual flaky test calculation. Can you add the method for calculation and why it's the method we choose to the docstring? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. will do |
||
""" | ||
if not results: | ||
return 0.0 | ||
transition = 0 | ||
for i in range(0, len(results) - 1): | ||
if results[i].is_failing() and results[i + 1].is_passing(): | ||
transition += 1 | ||
if transition >= FLAKY_PERCENTAGE_THRESHOLD * len(results) / 100: | ||
return True | ||
return False | ||
return transition * 100 / len(results) | ||
|
||
def _consistently_failing_to_flaky(self) -> bool: | ||
return len(self.test_results) >= CONTINUOUS_FAILURE_TO_FLAKY and all( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The way this is written, I don't think we can ever see a flaky percentage of 100%.
For example, with a test history of:
Pass -> Fail -> Pass -> Fail -> Pass -> Fail
Transitions=2/6 = 33%
It also means that this history is more flaky than the previous:
Fail -> Pass -> Fail -> Pass -> Fail -> Pass
Transitions=3/6 = 50%
Is there a reason why we don't count Pass -> Fail as flaky transitions?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes, you're correct, it's just one way of computing a score that reflect the flip-floping behavior, and can be used as a number for comparision, no special reason why we don't count Pass -> Fail, if we do we just need to double the
FLAKY_PERCENTAGE_THRESHOLD
to make it not too punishing