Skip to content

Even without --f2p_only, not the entire test suite is being run. #105

Open
@MarcCote

Description

@MarcCote

According to #95

  • In get_test_cmd, we remove logic that only runs the files with FAIL_TO_PASS tests for evaluation - instead, the entire test suite is run.

When looking at the code that builds the test command, I noticed that only test files related to the files modified in the patch are used (i.e., not the entire test suite).
https://github.com/SWE-bench/SWE-smith/blob/main/swesmith/profiles/base.py#L288-L321

For instance, this causes an issue for the following task
iterative__dvc.1d6ea681.func_pm_class_rm_base__1ygoyhp0

which will have this test_cmd

pytest --disable-warnings --color=no --tb=no --verbose tests/unit/test_context.py

which when the gold patch is applied will output

tests/unit/test_context.py::test_context PASSED                          [  5%]
tests/unit/test_context.py::test_context_dict_ignores_keys_except_str PASSED [ 10%]
tests/unit/test_context.py::test_context_list PASSED                     [ 15%]
tests/unit/test_context.py::test_context_setitem_getitem PASSED          [ 20%]
tests/unit/test_context.py::test_loop_context PASSED                     [ 25%]
tests/unit/test_context.py::test_repr PASSED                             [ 30%]
tests/unit/test_context.py::test_select PASSED                           [ 35%]
tests/unit/test_context.py::test_select_unwrap PASSED                    [ 40%]
tests/unit/test_context.py::test_merge_dict PASSED                       [ 45%]
tests/unit/test_context.py::test_merge_list PASSED                       [ 50%]
tests/unit/test_context.py::test_overwrite_with_setitem PASSED           [ 55%]
tests/unit/test_context.py::test_load_from PASSED                        [ 60%]
tests/unit/test_context.py::test_clone PASSED                            [ 65%]
tests/unit/test_context.py::test_track PASSED                            [ 70%]
tests/unit/test_context.py::test_track_from_multiple_files PASSED        [ 75%]
tests/unit/test_context.py::test_node_value PASSED                       [ 80%]
tests/unit/test_context.py::test_resolve_resolves_dict_keys PASSED       [ 85%]
tests/unit/test_context.py::test_resolve_resolves_boolean_value PASSED   [ 90%]
tests/unit/test_context.py::test_load_from_raises_if_file_not_exist PASSED [ 95%]
tests/unit/test_context.py::test_load_from_raises_if_file_is_directory PASSED [100%]

============================== 20 passed in 2.22s ==============================

while the expected FAIL_TO_PASS for this task according to the SWE-Smith dataset (https://huggingface.co/datasets/SWE-bench/SWE-smith/sql-console/t8ei8w2) has many more tests

[tests/func/parsing/test_errors.py::test_failed_to_interpolate,tests/func/parsing/test_errors.py::test_local_vars_params_file_not_exist,tests/func/parsing/test_errors.py::test_specified_key_does_not_exist,tests/func/parsing/test_errors.py::test_interpolate_non_string,tests/func/parsing/test_errors.py::test_interpolate_nested_iterable,tests/func/parsing/test_errors.py::test_partial_vars_doesnot_exist,tests/func/parsing/test_errors.py::test_foreach_data_key_does_not_exists[modelss],tests/func/parsing/test_errors.py::test_foreach_data_key_does_not_exists[modelss.123],tests/func/parsing/test_errors.py::test_foreach_data_expects_list_or_dict[${foo}],tests/func/parsing/test_errors.py::test_foreach_data_expects_list_or_dict[${dct.model1}],tests/func/parsing/test_errors.py::test_foreach_data_expects_list_or_dict[${lst.0}],tests/func/parsing/test_errors.py::test_foreach_data_expects_list_or_dict[foobar],tests/func/parsing/test_errors.py::test_foreach_do_syntax_errors,tests/func/parsing/test_errors.py::test_foreach_do_definition_item_does_not_exist[item.thresh-stages.build@1.cmd],
tests/func/parsing/test_errors.py::test_foreach_do_definition_item_does_not_exist[foo.bar-stages.build@0.cmd],tests/func/parsing/test_errors.py::test_item_key_in_generated_stage_vars[True-redefine0],tests/func/parsing/test_errors.py::test_item_key_in_generated_stage_vars[True-redefine1],tests/func/parsing/test_errors.py::test_item_key_in_generated_stage_vars[True-redefine2],tests/func/parsing/test_errors.py::test_item_key_in_generated_stage_vars[True-redefine3],tests/func/parsing/test_errors.py::test_item_key_in_generated_stage_vars[False-redefine0],tests/func/parsing/test_errors.py::test_item_key_in_generated_stage_vars[False-redefine1],tests/func/parsing/test_errors.py::test_item_key_in_generated_stage_vars[False-redefine2],tests/func/parsing/test_errors.py::test_item_key_in_generated_stage_vars[False-redefine3],tests/func/parsing/test_errors.py::test_foreach_wdir_key_does_not_exist,tests/func/parsing/test_foreach.py::test_with_simple_list_data,tests/func/parsing/test_foreach.py::test_with_dict_data,tests/func/parsing/test_foreach.py::test_with_dict_with_non_str_keys,tests/func/parsing/test_foreach.py::test_with_composite_list,tests/func/parsing/test_foreach.py::test_foreach_interpolated_simple_list,tests/func/parsing/test_foreach.py::test_foreach_interpolate_with_composite_data[foreach_data0-result0-${item.thresh}],tests/func/parsing/test_foreach.py::test_foreach_interpolate_with_composite_data[foreach_data0-result0-${item[thresh]}],tests/func/parsing/test_foreach.py::test_foreach_interpolate_with_composite_data[foreach_data1-result1-${item.thresh}],tests/func/parsing/test_foreach.py::test_foreach_interpolate_with_composite_data[foreach_data1-result1-${item[thresh]}],tests/func/parsing/test_foreach.py::test_params_file_with_dict_tracked,tests/func/parsing/test_foreach.py::test_params_file_tracked_for_composite_list,tests/func/parsing/test_foreach.py::test_foreach_data_from_nested_vars,tests/func/parsing/test_foreach.py::test_foreach_partial_interpolations,tests/func/parsing/test_foreach.py::test_mixed_vars_for_foreach_data,tests/func/parsing/test_foreach.py::
test_mixed_vars_for_foreach_data_2,tests/func/parsing/test_foreach.py::test_foreach_with_interpolated_wdir,tests/func/parsing/test_foreach.py::test_foreach_with_local_vars,tests/func/parsing/test_foreach.py::test_foreach_with_imported_vars[test_params.yaml],tests/func/parsing/test_foreach.py::test_foreach_with_imported_vars[test_params.yaml:train],tests/func/parsing/test_foreach.py::test_foreach_with_imported_vars[test_params.yaml:train,prepare],tests/func/parsing/test_foreach.py::test_foreach_with_interpolated_wdir_and_local_vars[params.yaml],tests/func/parsing/test_foreach.py::test_foreach_with_interpolated_wdir_and_local_vars[params.yaml:train,prepare],tests/func/parsing/test_foreach.py::test_foreach_do_syntax_is_checked_once,tests/func/parsing/test_foreach.py::test_foreach_data_is_only_resolved_once,tests/func/parsing/test_interpolated_entry.py::test_simple,tests/func/parsing/test_interpolated_entry.py::test_vars_import,tests/func/parsing/test_interpolated_entry.py::test_vars_and_params_import,tests/func/parsing/test_interpolated_entry.py::test_stage_with_wdir,tests/func/parsing/test_interpolated_entry.py::test_with_templated_wdir,tests/func/parsing/test_interpolated_entry.py::test_resolve_local_tries_to_load_globally_used_files,tests/func/parsing/test_interpolated_entry.py::test_resolve_local_tries_to_load_globally_used_params_yaml,tests/func/parsing/test_interpolated_entry.py::test_vars_relpath_overwrite,tests/func/parsing/test_interpolated_entry.py::test_vars_load_partial[vars_0-True],tests/func/parsing/test_interpolated_entry.py::test_vars_load_partial[vars_0-False],tests/func/parsing/test_interpolated_entry.py::test_vars_load_partial[vars_1-True],tests/func/parsing/test_interpolated_entry.py::test_vars_load_partial[vars_1-False],tests/func/parsing/test_interpolated_entry.py::test_vars_load_partial[vars_2-True],tests/func/parsing/test_interpolated_entry.py::test_vars_load_partial[vars_2-False],tests/func/parsing/test_interpolated_entry.py::test_vars_load_par
tial[vars_3-True],tests/func/parsing/test_interpolated_entry.py::test_vars_load_partial[vars_3-False],tests/func/parsing/test_interpolated_entry.py::test_cmd_dict[None-None],tests/func/parsing/test_interpolated_entry.py::test_cmd_dict[store_true-nargs],tests/func/parsing/test_interpolated_entry.py::test_cmd_dict[boolean_optional-append],tests/func/parsing/test_matrix.py::test_matrix_interpolated[matrix0],tests/func/parsing/test_matrix.py::test_matrix_interpolated[matrix1],tests/func/parsing/test_matrix.py::test_matrix_key_present[matrix0],tests/func/parsing/test_matrix.py::test_matrix_key_present[matrix1],tests/func/parsing/test_resolver.py::test_resolver,tests/func/parsing/test_resolver.py::test_default_params_file_not_exist,tests/func/parsing/test_resolver.py::test_no_params_yaml_and_vars,tests/func/parsing/test_resolver.py::test_local_vars,tests/func/parsing/test_resolver.py::test_default_params_file[vars_0],tests/func/parsing/test_resolver.py::test_default_params_file[vars_1],tests/func/parsing/test_resolver.py::test_default_params_file[vars_2],tests/func/parsing/test_resolver.py::test_load_vars_from_file,tests/func/parsing/test_resolver.py::test_load_vars_with_relpath,tests/func/parsing/test_resolver.py::test_global_overwrite_error_on_imports,tests/func/parsing/test_resolver.py::test_global_overwrite_vars,tests/func/parsing/test_resolver.py::test_local_declared_vars_overwrite,tests/func/parsing/test_resolver.py::test_vars_already_loaded_message[vars_0-True],tests/func/parsing/test_resolver.py::test_vars_already_loaded_message[vars_0-False],tests/func/parsing/test_resolver.py::test_vars_already_loaded_message[vars_1-True],tests/func/parsing/test_resolver.py::test_vars_already_loaded_message[vars_1-False],tests/func/parsing/test_resolver.py::test_vars_already_loaded_message[vars_2-True],tests/func/parsing/test_resolver.py::test_vars_already_loaded_message[vars_2-False],tests/func/parsing/test_resolver.py::test_local_overwrite_error[vars_0-build.vars[0]],tests/fun
c/parsing/test_resolver.py::test_local_overwrite_error[params.json-params.json],tests/func/parsing/test_top_level.py::test_params,tests/func/parsing/test_top_level.py::test_metrics,tests/func/parsing/test_top_level.py::test_plots,tests/func/parsing/test_top_level.py::test_artifacts,tests/func/parsing/test_top_level.py::test_datasets]

Since the grading logic now assumes failure when a test is not present, the sanity check fails.

Steps to reproduce

python swesmith/harness/eval.py --run_id sanity --predictions_path gold --instance_ids iterative__dvc.1d6ea681.func_pm_class_rm_base__1ygoyhp0
python swesmith/harness/eval.py --run_id sanity --redo_failing --predictions_path gold --instance_ids iterative__dvc.1d6ea681.func_pm_class_rm_base__1ygoyhp0

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions