Description
According to #95
- In `get_test_cmd`, we remove logic that only runs the files with `FAIL_TO_PASS` tests for evaluation - instead, the entire test suite is run.
When looking at the code that builds the test command, I noticed that only the test files related to the files modified in the patch are used (i.e. not the entire test suite).
https://github.com/SWE-bench/SWE-smith/blob/main/swesmith/profiles/base.py#L288-L321
For instance, this causes an issue for the following task
iterative__dvc.1d6ea681.func_pm_class_rm_base__1ygoyhp0
which will have this test_cmd
pytest --disable-warnings --color=no --tb=no --verbose tests/unit/test_context.py
which when the gold patch is applied will output
tests/unit/test_context.py::test_context PASSED [ 5%]
tests/unit/test_context.py::test_context_dict_ignores_keys_except_str PASSED [ 10%]
tests/unit/test_context.py::test_context_list PASSED [ 15%]
tests/unit/test_context.py::test_context_setitem_getitem PASSED [ 20%]
tests/unit/test_context.py::test_loop_context PASSED [ 25%]
tests/unit/test_context.py::test_repr PASSED [ 30%]
tests/unit/test_context.py::test_select PASSED [ 35%]
tests/unit/test_context.py::test_select_unwrap PASSED [ 40%]
tests/unit/test_context.py::test_merge_dict PASSED [ 45%]
tests/unit/test_context.py::test_merge_list PASSED [ 50%]
tests/unit/test_context.py::test_overwrite_with_setitem PASSED [ 55%]
tests/unit/test_context.py::test_load_from PASSED [ 60%]
tests/unit/test_context.py::test_clone PASSED [ 65%]
tests/unit/test_context.py::test_track PASSED [ 70%]
tests/unit/test_context.py::test_track_from_multiple_files PASSED [ 75%]
tests/unit/test_context.py::test_node_value PASSED [ 80%]
tests/unit/test_context.py::test_resolve_resolves_dict_keys PASSED [ 85%]
tests/unit/test_context.py::test_resolve_resolves_boolean_value PASSED [ 90%]
tests/unit/test_context.py::test_load_from_raises_if_file_not_exist PASSED [ 95%]
tests/unit/test_context.py::test_load_from_raises_if_file_is_directory PASSED [100%]
============================== 20 passed in 2.22s ==============================
while the expected FAIL_TO_PASS for this task according to the SWE-Smith dataset (https://huggingface.co/datasets/SWE-bench/SWE-smith/sql-console/t8ei8w2) has many more tests:
[tests/func/parsing/test_errors.py::test_failed_to_interpolate,tests/func/parsing/test_errors.py::test_local_vars_params_file_not_exist,tests/func/parsing/test_errors.py::test_specified_key_does_not_exist,tests/func/parsing/test_errors.py::test_interpolate_non_string,tests/func/parsing/test_errors.py::test_interpolate_nested_iterable,tests/func/parsing/test_errors.py::test_partial_vars_doesnot_exist,tests/func/parsing/test_errors.py::test_foreach_data_key_does_not_exists[modelss],tests/func/parsing/test_errors.py::test_foreach_data_key_does_not_exists[modelss.123],tests/func/parsing/test_errors.py::test_foreach_data_expects_list_or_dict[${foo}],tests/func/parsing/test_errors.py::test_foreach_data_expects_list_or_dict[${dct.model1}],tests/func/parsing/test_errors.py::test_foreach_data_expects_list_or_dict[${lst.0}],tests/func/parsing/test_errors.py::test_foreach_data_expects_list_or_dict[foobar],tests/func/parsing/test_errors.py::test_foreach_do_syntax_errors,tests/func/parsing/test_errors.py::test_foreach_do_definition_item_does_not_exist[item.thresh-stages.build@1.cmd],tests/func/parsing/test_errors.py::test_foreach_do_definition_item_does_not_exist[foo.bar-stages.build@0.cmd],
tests/func/parsing/test_errors.py::test_item_key_in_generated_stage_vars[True-redefine0],tests/func/parsing/test_errors.py::test_item_key_in_generated_stage_vars[True-redefine1],tests/func/parsing/test_errors.py::test_item_key_in_generated_stage_vars[True-redefine2],tests/func/parsing/test_errors.py::test_item_key_in_generated_stage_vars[True-redefine3],tests/func/parsing/test_errors.py::test_item_key_in_generated_stage_vars[False-redefine0],tests/func/parsing/test_errors.py::test_item_key_in_generated_stage_vars[False-redefine1],tests/func/parsing/test_errors.py::test_item_key_in_generated_stage_vars[False-redefine2],tests/func/parsing/test_errors.py::test_item_key_in_generated_stage_vars[False-redefine3],tests/func/parsing/test_errors.py::test_foreach_wdir_key_does_not_exist,tests/func/parsing/test_foreach.py::test_with_simple_list_data,tests/func/parsing/test_foreach.py::test_with_dict_data,tests/func/parsing/test_foreach.py::test_with_dict_with_non_str_keys,tests/func/parsing/test_foreach.py::test_with_composite_list,tests/func/parsing/test_foreach.py::test_foreach_interpolated_simple_list,tests/func/parsing/test_foreach.py::test_foreach_interpolate_with_composite_data[foreach_data0-result0-${item.thresh}],tests/func/parsing/test_foreach.py::test_foreach_interpolate_with_composite_data[foreach_data0-result0-${item[thresh]}],tests/func/parsing/test_foreach.py::test_foreach_interpolate_with_composite_data[foreach_data1-result1-${item.thresh}],tests/func/parsing/test_foreach.py::test_foreach_interpolate_with_composite_data[foreach_data1-result1-${item[thresh]}],tests/func/parsing/test_foreach.py::test_params_file_with_dict_tracked,tests/func/parsing/test_foreach.py::test_params_file_tracked_for_composite_list,tests/func/parsing/test_foreach.py::test_foreach_data_from_nested_vars,tests/func/parsing/test_foreach.py::test_foreach_partial_interpolations,tests/func/parsing/test_foreach.py::test_mixed_vars_for_foreach_data,tests/func/parsing/test_foreach.py::
test_mixed_vars_for_foreach_data_2,tests/func/parsing/test_foreach.py::test_foreach_with_interpolated_wdir,tests/func/parsing/test_foreach.py::test_foreach_with_local_vars,tests/func/parsing/test_foreach.py::test_foreach_with_imported_vars[test_params.yaml],tests/func/parsing/test_foreach.py::test_foreach_with_imported_vars[test_params.yaml:train],tests/func/parsing/test_foreach.py::test_foreach_with_imported_vars[test_params.yaml:train,prepare],tests/func/parsing/test_foreach.py::test_foreach_with_interpolated_wdir_and_local_vars[params.yaml],tests/func/parsing/test_foreach.py::test_foreach_with_interpolated_wdir_and_local_vars[params.yaml:train,prepare],tests/func/parsing/test_foreach.py::test_foreach_do_syntax_is_checked_once,tests/func/parsing/test_foreach.py::test_foreach_data_is_only_resolved_once,tests/func/parsing/test_interpolated_entry.py::test_simple,tests/func/parsing/test_interpolated_entry.py::test_vars_import,tests/func/parsing/test_interpolated_entry.py::test_vars_and_params_import,tests/func/parsing/test_interpolated_entry.py::test_stage_with_wdir,tests/func/parsing/test_interpolated_entry.py::test_with_templated_wdir,tests/func/parsing/test_interpolated_entry.py::test_resolve_local_tries_to_load_globally_used_files,tests/func/parsing/test_interpolated_entry.py::test_resolve_local_tries_to_load_globally_used_params_yaml,tests/func/parsing/test_interpolated_entry.py::test_vars_relpath_overwrite,tests/func/parsing/test_interpolated_entry.py::test_vars_load_partial[vars_0-True],tests/func/parsing/test_interpolated_entry.py::test_vars_load_partial[vars_0-False],tests/func/parsing/test_interpolated_entry.py::test_vars_load_partial[vars_1-True],tests/func/parsing/test_interpolated_entry.py::test_vars_load_partial[vars_1-False],tests/func/parsing/test_interpolated_entry.py::test_vars_load_partial[vars_2-True],tests/func/parsing/test_interpolated_entry.py::test_vars_load_partial[vars_2-False],tests/func/parsing/test_interpolated_entry.py::test_vars_load_par
tial[vars_3-True],tests/func/parsing/test_interpolated_entry.py::test_vars_load_partial[vars_3-False],tests/func/parsing/test_interpolated_entry.py::test_cmd_dict[None-None],tests/func/parsing/test_interpolated_entry.py::test_cmd_dict[store_true-nargs],tests/func/parsing/test_interpolated_entry.py::test_cmd_dict[boolean_optional-append],tests/func/parsing/test_matrix.py::test_matrix_interpolated[matrix0],tests/func/parsing/test_matrix.py::test_matrix_interpolated[matrix1],tests/func/parsing/test_matrix.py::test_matrix_key_present[matrix0],tests/func/parsing/test_matrix.py::test_matrix_key_present[matrix1],tests/func/parsing/test_resolver.py::test_resolver,tests/func/parsing/test_resolver.py::test_default_params_file_not_exist,tests/func/parsing/test_resolver.py::test_no_params_yaml_and_vars,tests/func/parsing/test_resolver.py::test_local_vars,tests/func/parsing/test_resolver.py::test_default_params_file[vars_0],tests/func/parsing/test_resolver.py::test_default_params_file[vars_1],tests/func/parsing/test_resolver.py::test_default_params_file[vars_2],tests/func/parsing/test_resolver.py::test_load_vars_from_file,tests/func/parsing/test_resolver.py::test_load_vars_with_relpath,tests/func/parsing/test_resolver.py::test_global_overwrite_error_on_imports,tests/func/parsing/test_resolver.py::test_global_overwrite_vars,tests/func/parsing/test_resolver.py::test_local_declared_vars_overwrite,tests/func/parsing/test_resolver.py::test_vars_already_loaded_message[vars_0-True],tests/func/parsing/test_resolver.py::test_vars_already_loaded_message[vars_0-False],tests/func/parsing/test_resolver.py::test_vars_already_loaded_message[vars_1-True],tests/func/parsing/test_resolver.py::test_vars_already_loaded_message[vars_1-False],tests/func/parsing/test_resolver.py::test_vars_already_loaded_message[vars_2-True],tests/func/parsing/test_resolver.py::test_vars_already_loaded_message[vars_2-False],tests/func/parsing/test_resolver.py::test_local_overwrite_error[vars_0-build.vars[0]],tests/fun
c/parsing/test_resolver.py::test_local_overwrite_error[params.json-params.json],tests/func/parsing/test_top_level.py::test_params,tests/func/parsing/test_top_level.py::test_metrics,tests/func/parsing/test_top_level.py::test_plots,tests/func/parsing/test_top_level.py::test_artifacts,tests/func/parsing/test_top_level.py::test_datasets]
Since the grading logic now assumes failure when a test is not present, the sanity check fails.
Steps to reproduce
python swesmith/harness/eval.py --run_id sanity --predictions_path gold --instance_ids iterative__dvc.1d6ea681.func_pm_class_rm_base__1ygoyhp0
python swesmith/harness/eval.py --run_id sanity --redo_failing --predictions_path gold --instance_ids iterative__dvc.1d6ea681.func_pm_class_rm_base__1ygoyhp0