
Commit c3ac575

Refactor evaluate and introduce construct_result_table method (stanfordnlp#7991)
* set last_result_df when evaluate is called
* refactor evaluate logic to add construct_result_df
* refactor _display_result_table
* fix annotation
* Fix result signature
* Fix doc
1 parent a8df4a7 commit c3ac575
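
For context, the user-facing surface touched here is `Evaluate`'s `display_table` option, whose annotation now matches the existing behavior of accepting either a bool or an int (a row limit for the displayed result table). A minimal usage sketch, assuming an LM has already been configured via `dspy.configure(lm=...)`; the toy devset and metric below are illustrative, not part of this commit:

```python
import dspy
from dspy.evaluate import Evaluate

# Illustrative toy data; any dspy.Example objects with inputs marked work here.
devset = [
    dspy.Example(question="What is 1+1?", answer="2").with_inputs("question"),
    dspy.Example(question="What is 2+2?", answer="4").with_inputs("question"),
]

def exact_match(example, prediction, trace=None):
    # Toy metric: 1.0 when the predicted answer matches the gold answer exactly.
    return float(example.answer == prediction.answer)

evaluator = Evaluate(
    devset=devset,
    metric=exact_match,
    num_threads=2,
    display_progress=True,
    display_table=5,  # int: show at most 5 rows of the result table
)

score = evaluator(dspy.Predict("question -> answer"))
```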

File tree

2 files changed (+103, -48 lines):

* dspy/evaluate/evaluate.py
* tests/evaluate/test_evaluate.py


dspy/evaluate/evaluate.py

Lines changed: 78 additions & 48 deletions
@@ -1,6 +1,6 @@
 import logging
 import types
-from typing import TYPE_CHECKING, Any, Callable, List, Optional
+from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, Tuple
 
 if TYPE_CHECKING:
     import pandas as pd
@@ -54,7 +54,7 @@ def __init__(
         metric: Optional[Callable] = None,
         num_threads: int = 1,
         display_progress: bool = False,
-        display_table: bool = False,
+        display_table: Union[bool, int] = False,
         max_errors: int = 5,
         return_all_scores: bool = False,
         return_outputs: bool = False,
@@ -68,7 +68,8 @@ def __init__(
             metric (Callable): The metric function to use for evaluation.
             num_threads (int): The number of threads to use for parallel evaluation.
             display_progress (bool): Whether to display progress during evaluation.
-            display_table (bool): Whether to display the evaluation results in a table.
+            display_table (Union[bool, int]): Whether to display the evaluation results in a table.
+                If a number is passed, the evaluation results will be truncated to that number before displayed.
             max_errors (int): The maximum number of errors to allow before stopping evaluation.
             return_all_scores (bool): Whether to return scores for every data record in `devset`.
             return_outputs (bool): Whether to return the dspy program's outputs for every data in `devset`.
@@ -94,7 +95,7 @@ def __call__(
         devset: Optional[List["dspy.Example"]] = None,
         num_threads: Optional[int] = None,
         display_progress: Optional[bool] = None,
-        display_table: Optional[bool] = None,
+        display_table: Optional[Union[bool, int]] = None,
         return_all_scores: Optional[bool] = None,
         return_outputs: Optional[bool] = None,
         callback_metadata: Optional[dict[str, Any]] = None,
@@ -108,8 +109,8 @@ def __call__(
                 `self.num_threads`.
             display_progress (bool): Whether to display progress during evaluation. if not provided, use
                 `self.display_progress`.
-            display_table (bool): Whether to display the evaluation results in a table. if not provided, use
-                `self.display_table`.
+            display_table (Union[bool, int]): Whether to display the evaluation results in a table. if not provided, use
+                `self.display_table`. If a number is passed, the evaluation results will be truncated to that number before displayed.
             return_all_scores (bool): Whether to return scores for every data record in `devset`. if not provided,
                 use `self.return_all_scores`.
             return_outputs (bool): Whether to return the dspy program's outputs for every data in `devset`. if not
@@ -174,12 +175,38 @@ def process_item(example):
         ncorrect, ntotal = sum(score for *_, score in results), len(devset)
 
         logger.info(f"Average Metric: {ncorrect} / {ntotal} ({round(100 * ncorrect / ntotal, 1)}%)")
-
-        def prediction_is_dictlike(prediction):
-            # Downstream logic for displaying dictionary-like predictions depends solely on the predictions
-            # having a method called `items()` for iterating through key/value pairs
-            return hasattr(prediction, "items") and callable(getattr(prediction, "items"))
 
+        # Rename the 'correct' column to the name of the metric object
+        metric_name = metric.__name__ if isinstance(metric, types.FunctionType) else metric.__class__.__name__
+        # Construct a pandas DataFrame from the results
+        result_df = self._construct_result_table(results, metric_name)
+
+        if display_table:
+            self._display_result_table(result_df, display_table, metric_name)
+
+        if return_all_scores and return_outputs:
+            return round(100 * ncorrect / ntotal, 2), results, [score for *_, score in results]
+        if return_all_scores:
+            return round(100 * ncorrect / ntotal, 2), [score for *_, score in results]
+        if return_outputs:
+            return round(100 * ncorrect / ntotal, 2), results
+
+        return round(100 * ncorrect / ntotal, 2)
+
+
+    def _construct_result_table(self, results: list[Tuple[dspy.Example, dspy.Example, Any]], metric_name: str) -> "pd.DataFrame":
+        """
+        Construct a pandas DataFrame from the specified result list.
+        Let's not try to change the name of this method as it may be patched by external tracing tools.
+
+        Args:
+            results: The list of results to construct the result DataFrame from.
+            metric_name: The name of the metric used for evaluation.
+
+        Returns:
+            The constructed pandas DataFrame.
+        """
+        import pandas as pd
         data = [
             (
                 merge_dicts(example, prediction) | {"correct": score}
@@ -189,50 +216,53 @@ def prediction_is_dictlike(prediction):
             for example, prediction, score in results
         ]
 
-
-        import pandas as pd
         # Truncate every cell in the DataFrame (DataFrame.applymap was renamed to DataFrame.map in Pandas 2.1.0)
         result_df = pd.DataFrame(data)
        result_df = result_df.map(truncate_cell) if hasattr(result_df, "map") else result_df.applymap(truncate_cell)
 
-        # Rename the 'correct' column to the name of the metric object
-        metric_name = metric.__name__ if isinstance(metric, types.FunctionType) else metric.__class__.__name__
-        result_df = result_df.rename(columns={"correct": metric_name})
+        return result_df.rename(columns={"correct": metric_name})
 
-        if display_table:
-            if isinstance(display_table, bool):
-                df_to_display = result_df.copy()
-                truncated_rows = 0
-            else:
-                df_to_display = result_df.head(display_table).copy()
-                truncated_rows = len(result_df) - display_table
-
-            df_to_display = stylize_metric_name(df_to_display, metric_name)
-
-            display_dataframe(df_to_display)
-
-            if truncated_rows > 0:
-                # Simplified message about the truncated rows
-                message = f"""
-                <div style='
-                    text-align: center;
-                    font-size: 16px;
-                    font-weight: bold;
-                    color: #555;
-                    margin: 10px 0;'>
-                    ... {truncated_rows} more rows not displayed ...
-                </div>
-                """
-                display(HTML(message))
 
-        if return_all_scores and return_outputs:
-            return round(100 * ncorrect / ntotal, 2), results, [score for *_, score in results]
-        if return_all_scores:
-            return round(100 * ncorrect / ntotal, 2), [score for *_, score in results]
-        if return_outputs:
-            return round(100 * ncorrect / ntotal, 2), results
+    def _display_result_table(self, result_df: "pd.DataFrame", display_table: Union[bool, int], metric_name: str):
+        """
+        Display the specified result DataFrame in a table format.
 
-        return round(100 * ncorrect / ntotal, 2)
+        Args:
+            result_df: The result DataFrame to display.
+            display_table: Whether to display the evaluation results in a table.
+                If a number is passed, the evaluation results will be truncated to that number before displayed.
+            metric_name: The name of the metric used for evaluation.
+        """
+        if isinstance(display_table, bool):
+            df_to_display = result_df.copy()
+            truncated_rows = 0
+        else:
+            df_to_display = result_df.head(display_table).copy()
+            truncated_rows = len(result_df) - display_table
+
+        df_to_display = stylize_metric_name(df_to_display, metric_name)
+
+        display_dataframe(df_to_display)
+
+        if truncated_rows > 0:
+            # Simplified message about the truncated rows
+            message = f"""
+            <div style='
+                text-align: center;
+                font-size: 16px;
+                font-weight: bold;
+                color: #555;
+                margin: 10px 0;'>
+                ... {truncated_rows} more rows not displayed ...
+            </div>
+            """
+            display(HTML(message))
+
+
+def prediction_is_dictlike(prediction):
+    # Downstream logic for displaying dictionary-like predictions depends solely on the predictions
+    # having a method called `items()` for iterating through key/value pairs
    return hasattr(prediction, "items") and callable(getattr(prediction, "items"))
 
 
 def merge_dicts(d1, d2) -> dict:
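
The new `_construct_result_table` docstring asks that the method keep its name because external tracing tools may patch it, which lines up with the "set last_result_df when evaluate is called" bullet in the commit message. A rough sketch of that patching pattern, purely illustrative and not taken from any particular tracing library:

```python
import functools

from dspy.evaluate import Evaluate

# Illustrative only: wrap the stable-named hook so every evaluation's result
# DataFrame can be captured (e.g. logged to an experiment tracker).
_original = Evaluate._construct_result_table
_captured = {}

@functools.wraps(_original)
def _capturing_construct_result_table(self, results, metric_name):
    result_df = _original(self, results, metric_name)
    _captured["last_result_df"] = result_df  # hypothetical storage/logging step
    return result_df

Evaluate._construct_result_table = _capturing_construct_result_table
```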

tests/evaluate/test_evaluate.py

Lines changed: 25 additions & 0 deletions
@@ -1,6 +1,7 @@
 import signal
 import threading
 from unittest.mock import patch
+import pandas as pd
 
 import pytest
 
@@ -54,6 +55,30 @@ def test_evaluate_call():
     assert score == 100.0
 
 
+def test_construct_result_df():
+    devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")]
+    ev = Evaluate(
+        devset=devset,
+        metric=answer_exact_match,
+    )
+    results = [
+        (devset[0], {"answer": "2"}, 100.0),
+        (devset[1], {"answer": "4"}, 100.0),
+    ]
+    result_df = ev._construct_result_table(results, answer_exact_match.__name__)
+    pd.testing.assert_frame_equal(
+        result_df,
+        pd.DataFrame(
+            {
+                "question": ["What is 1+1?", "What is 2+2?"],
+                "example_answer": ["2", "4"],
+                "pred_answer": ["2", "4"],
+                "answer_exact_match": [100.0, 100.0],
+            }
+        )
+    )
+
+
 def test_multithread_evaluate_call():
     dspy.settings.configure(lm=DummyLM({"What is 1+1?": {"answer": "2"}, "What is 2+2?": {"answer": "4"}}))
     devset = [new_example("What is 1+1?", "2"), new_example("What is 2+2?", "4")]