fix(dspy): Example.toDict() fails to serialize dspy.History objects (stanfordnlp#9047)

Copilot · TomeHirata · web-flow · commit a5671ef1e99e · 2025-11-17T16:37:47.000+09:00
* Initial plan

* Fix Example.toDict() to serialize dspy.History objects properly

Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>

* Consolidate History tests into existing test_evaluate.py

Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>

* Refactor: move toDict() call to merge_dicts for cleaner code

Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>

* Use isinstance(BaseModel) instead of hasattr for Pydantic check

Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>

* Simplify _prepare_results_output to always use toDict()

Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>

* Remove unnecessary try/except for pydantic import since it's a core dependency

Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>
diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
@@ -233,7 +233,7 @@ def _prepare_results_output(
             (
                 merge_dicts(example, prediction) | {metric_name: score}
                 if prediction_is_dictlike(prediction)
-                else dict(example) | {"prediction": prediction, metric_name: score}
+                else example.toDict() | {"prediction": prediction, metric_name: score}
             )
             for example, prediction, score in results
         ]
@@ -305,6 +305,12 @@ def prediction_is_dictlike(prediction):
 
 
 def merge_dicts(d1, d2) -> dict:
+    # Convert to dict if objects have toDict method (e.g., Example objects)
+    if hasattr(d1, "toDict"):
+        d1 = d1.toDict()
+    if hasattr(d2, "toDict"):
+        d2 = d2.toDict()
+
     merged = {}
     for k, v in d1.items():
         if k in d2:
diff --git a/dspy/primitives/example.py b/dspy/primitives/example.py
@@ -1,3 +1,6 @@
+from pydantic import BaseModel
+
+
 class Example:
     """A flexible data container for DSPy examples and training data.
 
@@ -193,6 +196,9 @@ def toDict(self):  # noqa: N802
         def convert_to_serializable(value):
             if hasattr(value, "toDict"):
                 return value.toDict()
+            elif isinstance(value, BaseModel):
+                # Handle Pydantic models (e.g., dspy.History)
+                return value.model_dump()
             elif isinstance(value, list):
                 return [convert_to_serializable(item) for item in value]
             elif isinstance(value, dict):
diff --git a/tests/evaluate/test_evaluate.py b/tests/evaluate/test_evaluate.py
@@ -1,4 +1,6 @@
+import json
 import signal
+import tempfile
 import threading
 from unittest.mock import patch
 
@@ -261,3 +263,136 @@ def on_evaluate_end(
 def test_evaluation_result_repr():
     result = EvaluationResult(score=100.0, results=[(new_example("What is 1+1?", "2"), {"answer": "2"}, 100.0)])
     assert repr(result) == "EvaluationResult(score=100.0, results=<list of 1 results>)"
+
+
+def test_evaluate_save_as_json_with_history():
+    """Test that save_as_json works with Examples containing dspy.History objects."""
+    # Setup
+    dspy.settings.configure(
+        lm=DummyLM(
+            {
+                "What is 1+1?": {"answer": "2"},
+                "What is 2+2?": {"answer": "4"},
+            }
+        )
+    )
+
+    # Create history objects
+    history1 = dspy.History(
+        messages=[
+            {"question": "Previous Q1", "answer": "Previous A1"},
+        ]
+    )
+    history2 = dspy.History(
+        messages=[
+            {"question": "Previous Q2", "answer": "Previous A2"},
+            {"question": "Previous Q3", "answer": "Previous A3"},
+        ]
+    )
+
+    # Create examples with history
+    devset = [
+        dspy.Example(question="What is 1+1?", answer="2", history=history1).with_inputs("question"),
+        dspy.Example(question="What is 2+2?", answer="4", history=history2).with_inputs("question"),
+    ]
+
+    program = Predict("question -> answer")
+
+    # Create evaluator with save_as_json
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        temp_json = f.name
+
+    try:
+        evaluator = Evaluate(
+            devset=devset,
+            metric=answer_exact_match,
+            display_progress=False,
+            save_as_json=temp_json,
+        )
+
+        result = evaluator(program)
+        assert result.score == 100.0
+
+        # Verify JSON file was created and is valid
+        with open(temp_json) as f:
+            data = json.load(f)
+
+        assert len(data) == 2
+
+        # Verify history was properly serialized in first record
+        assert "history" in data[0]
+        assert isinstance(data[0]["history"], dict)
+        assert "messages" in data[0]["history"]
+        assert len(data[0]["history"]["messages"]) == 1
+        assert data[0]["history"]["messages"][0] == {"question": "Previous Q1", "answer": "Previous A1"}
+
+        # Verify history was properly serialized in second record
+        assert "history" in data[1]
+        assert isinstance(data[1]["history"], dict)
+        assert "messages" in data[1]["history"]
+        assert len(data[1]["history"]["messages"]) == 2
+        assert data[1]["history"]["messages"][0] == {"question": "Previous Q2", "answer": "Previous A2"}
+        assert data[1]["history"]["messages"][1] == {"question": "Previous Q3", "answer": "Previous A3"}
+
+    finally:
+        import os
+        if os.path.exists(temp_json):
+            os.unlink(temp_json)
+
+
+def test_evaluate_save_as_csv_with_history():
+    """Test that save_as_csv works with Examples containing dspy.History objects."""
+    # Setup
+    dspy.settings.configure(
+        lm=DummyLM(
+            {
+                "What is 1+1?": {"answer": "2"},
+            }
+        )
+    )
+
+    # Create history object
+    history = dspy.History(
+        messages=[
+            {"question": "Previous Q", "answer": "Previous A"},
+        ]
+    )
+
+    # Create example with history
+    devset = [
+        dspy.Example(question="What is 1+1?", answer="2", history=history).with_inputs("question"),
+    ]
+
+    program = Predict("question -> answer")
+
+    # Create evaluator with save_as_csv
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
+        temp_csv = f.name
+
+    try:
+        evaluator = Evaluate(
+            devset=devset,
+            metric=answer_exact_match,
+            display_progress=False,
+            save_as_csv=temp_csv,
+        )
+
+        result = evaluator(program)
+        assert result.score == 100.0
+
+        # Verify CSV file was created
+        import csv
+        with open(temp_csv) as f:
+            reader = csv.DictReader(f)
+            rows = list(reader)
+
+        assert len(rows) == 1
+        assert "history" in rows[0]
+        # CSV will have string representation of the dict
+        assert "messages" in rows[0]["history"]
+
+    finally:
+        import os
+        if os.path.exists(temp_csv):
+            os.unlink(temp_csv)
+
diff --git a/tests/primitives/test_example.py b/tests/primitives/test_example.py
@@ -123,3 +123,34 @@ def test_example_copy_without():
 def test_example_to_dict():
     example = Example(a=1, b=2)
     assert example.toDict() == {"a": 1, "b": 2}
+
+
+def test_example_to_dict_with_history():
+    """Test that Example.toDict() properly serializes dspy.History objects."""
+    history = dspy.History(
+        messages=[
+            {"question": "What is the capital of France?", "answer": "Paris"},
+            {"question": "What is the capital of Germany?", "answer": "Berlin"},
+        ]
+    )
+    example = Example(question="Test question", history=history, answer="Test answer")
+
+    result = example.toDict()
+
+    # Verify the result is a dictionary
+    assert isinstance(result, dict)
+    assert "history" in result
+
+    # Verify history is serialized to a dict (not a History object)
+    assert isinstance(result["history"], dict)
+    assert "messages" in result["history"]
+    assert result["history"]["messages"] == [
+        {"question": "What is the capital of France?", "answer": "Paris"},
+        {"question": "What is the capital of Germany?", "answer": "Berlin"},
+    ]
+
+    # Verify JSON serialization works
+    import json
+    json_str = json.dumps(result)
+    restored = json.loads(json_str)
+    assert restored["history"]["messages"] == result["history"]["messages"]

Original file line number	Diff line number	Diff line change
`@@ -233,7 +233,7 @@ def _prepare_results_output(`
`233`	`233`	`(`
`234`	`234`	`merge_dicts(example, prediction) \| {metric_name: score}`
`235`	`235`	`if prediction_is_dictlike(prediction)`
`236`		`- else dict(example) \| {"prediction": prediction, metric_name: score}`
	`236`	`+ else example.toDict() \| {"prediction": prediction, metric_name: score}`
`237`	`237`	`)`
`238`	`238`	`for example, prediction, score in results`
`239`	`239`	`]`
`@@ -305,6 +305,12 @@ def prediction_is_dictlike(prediction):`
`305`	`305`
`306`	`306`
`307`	`307`	`def merge_dicts(d1, d2) -> dict:`
	`308`	`+ # Convert to dict if objects have toDict method (e.g., Example objects)`
	`309`	`+ if hasattr(d1, "toDict"):`
	`310`	`+ d1 = d1.toDict()`
	`311`	`+ if hasattr(d2, "toDict"):`
	`312`	`+ d2 = d2.toDict()`
	`313`	`+`
`308`	`314`	`merged = {}`
`309`	`315`	`for k, v in d1.items():`
`310`	`316`	`if k in d2:`