Merge pull request stanfordnlp#1086 from pedramsalimi/main

arnavsinghvi11 · web-flow · commit 1e29689e85bd · 2024-06-17T07:25:06.000-07:00
Fix the issue of handling input_keys using Dataset class (Issue stanfordnlp#898)
diff --git a/dspy/datasets/dataset.py b/dspy/datasets/dataset.py
@@ -6,13 +6,15 @@
 
 
 class Dataset:
-    def __init__(self, train_seed=0, train_size=None, eval_seed=0, dev_size=None, test_size=None):
+    def __init__(self, train_seed=0, train_size=None, eval_seed=0, dev_size=None, test_size=None, input_keys=None):
         self.train_size = train_size
         self.train_seed = train_seed
         self.dev_size = dev_size
         self.dev_seed = eval_seed
         self.test_size = test_size
         self.test_seed = eval_seed
+        self.input_keys = [] if input_keys is None else input_keys  # fresh list per instance, never a shared default
+
         self.do_shuffle = True
 
         self.name = self.__class__.__name__
@@ -73,8 +75,10 @@ def _shuffle_and_sample(self, split, data, size, seed=0):
         output = []
 
         for example in data:
-            output.append(Example(**example, dspy_uuid=str(uuid.uuid4()), dspy_split=split))
-        
+            example_obj = Example(**example, dspy_uuid=str(uuid.uuid4()), dspy_split=split)
+            if self.input_keys:
+                example_obj = example_obj.with_inputs(*self.input_keys)
+            output.append(example_obj)
         # TODO: NOTE: Ideally we use these uuids for dedup internally, for demos and internal train/val splits.
         # Now, some tasks (like convQA and Colors) have overlapping examples. Here, we should allow the user to give us
         # a uuid field that would respect this in some way. This means that we need a more refined concept that
diff --git a/dspy/primitives/example.py b/dspy/primitives/example.py
@@ -1,4 +1,3 @@
-
 class Example:
     def __init__(self, base=None, **kwargs):
         # Internal storage and other attributes
@@ -16,20 +15,20 @@ def __init__(self, base=None, **kwargs):
 
         # Update with provided kwargs
         self._store.update(kwargs)
-    
+
     def __getattr__(self, key):
-        if key.startswith('__') and key.endswith('__'):
+        if key.startswith("__") and key.endswith("__"):
             raise AttributeError
         if key in self._store:
             return self._store[key]
         raise AttributeError(f"'{type(self).__name__}' object has no attribute '{key}'")
 
     def __setattr__(self, key, value):
-        if key.startswith('_') or key in dir(self.__class__):  
+        if key.startswith("_") or key in dir(self.__class__):
             super().__setattr__(key, value)
         else:
             self._store[key] = value
-    
+
     def __getitem__(self, key):
         return self._store[key]
 
@@ -41,55 +40,58 @@ def __delitem__(self, key):
 
     def __contains__(self, key):
         return key in self._store
-    
+
     def __len__(self):
-        return len([k for k in self._store if not k.startswith('dspy_')])
-    
+        return len([k for k in self._store if not k.startswith("dspy_")])
+
     def __repr__(self):
         # return f"Example({self._store})" + f" (input_keys={self._input_keys}, demos={self._demos})"
-        d = {k: v for k, v in self._store.items() if not k.startswith('dspy_')}
+        d = {k: v for k, v in self._store.items() if not k.startswith("dspy_")}
         return f"Example({d})" + f" (input_keys={self._input_keys})"
-    
+
     def __str__(self):
         return self.__repr__()
-    
+
     def __eq__(self, other):
         return isinstance(other, Example) and self._store == other._store
-    
+
     def __hash__(self):
         return hash(tuple(self._store.items()))
 
     def keys(self, include_dspy=False):
-        return [k for k in self._store.keys() if not k.startswith('dspy_') or include_dspy]
-    
+        return [k for k in self._store.keys() if not k.startswith("dspy_") or include_dspy]
+
     def values(self, include_dspy=False):
-        return [v for k, v in self._store.items() if not k.startswith('dspy_') or include_dspy]
+        return [v for k, v in self._store.items() if not k.startswith("dspy_") or include_dspy]
 
     def items(self, include_dspy=False):
-        return [(k, v) for k, v in self._store.items() if not k.startswith('dspy_') or include_dspy]
+        return [(k, v) for k, v in self._store.items() if not k.startswith("dspy_") or include_dspy]
 
     def get(self, key, default=None):
         return self._store.get(key, default)
-    
+
     def with_inputs(self, *keys):
         copied = self.copy()
         copied._input_keys = set(keys)
         return copied
-    
+
     def inputs(self):
         if self._input_keys is None:
             raise ValueError("Inputs have not been set for this example. Use `example.with_inputs()` to set them.")
 
         # return items that are in input_keys
         d = {key: self._store[key] for key in self._store if key in self._input_keys}
-        return type(self)(d)
-    
+        # Wrap only the selected input fields in a new instance and carry the input keys over to it.
+        new_instance = type(self)(base=d)
+        new_instance._input_keys = set(self._input_keys)  # copy, so the two examples never share one mutable set
+        return new_instance
+
     def labels(self):
         # return items that are NOT in input_keys
         input_keys = self.inputs().keys()
         d = {key: self._store[key] for key in self._store if key not in input_keys}
         return type(self)(d)
-    
+
     def __iter__(self):
         return iter(dict(self._store))
 
@@ -101,6 +103,6 @@ def without(self, *keys):
         for key in keys:
             del copied[key]
         return copied
-    
+
     def toDict(self):
         return self._store.copy()
diff --git a/tests/datasets/test_dataset.py b/tests/datasets/test_dataset.py
@@ -0,0 +1,49 @@
+import unittest
+import uuid
+
+import pandas as pd
+
+from dspy import Example
+from dspy.datasets.dataset import Dataset
+
+dummy_data = """content,question,answer
+"This is content 1","What is this?","This is answer 1"
+"This is content 2","What is that?","This is answer 2"
+"""
+
+with open("dummy.csv", "w") as file:
+    file.write(dummy_data)
+
+
+class CSVDataset(Dataset):  # minimal CSV-backed Dataset used only by this test module
+    def __init__(self, file_path, input_keys=None, *args, **kwargs) -> None:
+        super().__init__(*args, input_keys=input_keys, **kwargs)
+        keys = tuple(input_keys or ())  # None (the default) means no input keys; avoids a TypeError on *None
+        data = pd.read_csv(file_path).to_dict(orient="records")
+        self._train = [
+            Example(**record, dspy_uuid=str(uuid.uuid4()), dspy_split="train").with_inputs(*keys)
+            for record in data[:1]
+        ]
+        self._dev = [
+            Example(**record, dspy_uuid=str(uuid.uuid4()), dspy_split="dev").with_inputs(*keys)
+            for record in data[1:2]
+        ]
+
+
+class TestCSVDataset(unittest.TestCase):
+    def test_input_keys(self):
+        dataset = CSVDataset("dummy.csv", input_keys=["content", "question"])
+        self.assertIsNotNone(dataset.train)
+
+        for example in dataset.train:
+            print(example)
+            inputs = example.inputs()
+            print(f"Example inputs: {inputs}")
+            self.assertIsNotNone(inputs)
+            self.assertIn("content", inputs)
+            self.assertIn("question", inputs)
+            self.assertEqual(set(example._input_keys), {"content", "question"})
+
+
+if __name__ == "__main__":
+    unittest.main()