Minor fixes to code and documentation.

trebedea · trebedea · commit 9d4ab08055e6 · 2023-06-28T23:36:06.000+03:00
diff --git a/nemoguardrails/eval/data/topical/README.md b/nemoguardrails/eval/data/topical/README.md
@@ -28,7 +28,7 @@ For additional information about topical rails evaluation and results on the two
 ### Chit-chat dataset
 
 We are using a slightly modified version of the chit-chat dataset available [here](https://github.com/rahul051296/small-talk-rasa-stack).
-For this dataset, we have configured a [Guardrail app](./chitchat) that already has the:
+For this dataset, we have configured a [Guardrail app](./chitchat) that already has:
 - Config file: `config.yml`
 - A set of defined flows: `flows.co`
 - A set of predefined bot messages for the topical rails: `bot.co`
@@ -51,7 +51,7 @@ To run the topical evaluation on this dataset run:
 ### Banking dataset
 
 We are starting from the banking dataset available [here](https://github.com/PolyAI-LDN/task-specific-datasets/tree/master/banking_data).
-For this dataset, we have configured a [Guardrail app](./banking) that already has the:
+For this dataset, we have configured a [Guardrail app](./banking) that already has:
 - Config file: `config.yml`
 - A set of defined flows: `flows.co`
 - A file mapping the user intents in the original dataset to user canonical forms used by Guardrails: `categories_canonical_forms.json`
@@ -73,5 +73,7 @@ To run the topical evaluation on this dataset run:
 
 If you want to assess the performance of topical rails with a new NLU dataset, you can use the `./nemoguardrails/eval/data/topical/dataset_tools.py` functionality.
 For each dataset, you need to define a new class that extends the `DatasetConnector` class and implements the following two functions:
-- `read_dataset`
-- `_read_canonical_forms`
+- `read_dataset`: Reads the dataset from the specified path, instantiating at least intent names, intent canonical forms, and intent samples.
+The path received as a parameter should contain the original dataset files, in the specific format in which they were distributed.
+- `_read_canonical_forms`: Reads the intent - canonical form mappings from a file.
+This can be a `json` or any other format and should be created by the evaluation user as the mapping is not part of the original dataset.
diff --git a/nemoguardrails/eval/data/topical/create_colang_intent_file.py b/nemoguardrails/eval/data/topical/create_colang_intent_file.py
@@ -62,21 +62,6 @@ def main(
             output_file_name="./chitchat/user.co",
             num_samples_per_intent=max_samples_intent,
         )
-
-        with open("./chitchat/user.co") as file1:
-            lines1 = file1.readlines()
-            intents1 = []
-            for line in lines1:
-                if line.startswith("define user"):
-                    intents1.append(line)
-
-        with open("./../../../../evals/config/chitchat/user.co") as file2:
-            lines2 = file2.readlines()
-            for line in lines2:
-                if line.startswith("define user"):
-                    if line not in intents1:
-                        print("Not found: " + line)
-
         print("Created user.co file for banking dataset.")
     else:
         print(f"Unknown dataset {dataset_name}, cannot create user.co file!")
diff --git a/nemoguardrails/eval/data/topical/dataset_tools.py b/nemoguardrails/eval/data/topical/dataset_tools.py
@@ -139,8 +139,10 @@ def _read_canonical_forms(
             for intent_canonical_entry in data:
                 if len(intent_canonical_entry) != 2:
                     print(
-                        "Problem: no canonical form found or too many canonical forms!"
+                        f"Problem: no canonical form found or too many canonical forms "
+                        f"for entry {intent_canonical_entry}!"
                     )
+                    continue
                 intent = intent_canonical_entry[0]
                 canonical_form = intent_canonical_entry[1]
                 intent_canonical_forms[intent] = canonical_form
@@ -186,8 +188,6 @@ def read_dataset(self, dataset_path: str = BANKING77_FOLDER) -> None:
                         )
                     )
 
-        return None
-
 
 class ChitChatConnector(DatasetConnector):
     CHITCHAT_FOLDER = "./chitchat/original_dataset/"
@@ -208,8 +208,10 @@ def _read_canonical_forms(
             for intent_canonical_entry in data:
                 if len(intent_canonical_entry) != 2:
                     print(
-                        "Problem: no canonical form found or too many canonical forms!"
+                        f"Problem: no canonical form found or too many canonical forms "
+                        f"for entry {intent_canonical_entry}!"
                     )
+                    continue
                 intent = intent_canonical_entry[0]
                 canonical_form = intent_canonical_entry[1]
                 intent_canonical_forms[intent] = canonical_form
@@ -261,5 +263,3 @@ def read_dataset(self, dataset_path: str = CHITCHAT_FOLDER) -> None:
                                     intent=intent, text=text, dataset_split=dataset_type
                                 )
                             )
-
-        return None

Original file line number	Diff line number	Diff line change
`@@ -139,8 +139,10 @@ def _read_canonical_forms(`
`139`	`139`	`for intent_canonical_entry in data:`
`140`	`140`	`if len(intent_canonical_entry) != 2:`
`141`	`141`	`print(`
`142`		`- "Problem: no canonical form found or too many canonical forms!"`
	`142`	`+ f"Problem: no canonical form found or too many canonical forms "`
	`143`	`+ f"for entry {intent_canonical_entry}!"`
`143`	`144`	`)`
	`145`	`+ continue`
`144`	`146`	`intent = intent_canonical_entry[0]`
`145`	`147`	`canonical_form = intent_canonical_entry[1]`
`146`	`148`	`intent_canonical_forms[intent] = canonical_form`
`@@ -186,8 +188,6 @@ def read_dataset(self, dataset_path: str = BANKING77_FOLDER) -> None:`
`186`	`188`	`)`
`187`	`189`	`)`
`188`	`190`
`189`		`- return None`
`190`		`-`
`191`	`191`
`192`	`192`	`class ChitChatConnector(DatasetConnector):`
`193`	`193`	`CHITCHAT_FOLDER = "./chitchat/original_dataset/"`
`@@ -208,8 +208,10 @@ def _read_canonical_forms(`
`208`	`208`	`for intent_canonical_entry in data:`
`209`	`209`	`if len(intent_canonical_entry) != 2:`
`210`	`210`	`print(`
`211`		`- "Problem: no canonical form found or too many canonical forms!"`
	`211`	`+ f"Problem: no canonical form found or too many canonical forms "`
	`212`	`+ f"for entry {intent_canonical_entry}!"`
`212`	`213`	`)`
	`214`	`+ continue`
`213`	`215`	`intent = intent_canonical_entry[0]`
`214`	`216`	`canonical_form = intent_canonical_entry[1]`
`215`	`217`	`intent_canonical_forms[intent] = canonical_form`
`@@ -261,5 +263,3 @@ def read_dataset(self, dataset_path: str = CHITCHAT_FOLDER) -> None:`
`261`	`263`	`intent=intent, text=text, dataset_split=dataset_type`
`262`	`264`	`)`
`263`	`265`	`)`
`264`		`-`
`265`		`- return None`