Skip to content

Commit 1e37022

Browse files
committed
Fix errors in update CLI call
1 parent 43d7472 commit 1e37022

File tree

5 files changed

+30
-22
lines changed

5 files changed

+30
-22
lines changed

asreviewcontrib/preprocess/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
"year": ["year"],
1818
"ref_type": ["ref_type", "type_of_reference"],
1919
"journal": ["journal"],
20+
"doi": ["doi"],
2021
"volume": ["volume"],
2122
"pages": ["pages", "start_page"],
2223
"number": ["number", "issue"],

asreviewcontrib/preprocess/entry_points/entrypoint.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def execute(self, argv):
147147
)
148148

149149
input_path = update_args.input_path[0]
150-
output_path = ep_utils.get_output_path(update_args)
150+
output_path = ep_utils.get_output_path(update_args, after="updated")
151151

152152
update_records(
153153
input_path=input_path,

asreviewcontrib/preprocess/entry_points/ep_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from datetime import datetime
33

44

5-
def get_output_path(args):
5+
def get_output_path(args, after="deduplicated"):
66
"""Get output path based on user input.
77
88
If path is given, check if it is accepted format.
@@ -21,5 +21,5 @@ def get_output_path(args):
2121
output_path += ".csv"
2222
else:
2323
output_path = os.path.basename(input_path)
24-
output_path = f"{os.path.splitext(output_path)[0]}-deduplicated-{datetime.now().strftime('%Y%m%dT%H%M')}.csv"
24+
output_path = f"{os.path.splitext(output_path)[0]}-{after}-{datetime.now().strftime('%Y%m%dT%H%M')}.csv"
2525
return output_path

asreviewcontrib/preprocess/update_data/crossref_doi_updater.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def retrieve_dois(self, records_df: pd.DataFrame) -> pd.DataFrame:
3737
data_df = records_df.copy()
3838

3939
# Set invalid years to None
40-
data_df[col_specs["year"]][
40+
data_df.loc[:, col_specs["year"]].loc[
4141
np.where(
4242
(data_df[col_specs["year"]] <= 1800)
4343
| (data_df[col_specs["year"]] >= datetime.date.today().year + 2)
@@ -50,20 +50,27 @@ def retrieve_dois(self, records_df: pd.DataFrame) -> pd.DataFrame:
5050
# Make missing title and year values as NAN
5151
cols = [col_specs["title"], col_specs["year"]]
5252
data_df[cols] = (
53-
data_df[cols]
54-
.fillna("")
55-
.applymap(lambda val: np.nan if len(val) == 0 else val)
53+
data_df[cols].fillna("").applymap(lambda val: np.nan if not val else val)
5654
)
5755

5856
# Check if DOI is missing
57+
# TODO: Check if name and year is available in localdb
5958
missing_doi_count = data_df[col_specs["doi"]].isna().sum()
6059
print(f"Requesting Crossref to infer {missing_doi_count} missing DOIs")
6160

6261
data_df[col_specs["title"]] = data_df[col_specs["title"]].apply(
63-
urllib.parse.quote
62+
lambda url: urllib.parse.quote(url) if not pd.isna(url) else np.nan
6463
)
65-
for i, row in data_df[data_df[col_specs["doi"]].isna()].iterrows():
64+
# TODO: Remove limit after testing
65+
counter = 0
66+
for i, row in tqdm(
67+
data_df[data_df[col_specs["doi"]].isna()].iterrows(),
68+
desc="Finding missing DOIs",
69+
):
70+
counter += 1
6671
data_df.loc[i, col_specs["doi"]] = self._crossref_doi_finder(row)
72+
if counter > 5:
73+
break
6774

6875
fixed_doi_count = missing_doi_count - data_df[col_specs["doi"]].isna().sum()
6976
print(

asreviewcontrib/preprocess/update_data/update.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import logging
22

33
import numpy as np
4+
import pandas as pd
45
from asreviewcontrib.preprocess import utils
56
from asreviewcontrib.preprocess.data.load import load_data
67
from asreviewcontrib.preprocess.deduplication import dd_utils
@@ -35,10 +36,11 @@ def update_records(
3536
records_df, _ = load_data(input_path)
3637

3738
col_specs = io_utils._get_column_spec(records_df)
39+
print(f"Column Definitions: {col_specs}")
3840

39-
db = utils._localdb_class_from_entry_point(local_database)
40-
doi_updater = utils._updater_class_from_entry_point(doi_update_method)
41-
data_updater = utils._updater_class_from_entry_point(data_update_method)
41+
db = utils._localdb_class_from_entry_point(local_database)()
42+
doi_updater = utils._updater_class_from_entry_point(doi_update_method)()
43+
data_updater = utils._updater_class_from_entry_point(data_update_method)()
4244

4345
# Get polite access to updater APIs such as Openalex and Crossref
4446
if email:
@@ -85,24 +87,22 @@ def update_records(
8587
retrieved_metadata = data_updater.retrieve_metadata(db, doi_list)
8688
retrieved_records_df = data_updater.parse_metadata(retrieved_metadata)
8789

88-
retrieved_records_df = (
89-
records_df[col_specs["doi"]]
90-
.reset_index()
91-
.merge(retrieved_records_df, on="doi")
92-
.set_index("record_id")
93-
)
90+
records_df_only_doi = pd.DataFrame({"doi": records_df[col_specs["doi"]].values})
91+
retrieved_records_df = records_df_only_doi.merge(
92+
retrieved_records_df, on="doi", how="left"
93+
) # .set_index("record_id")
9494

9595
# Update original df only where the data was missing and is retrieved
9696
updated_records_df = records_df.combine_first(retrieved_records_df)
9797

9898
n_missing_abstracts_after = _get_no_of_missing_abstracts(
9999
updated_records_df, col_specs
100100
)
101-
logging.info(
102-
f"{n_missing_abstracts_before} abstracts were missing.\n"
103-
f"{n_missing_abstracts_before - n_missing_abstracts_after} missing abstracts were retrieved.\n"
104-
f"{n_missing_abstracts_after} abstracts are still missing.\n"
101+
print(f"{n_missing_abstracts_before} abstracts were missing.")
102+
print(
103+
f"{n_missing_abstracts_before - n_missing_abstracts_after} missing abstracts were retrieved."
105104
)
105+
print(f"{n_missing_abstracts_after} abstracts are still missing.\n")
106106

107107
updated_records_df.to_csv(output_path)
108108
print(f"Updated dataset saved to {output_path}")

0 commit comments

Comments (0)