Skip to content

Commit 1e37022

Browse files
committed
Fix errors in update CLI call
1 parent 43d7472 commit 1e37022

File tree

5 files changed

+30
-22
lines changed

5 files changed

+30
-22
lines changed

asreviewcontrib/preprocess/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
"year": ["year"],
1818
"ref_type": ["ref_type", "type_of_reference"],
1919
"journal": ["journal"],
20+
"doi": ["doi"],
2021
"volume": ["volume"],
2122
"pages": ["pages", "start_page"],
2223
"number": ["number", "issue"],

asreviewcontrib/preprocess/entry_points/entrypoint.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def execute(self, argv):
147147
)
148148

149149
input_path = update_args.input_path[0]
150-
output_path = ep_utils.get_output_path(update_args)
150+
output_path = ep_utils.get_output_path(update_args, after="updated")
151151

152152
update_records(
153153
input_path=input_path,

asreviewcontrib/preprocess/entry_points/ep_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from datetime import datetime
33

44

5-
def get_output_path(args):
5+
def get_output_path(args, after="deduplicated"):
66
"""Get output path based on user input.
77
88
If path is given, check if it is accepted format.
@@ -21,5 +21,5 @@ def get_output_path(args):
2121
output_path += ".csv"
2222
else:
2323
output_path = os.path.basename(input_path)
24-
output_path = f"{os.path.splitext(output_path)[0]}-deduplicated-{datetime.now().strftime('%Y%m%dT%H%M')}.csv"
24+
output_path = f"{os.path.splitext(output_path)[0]}-{after}-{datetime.now().strftime('%Y%m%dT%H%M')}.csv"
2525
return output_path

asreviewcontrib/preprocess/update_data/crossref_doi_updater.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def retrieve_dois(self, records_df: pd.DataFrame) -> pd.DataFrame:
3737
data_df = records_df.copy()
3838

3939
# Set invalid years to None
40-
data_df[col_specs["year"]][
40+
data_df.loc[:, col_specs["year"]].loc[
4141
np.where(
4242
(data_df[col_specs["year"]] <= 1800)
4343
| (data_df[col_specs["year"]] >= datetime.date.today().year + 2)
@@ -50,20 +50,27 @@ def retrieve_dois(self, records_df: pd.DataFrame) -> pd.DataFrame:
5050
# Make missing title and year values as NAN
5151
cols = [col_specs["title"], col_specs["year"]]
5252
data_df[cols] = (
53-
data_df[cols]
54-
.fillna("")
55-
.applymap(lambda val: np.nan if len(val) == 0 else val)
53+
data_df[cols].fillna("").applymap(lambda val: np.nan if not val else val)
5654
)
5755

5856
# Check if DOI is missing
57+
# TODO: Check if name and year is available in localdb
5958
missing_doi_count = data_df[col_specs["doi"]].isna().sum()
6059
print(f"Requesting Crossref to infer {missing_doi_count} missing DOIs")
6160

6261
data_df[col_specs["title"]] = data_df[col_specs["title"]].apply(
63-
urllib.parse.quote
62+
lambda url: urllib.parse.quote(url) if not pd.isna(url) else np.nan
6463
)
65-
for i, row in data_df[data_df[col_specs["doi"]].isna()].iterrows():
64+
# TODO: Remove limit after testing
65+
counter = 0
66+
for i, row in tqdm(
67+
data_df[data_df[col_specs["doi"]].isna()].iterrows(),
68+
desc="Finding missing DOIs",
69+
):
70+
counter += 1
6671
data_df.loc[i, col_specs["doi"]] = self._crossref_doi_finder(row)
72+
if counter > 5:
73+
break
6774

6875
fixed_doi_count = missing_doi_count - data_df[col_specs["doi"]].isna().sum()
6976
print(

asreviewcontrib/preprocess/update_data/update.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import logging
22

33
import numpy as np
4+
import pandas as pd
45
from asreviewcontrib.preprocess import utils
56
from asreviewcontrib.preprocess.data.load import load_data
67
from asreviewcontrib.preprocess.deduplication import dd_utils
@@ -35,10 +36,11 @@ def update_records(
3536
records_df, _ = load_data(input_path)
3637

3738
col_specs = io_utils._get_column_spec(records_df)
39+
print(f"Column Definitions: {col_specs}")
3840

39-
db = utils._localdb_class_from_entry_point(local_database)
40-
doi_updater = utils._updater_class_from_entry_point(doi_update_method)
41-
data_updater = utils._updater_class_from_entry_point(data_update_method)
41+
db = utils._localdb_class_from_entry_point(local_database)()
42+
doi_updater = utils._updater_class_from_entry_point(doi_update_method)()
43+
data_updater = utils._updater_class_from_entry_point(data_update_method)()
4244

4345
# Get polite access to updater APIs such as Openalex and Crossref
4446
if email:
@@ -85,24 +87,22 @@ def update_records(
8587
retrieved_metadata = data_updater.retrieve_metadata(db, doi_list)
8688
retrieved_records_df = data_updater.parse_metadata(retrieved_metadata)
8789

88-
retrieved_records_df = (
89-
records_df[col_specs["doi"]]
90-
.reset_index()
91-
.merge(retrieved_records_df, on="doi")
92-
.set_index("record_id")
93-
)
90+
records_df_only_doi = pd.DataFrame({"doi": records_df[col_specs["doi"]].values})
91+
retrieved_records_df = records_df_only_doi.merge(
92+
retrieved_records_df, on="doi", how="left"
93+
) # .set_index("record_id")
9494

9595
# Update original df only where the data was missing and is retrieved
9696
updated_records_df = records_df.combine_first(retrieved_records_df)
9797

9898
n_missing_abstracts_after = _get_no_of_missing_abstracts(
9999
updated_records_df, col_specs
100100
)
101-
logging.info(
102-
f"{n_missing_abstracts_before} abstracts were missing.\n"
103-
f"{n_missing_abstracts_before - n_missing_abstracts_after} missing abstracts were retrieved.\n"
104-
f"{n_missing_abstracts_after} abstracts are still missing.\n"
101+
print(f"{n_missing_abstracts_before} abstracts were missing.")
102+
print(
103+
f"{n_missing_abstracts_before - n_missing_abstracts_after} missing abstracts were retrieved."
105104
)
105+
print(f"{n_missing_abstracts_after} abstracts are still missing.\n")
106106

107107
updated_records_df.to_csv(output_path)
108108
print(f"Updated dataset saved to {output_path}")

0 commit comments

Comments (0)