Papers without Code EDA and Benchmarking¶

A very minimal exploratory data analysis of the dataset constructed for testing papers-without-code. The purpose is to quantify "what proportion of papers likely use code for their paper but do not link to it in any way?" and then to analyze how the package performs.

You can read about the dataset construction on the data README.

In [1]:
# Do some basic data reading and prep
import pandas as pd
import seaborn as sns
sns.set_theme(style="whitegrid")

df = pd.read_csv("annotated.csv", dtype={"id": str})

# We have only annotated 25 so far
df = df[:25]
df.head()
Out[1]:
id doi arxiv_link update_date title best_guess_paper_used_code code_repository_linked_in_paper code_repository_link found_via comments
0 2101.10263 NaN https://arxiv.org/abs/2101.10263 2021-01-26 Generative Autoencoder Kernels on Deep Learnin... yes no https://github.com/galtan-PhD/Deep_Autoencoder... google search corresponding author -> github p... NaN
1 2106.09719 NaN https://arxiv.org/abs/2106.09719 2021-12-03 Machining Cycle Time Prediction: Data-driven M... yes no NaN NaN NaN
2 2107.05962 NaN https://arxiv.org/abs/2107.05962 2021-07-23 COLiER: Collaborative Editing of Raster Images yes no NaN NaN NaN
3 2112.08371 10.1007/978-3-030-74009-2 https://arxiv.org/abs/2112.08371 2021-12-17 Blockchain as an IoT intermediary yes no NaN NaN NaN
4 1803.09565 10.1371/journal.pcbi.1006454 https://arxiv.org/abs/1803.09565 2018-09-13 SIG-DB: leveraging homomorphic encryption to S... yes yes https://github.com/BNext-IQT/GEMstone NaN NaN

"How many papers likely use code?"¶

Out of the 25 papers annotated so far, 20 "likely used code" as part of the work behind the paper. The remaining papers are mostly math or theory papers that introduce an algorithm via mathematical notation alone.

In [2]:
sns.countplot(x=df.best_guess_paper_used_code)
Out[2]:
<Axes: xlabel='best_guess_paper_used_code', ylabel='count'>
In [3]:
# Filter out the repos where we don't think code was used
# Usually math or theory papers
df = df[df.best_guess_paper_used_code == "yes"]
len(df)
Out[3]:
20

"Of the 20, how many papers can we find repositories for?"¶

Out of the remaining 20 papers, we can find related repositories for 13. For the papers where we can't find a repository, either it wasn't discoverable within the ~10 minutes I gave to searching for each one, or, in one case, I assume the code is private because both authors are from private industry.

In [4]:
df["code_found"] = ~df.code_repository_link.isna()
sns.countplot(x=df.code_found)
Out[4]:
<Axes: xlabel='code_found', ylabel='count'>

"How do the papers break down by if code was found AND the code had to be manually found (it wasn't linked in the paper)?"¶

Of the 13 papers where related code was found, 8 provided links directly to the code in the paper and for 5 I had to manually search for a repository.

A note on the odd case of "no code was found but a code repository was linked in the paper": the code has since been deleted (or was never published). However, I found a similar repository authored by one of the authors that I feel would be useful to serve back to users.

In [5]:
sns.countplot(x="code_found", hue="code_repository_linked_in_paper", data=df)
Out[5]:
<Axes: xlabel='code_found', ylabel='count'>
In [6]:
df[(df.code_found == False) & (df.code_repository_linked_in_paper == "yes")].iloc[0].comments
Out[6]:
'code has been removed entirely (or was never published) -- lead author account has contributed to: https://github.com/sisl/AutomotiveDrivingModels.jl which is similar'

"How many repositories can we find with our automated methods?"¶

Using the automated search, we found an exact match for the repository used in the paper's method or analysis for only 6 of the 13 papers. Five of those came from papers which provided a link to their repository in the paper itself (out of the 8 that did so), and only one of the 5 repositories I had to find manually was recovered (it showed up in the top three results).

The web application uses the same code as the Python package, so we can do this analysis in this notebook.

In [7]:
# Get a dataframe with just the papers where a code repository was found
# (whether linked in the paper or found manually)
code_found = df.loc[df.code_found]
In [8]:
from papers_without_code import search_for_repos
from tqdm import tqdm

# Attempt to find repos for each paper and sort their results
matching_results_benchmark_rows = []
for _, row in tqdm(code_found.iterrows(), total=len(code_found)):
    # Prepend the search with arxiv
    paper_repo_results = search_for_repos(f"arxiv:{row.id}")

    # Check all results
    found_match = False
    match_category = ""
    for i, repo in enumerate(paper_repo_results):
        # Check for match
        if repo.link == row.code_repository_link:
            found_match = True
            if i == 0:
                match_category = "first"
            elif i < 3:
                match_category = "top three"
            elif i < 5:
                match_category = "top five"
            else:
                match_category = "after top five"

            # Break out and finish up this paper
            break

    # Update row with info
    if found_match:
        row["match"] = match_category
    else:
        row["match"] = "not found"

    # Add row to new dataframe
    matching_results_benchmark_rows.append(row)
    
matching_results_benchmark = pd.DataFrame(matching_results_benchmark_rows)
100%|██████████| 13/13 [04:13<00:00, 19.52s/it]
In [9]:
matching_results_benchmark[["id", "code_repository_linked_in_paper", "match"]]
Out[9]:
id code_repository_linked_in_paper match
0 2101.10263 no not found
4 1803.09565 yes first
5 2202.13538 yes first
6 2111.14338 yes first
9 1706.07119 yes not found
10 2003.01479 yes first
11 2110.06912 no not found
13 2205.04892 no not found
14 1501.05151 no not found
16 2007.10100 no top three
19 2110.05877 yes first
21 2202.06443 yes not found
22 2111.12485 yes not found
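
To make the pattern easier to read, we can cross-tabulate the match category against whether the paper linked its repository. This is a quick summary sketch using the matching_results_benchmark dataframe built above:

# Cross-tabulate the automated-search outcome against whether the paper
# itself linked to the repository
pd.crosstab(
    matching_results_benchmark.code_repository_linked_in_paper,
    matching_results_benchmark.match,
)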

Takeaways¶

The Bad¶

Our search isn't that great at finding the exact matching repository for a paper. I think a large portion of the failures comes down to two things:

  1. GitHub's API has an incredibly strict rate limit. In order to make the search work efficiently, I am using repository searches and not "code searches" (see the sketch after this list). Looking at the tqdm timing from the processing cell above, searches completed in ~20 seconds each on average; that is the tradeoff I had to make.

  2. If you look at my original annotation data, I left notes on how I found several of the repositories. In many cases I searched for one of the paper authors' GitHub profiles and then looked only at their repositories. The current search just does keyword searches across all repositories.
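
For reference, here is a minimal sketch of the kind of repository search the rate limit pushes us toward, calling GitHub's REST search endpoint directly. The query terms and the GITHUB_TOKEN environment variable are assumptions for illustration; this is not how the package itself is wired up.

import os
import requests

# Hypothetical query: a few keywords pulled from a paper title
query = "homomorphic encryption genomic search"

# Authenticated requests get a higher search rate limit, so a token is
# assumed to be available via the GITHUB_TOKEN environment variable
headers = {"Accept": "application/vnd.github+json"}
token = os.environ.get("GITHUB_TOKEN")
if token:
    headers["Authorization"] = f"Bearer {token}"

# One repository-search request returns a whole page of candidate repos,
# which is far cheaper against the rate limit than per-file code search
response = requests.get(
    "https://api.github.com/search/repositories",
    params={"q": query, "per_page": 5},
    headers=headers,
)
response.raise_for_status()

# The remaining search quota is reported in the response headers
print("Remaining search requests:", response.headers.get("X-RateLimit-Remaining"))
for item in response.json()["items"]:
    print(item["full_name"], item["html_url"])

GitHub's search syntax also supports a user: qualifier, so scoping a search to a known author account (as I did manually during annotation) would just mean changing the query to something like f"user:{author_login} {query}", assuming the author's GitHub login can be resolved first.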

The Good¶

It isn't all a waste, in my opinion, though. While the original intent was to find the GitHub repository that exactly matches a paper, we are able to find repositories that are similar or may still be useful to the user. I don't have data to back this up, but from my own experience, I have found repositories related to my own research that I had never seen before but are incredibly relevant.
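
As a rough illustration, surfacing the top few candidates for one of the papers where we did not find an exact match only takes the same call used in the benchmark loop. The choice of paper and the cutoff of three are arbitrary, and only the link attribute already used above is assumed:

# Show the top few candidate repositories for a paper where no exact
# match was found -- they may still be relevant to the reader
for i, repo in enumerate(search_for_repos("arxiv:2101.10263")):
    if i >= 3:
        break
    print(repo.link)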