Skip to content

Commit 3c2a4de

Browse files
Add PR action to validate notebook format (#1793)
1 parent 8fd8b9b commit 3c2a4de

File tree

5 files changed

+124
-39
lines changed

5 files changed

+124
-39
lines changed

.github/scripts/check_notebooks.py

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import subprocess
2+
import sys
3+
from pathlib import Path
4+
5+
import nbformat
6+
7+
8+
def get_changed_notebooks(base_ref: str = "origin/main") -> list[Path]:
    """
    Return the notebook paths that git reports as changed against a base ref.

    Runs ``git diff --name-only -z <base_ref> -- *.ipynb`` in the current
    working directory and wraps every reported name in a ``Path``. Entries for
    deleted notebooks are returned as well, so callers that need the file on
    disk must check for its existence themselves.

    Args:
        base_ref: Git revision to diff against.

    Returns:
        One ``Path`` per changed ``.ipynb`` file, in the order git lists them.

    Raises:
        subprocess.CalledProcessError: If ``git diff`` exits with a non-zero
            status (``check=True``).
    """
    result = subprocess.run(
        # -z makes git emit NUL-terminated names verbatim. Without it, paths
        # containing non-ASCII or special characters are C-quoted
        # ("caf\303\251.ipynb"), which yields a Path that never exists on disk.
        ["git", "diff", "--name-only", "-z", base_ref, "--", "*.ipynb"],
        capture_output=True,
        text=True,
        check=True,
    )
    # The output ends with a trailing NUL, so drop the empty final field.
    return [Path(name) for name in result.stdout.split("\0") if name]
20+
21+
22+
def is_valid_notebook(path: Path) -> bool:
    """
    Report whether the notebook at ``path`` can be read and passes validation.

    The file is opened as UTF-8, parsed with ``nbformat.read`` (converted to
    notebook format version 4) and then checked with ``nbformat.validate``.
    Any failure along the way is printed as ``<path>: INVALID - <error>``.

    Args:
        path: Location of the ``.ipynb`` file to check.

    Returns:
        ``True`` if the notebook was read and validated without an exception,
        ``False`` otherwise.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            notebook = nbformat.read(f, as_version=4)
        # nbformat.read() only logs schema violations instead of raising them,
        # so validate explicitly to make a schema-invalid notebook fail here.
        nbformat.validate(notebook)
    except Exception as e:
        # Deliberately broad: whatever prevents reading or validating the
        # notebook (I/O, decoding, JSON, schema) counts as "invalid".
        print(f"{path}: INVALID - {e}")
        return False
    return True
34+
35+
36+
def main() -> None:
    """
    Validate every changed notebook and signal the outcome via the exit status.

    Exits with status 0 immediately when no notebooks changed, exits with
    status 1 when at least one notebook that exists on disk fails validation,
    and returns normally when all of them pass.
    """
    notebooks = get_changed_notebooks()
    if len(notebooks) == 0:
        print("No changed .ipynb files to validate.")
        sys.exit(0)

    print(f"Validating {len(notebooks)} notebook(s)...")
    # Paths that are no longer on disk (deleted files) are skipped; the
    # existence check runs first so the validator is never called for them.
    failures = sum(
        1
        for notebook in notebooks
        if notebook.exists() and not is_valid_notebook(notebook)
    )

    if failures == 0:
        print("All changed notebooks are valid.")
        return

    print(f"{failures} invalid notebook(s) found.")
    sys.exit(1)


# Run the check only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Runs the notebook format check script on every pull request.
name: Validate Changed Notebooks

on: [pull_request]

jobs:
  validate-notebooks:
    name: Validate Notebooks
    runs-on: ubuntu-latest

    steps:
      # v4 of the checkout action replaces v3, which runs on the deprecated
      # Node 16 runtime.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0  # needed for git diff to work

      # v5 of setup-python replaces v4 for the same runtime reason.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: pip install nbformat

      - name: Validate changed .ipynb files
        run: python .github/scripts/check_notebooks.py

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -140,3 +140,6 @@ examples/fine-tuned_qa/local_cache/*
140140

141141
# PyCharm files
142142
.idea/
143+
144+
# VS Code files
145+
.vscode/

examples/vector_databases/chroma/hyde-with-chroma-and-openai.ipynb

+34-38
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,6 @@
4141
"name": "stdout",
4242
"output_type": "stream",
4343
"text": [
44-
"\u001b[33mDEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063\u001b[0m\u001b[33m\n",
45-
"\u001b[0m\n",
46-
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n",
47-
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
4844
"Note: you may need to restart the kernel to use updated packages.\n"
4945
]
5046
}
@@ -236,7 +232,7 @@
236232
"def build_prompt(claim):\n",
237233
" return [\n",
238234
" {\"role\": \"system\", \"content\": \"I will ask you to assess a scientific claim. Output only the text 'True' if the claim is true, 'False' if the claim is false, or 'NEE' if there's not enough evidence.\"},\n",
239-
" {\"role\": \"user\", \"content\": f\"\"\" \n",
235+
" {\"role\": \"user\", \"content\": f\"\"\"\n",
240236
"Example:\n",
241237
"\n",
242238
"Claim:\n",
@@ -298,7 +294,7 @@
298294
"# Let's take a look at 50 claims\n",
299295
"samples = claim_df.sample(50)\n",
300296
"\n",
301-
"claims = samples['claim'].tolist() \n"
297+
"claims = samples['claim'].tolist()\n"
302298
]
303299
},
304300
{
@@ -317,7 +313,7 @@
317313
"def get_groundtruth(evidence):\n",
318314
" groundtruth = []\n",
319315
" for e in evidence:\n",
320-
" # Evidence is empty \n",
316+
" # Evidence is empty\n",
321317
" if len(e) == 0:\n",
322318
" groundtruth.append('NEE')\n",
323319
" else:\n",
@@ -392,17 +388,17 @@
392388
"text": [
393389
"\tGroundtruth\n",
394390
"\tTrue\tFalse\tNEE\n",
395-
"True\t12\t4\t16\t\n",
396-
"False\t0\t4\t3\t\n",
397-
"NEE\t6\t2\t3\t\n"
391+
"True\t9\t3\t15\t\n",
392+
"False\t0\t3\t2\t\n",
393+
"NEE\t8\t6\t4\t\n"
398394
]
399395
},
400396
{
401397
"data": {
402398
"text/plain": [
403-
"{'True': {'True': 12, 'False': 4, 'NEE': 16},\n",
404-
" 'False': {'True': 0, 'False': 4, 'NEE': 3},\n",
405-
" 'NEE': {'True': 6, 'False': 2, 'NEE': 3}}"
399+
"{'True': {'True': 9, 'False': 3, 'NEE': 15},\n",
400+
" 'False': {'True': 0, 'False': 3, 'NEE': 2},\n",
401+
" 'NEE': {'True': 8, 'False': 6, 'NEE': 4}}"
406402
]
407403
},
408404
"execution_count": 10,
@@ -631,13 +627,13 @@
631627
"outputs": [],
632628
"source": [
633629
"def build_prompt_with_context(claim, context):\n",
634-
" return [{'role': 'system', 'content': \"I will ask you to assess whether a particular scientific claim, based on evidence provided. Output only the text 'True' if the claim is true, 'False' if the claim is false, or 'NEE' if there's not enough evidence.\"}, \n",
630+
" return [{'role': 'system', 'content': \"I will ask you to assess whether a particular scientific claim, based on evidence provided. Output only the text 'True' if the claim is true, 'False' if the claim is false, or 'NEE' if there's not enough evidence.\"},\n",
635631
" {'role': 'user', 'content': f\"\"\"\"\n",
636632
"The evidence is the following:\n",
637633
"\n",
638634
"{' '.join(context)}\n",
639635
"\n",
640-
"Assess the following claim on the basis of the evidence. Output only the text 'True' if the claim is true, 'False' if the claim is false, or 'NEE' if there's not enough evidence. Do not output any other text. \n",
636+
"Assess the following claim on the basis of the evidence. Output only the text 'True' if the claim is true, 'False' if the claim is false, or 'NEE' if there's not enough evidence. Do not output any other text.\n",
641637
"\n",
642638
"Claim:\n",
643639
"{claim}\n",
@@ -683,17 +679,17 @@
683679
"text": [
684680
"\tGroundtruth\n",
685681
"\tTrue\tFalse\tNEE\n",
686-
"True\t13\t0\t3\t\n",
687-
"False\t0\t9\t3\t\n",
688-
"NEE\t5\t1\t16\t\n"
682+
"True\t13\t1\t4\t\n",
683+
"False\t1\t10\t2\t\n",
684+
"NEE\t3\t1\t15\t\n"
689685
]
690686
},
691687
{
692688
"data": {
693689
"text/plain": [
694-
"{'True': {'True': 13, 'False': 0, 'NEE': 3},\n",
695-
" 'False': {'True': 0, 'False': 9, 'NEE': 3},\n",
696-
" 'NEE': {'True': 5, 'False': 1, 'NEE': 16}}"
690+
"{'True': {'True': 13, 'False': 1, 'NEE': 4},\n",
691+
" 'False': {'True': 1, 'False': 10, 'NEE': 2},\n",
692+
" 'NEE': {'True': 3, 'False': 1, 'NEE': 15}}"
697693
]
698694
},
699695
"execution_count": 16,
@@ -774,17 +770,17 @@
774770
"text": [
775771
"\tGroundtruth\n",
776772
"\tTrue\tFalse\tNEE\n",
777-
"True\t6\t0\t3\t\n",
778-
"False\t0\t3\t0\t\n",
779-
"NEE\t12\t7\t19\t\n"
773+
"True\t9\t0\t1\t\n",
774+
"False\t0\t7\t0\t\n",
775+
"NEE\t8\t5\t20\t\n"
780776
]
781777
},
782778
{
783779
"data": {
784780
"text/plain": [
785-
"{'True': {'True': 6, 'False': 0, 'NEE': 3},\n",
786-
" 'False': {'True': 0, 'False': 3, 'NEE': 0},\n",
787-
" 'NEE': {'True': 12, 'False': 7, 'NEE': 19}}"
781+
"{'True': {'True': 9, 'False': 0, 'NEE': 1},\n",
782+
" 'False': {'True': 0, 'False': 7, 'NEE': 0},\n",
783+
" 'NEE': {'True': 8, 'False': 5, 'NEE': 20}}"
788784
]
789785
},
790786
"execution_count": 19,
@@ -843,19 +839,19 @@
843839
"source": [
844840
"def build_hallucination_prompt(claim):\n",
845841
" return [{'role': 'system', 'content': \"\"\"I will ask you to write an abstract for a scientific paper which supports or refutes a given claim. It should be written in scientific language, include a title. Output only one abstract, then stop.\n",
846-
" \n",
842+
"\n",
847843
" An Example:\n",
848844
"\n",
849845
" Claim:\n",
850846
" A high microerythrocyte count raises vulnerability to severe anemia in homozygous alpha (+)- thalassemia trait subjects.\n",
851847
"\n",
852848
" Abstract:\n",
853-
" BACKGROUND The heritable haemoglobinopathy alpha(+)-thalassaemia is caused by the reduced synthesis of alpha-globin chains that form part of normal adult haemoglobin (Hb). Individuals homozygous for alpha(+)-thalassaemia have microcytosis and an increased erythrocyte count. Alpha(+)-thalassaemia homozygosity confers considerable protection against severe malaria, including severe malarial anaemia (SMA) (Hb concentration < 50 g/l), but does not influence parasite count. We tested the hypothesis that the erythrocyte indices associated with alpha(+)-thalassaemia homozygosity provide a haematological benefit during acute malaria. \n",
854-
" METHODS AND FINDINGS Data from children living on the north coast of Papua New Guinea who had participated in a case-control study of the protection afforded by alpha(+)-thalassaemia against severe malaria were reanalysed to assess the genotype-specific reduction in erythrocyte count and Hb levels associated with acute malarial disease. We observed a reduction in median erythrocyte count of approximately 1.5 x 10(12)/l in all children with acute falciparum malaria relative to values in community children (p < 0.001). We developed a simple mathematical model of the linear relationship between Hb concentration and erythrocyte count. This model predicted that children homozygous for alpha(+)-thalassaemia lose less Hb than children of normal genotype for a reduction in erythrocyte count of >1.1 x 10(12)/l as a result of the reduced mean cell Hb in homozygous alpha(+)-thalassaemia. In addition, children homozygous for alpha(+)-thalassaemia require a 10% greater reduction in erythrocyte count than children of normal genotype (p = 0.02) for Hb concentration to fall to 50 g/l, the cutoff for SMA. We estimated that the haematological profile in children homozygous for alpha(+)-thalassaemia reduces the risk of SMA during acute malaria compared to children of normal genotype (relative risk 0.52; 95% confidence interval [CI] 0.24-1.12, p = 0.09). \n",
849+
" BACKGROUND The heritable haemoglobinopathy alpha(+)-thalassaemia is caused by the reduced synthesis of alpha-globin chains that form part of normal adult haemoglobin (Hb). Individuals homozygous for alpha(+)-thalassaemia have microcytosis and an increased erythrocyte count. Alpha(+)-thalassaemia homozygosity confers considerable protection against severe malaria, including severe malarial anaemia (SMA) (Hb concentration < 50 g/l), but does not influence parasite count. We tested the hypothesis that the erythrocyte indices associated with alpha(+)-thalassaemia homozygosity provide a haematological benefit during acute malaria.\n",
850+
" METHODS AND FINDINGS Data from children living on the north coast of Papua New Guinea who had participated in a case-control study of the protection afforded by alpha(+)-thalassaemia against severe malaria were reanalysed to assess the genotype-specific reduction in erythrocyte count and Hb levels associated with acute malarial disease. We observed a reduction in median erythrocyte count of approximately 1.5 x 10(12)/l in all children with acute falciparum malaria relative to values in community children (p < 0.001). We developed a simple mathematical model of the linear relationship between Hb concentration and erythrocyte count. This model predicted that children homozygous for alpha(+)-thalassaemia lose less Hb than children of normal genotype for a reduction in erythrocyte count of >1.1 x 10(12)/l as a result of the reduced mean cell Hb in homozygous alpha(+)-thalassaemia. In addition, children homozygous for alpha(+)-thalassaemia require a 10% greater reduction in erythrocyte count than children of normal genotype (p = 0.02) for Hb concentration to fall to 50 g/l, the cutoff for SMA. We estimated that the haematological profile in children homozygous for alpha(+)-thalassaemia reduces the risk of SMA during acute malaria compared to children of normal genotype (relative risk 0.52; 95% confidence interval [CI] 0.24-1.12, p = 0.09).\n",
855851
" CONCLUSIONS The increased erythrocyte count and microcytosis in children homozygous for alpha(+)-thalassaemia may contribute substantially to their protection against SMA. A lower concentration of Hb per erythrocyte and a larger population of erythrocytes may be a biologically advantageous strategy against the significant reduction in erythrocyte count that occurs during acute infection with the malaria parasite Plasmodium falciparum. This haematological profile may reduce the risk of anaemia by other Plasmodium species, as well as other causes of anaemia. Other host polymorphisms that induce an increased erythrocyte count and microcytosis may confer a similar advantage.\n",
856852
"\n",
857-
" End of example. \n",
858-
" \n",
853+
" End of example.\n",
854+
"\n",
859855
" \"\"\"}, {'role': 'user', 'content': f\"\"\"\"\n",
860856
" Perform the task for the following claim.\n",
861857
"\n",
@@ -931,17 +927,17 @@
931927
"text": [
932928
"\tGroundtruth\n",
933929
"\tTrue\tFalse\tNEE\n",
934-
"True\t11\t0\t5\t\n",
935-
"False\t0\t8\t1\t\n",
936-
"NEE\t7\t2\t16\t\n"
930+
"True\t13\t0\t3\t\n",
931+
"False\t1\t10\t1\t\n",
932+
"NEE\t3\t2\t17\t\n"
937933
]
938934
},
939935
{
940936
"data": {
941937
"text/plain": [
942-
"{'True': {'True': 11, 'False': 0, 'NEE': 5},\n",
943-
" 'False': {'True': 0, 'False': 8, 'NEE': 1},\n",
944-
" 'NEE': {'True': 7, 'False': 2, 'NEE': 16}}"
938+
"{'True': {'True': 13, 'False': 0, 'NEE': 3},\n",
939+
" 'False': {'True': 1, 'False': 10, 'NEE': 1},\n",
940+
" 'NEE': {'True': 3, 'False': 2, 'NEE': 17}}"
945941
]
946942
},
947943
"execution_count": 23,

registry.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -694,7 +694,7 @@
694694
date: 2025-04-23
695695
authors:
696696
- atroyn
697-
- brandonbaker
697+
- brandonbaker-openai
698698
tags:
699699
- embeddings
700700
- completions

0 commit comments

Comments
 (0)