Skip to content

Commit 64960f9

Browse files
committed
first and last commit
0 parents  commit 64960f9

File tree

10 files changed

+1251
-0
lines changed

10 files changed

+1251
-0
lines changed

.gitignore

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Python-generated files
2+
__pycache__/
3+
*.py[oc]
4+
build/
5+
dist/
6+
wheels/
7+
*.egg-info
8+
9+
# Virtual environments
10+
.venv
11+
12+
# Custom
13+
*_data/
14+
*.epub

.python-version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.10

README.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# reader 3
2+
3+
![reader3](reader3.png)
4+
5+
A lightweight, self-hosted EPUB reader that lets you read through EPUB books one chapter at a time. This makes it very easy to copy paste the contents of a chapter to an LLM, to read along. Basically - get epub books (e.g. [Project Gutenberg](https://www.gutenberg.org/) has many), open them up in this reader, copy paste text around to your favorite LLM, and read together and along.
6+
7+
This project was 90% vibe coded just to illustrate how one can very easily [read books together with LLMs](https://x.com/karpathy/status/1990577951671509438). I'm not going to support it in any way, it's provided here as is for other people's inspiration and I don't intend to improve it. Code is ephemeral now and libraries are over, ask your LLM to change it in whatever way you like.
8+
9+
## Usage
10+
11+
The project uses [uv](https://docs.astral.sh/uv/). So for example, download [Dracula EPUB3](https://www.gutenberg.org/ebooks/345) to this directory as `dracula.epub`, then:
12+
13+
```bash
14+
uv run reader3.py dracula.epub
15+
```
16+
17+
This creates the directory `dracula_data`, which registers the book to your local library. We can then run the server:
18+
19+
```bash
20+
uv run server.py
21+
```
22+
23+
And visit [localhost:8123](http://localhost:8123/) to see your current Library. You can easily add more books, or delete them from your library by deleting the folder. It's not supposed to be complicated or complex.
24+
25+
## License
26+
27+
MIT

pyproject.toml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[project]
2+
name = "reader3"
3+
version = "0.1.0"
4+
description = "Simple EPUB reader web app"
5+
readme = "README.md"
6+
requires-python = ">=3.10"
7+
dependencies = [
8+
"beautifulsoup4>=4.14.2",
9+
"ebooklib>=0.20",
10+
"fastapi>=0.121.2",
11+
"jinja2>=3.1.6",
12+
"uvicorn>=0.38.0",
13+
]

reader3.png

233 KB
Loading

reader3.py

Lines changed: 313 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,313 @@
1+
"""
2+
Parses an EPUB file into a structured object that can be used to serve the book via a web interface.
3+
"""
4+
5+
import os
6+
import pickle
7+
import shutil
8+
from dataclasses import dataclass, field
9+
from typing import List, Dict, Optional, Any
10+
from datetime import datetime
11+
from urllib.parse import unquote
12+
13+
import ebooklib
14+
from ebooklib import epub
15+
from bs4 import BeautifulSoup, Comment
16+
17+
# --- Data structures ---
18+
19+
@dataclass
class ChapterContent:
    """
    Represents a physical file in the EPUB (Spine Item).

    A single file might contain multiple logical chapters (TOC entries);
    TOC entries point into these files via `href` (plus an optional anchor).
    """
    id: str       # Internal ID (e.g., 'item_1')
    href: str     # Filename (e.g., 'part01.html') — links TOC entries to content
    title: str    # Best guess title from file (fallback "Section N"; real titles come from TOC)
    content: str  # Cleaned HTML with rewritten image paths
    text: str     # Plain text for search/LLM context
    order: int    # Linear reading order (index within the spine)
31+
32+
33+
@dataclass
class TOCEntry:
    """Represents a logical entry in the navigation sidebar.

    Entries may nest arbitrarily deep via `children`.
    """
    title: str
    href: str       # original href (e.g., 'part01.html#chapter1')
    file_href: str  # just the filename (e.g., 'part01.html')
    anchor: str     # just the anchor (e.g., 'chapter1'), empty if none
    children: List['TOCEntry'] = field(default_factory=list)
41+
42+
43+
@dataclass
class BookMetadata:
    """Book metadata pulled from the EPUB's Dublin Core ('DC') fields."""
    title: str
    language: str  # e.g. 'en'; defaults to 'en' when the EPUB omits it
    authors: List[str] = field(default_factory=list)  # DC 'creator' entries
    description: Optional[str] = None
    publisher: Optional[str] = None
    date: Optional[str] = None  # stored as the raw DC 'date' string
    identifiers: List[str] = field(default_factory=list)  # DC 'identifier' entries
    subjects: List[str] = field(default_factory=list)     # DC 'subject' entries
54+
55+
56+
@dataclass
class Book:
    """The Master Object to be pickled (written out by save_to_pickle)."""
    metadata: BookMetadata
    spine: List[ChapterContent]  # The actual content (linear files)
    toc: List[TOCEntry]          # The navigation tree
    images: Dict[str, str]       # Map: original_path -> local_path

    # Meta info
    source_file: str   # basename of the source .epub file
    processed_at: str  # ISO-8601 timestamp of when processing ran
    version: str = "3.0"  # version tag stamped into every pickle
68+
69+
70+
# --- Utilities ---
71+
72+
def clean_html_content(soup: BeautifulSoup) -> BeautifulSoup:
    """Strip scripts, styles, interactive elements, and comments from parsed HTML.

    Mutates *soup* in place and returns it for convenience.
    """
    # Tags that are unsafe or useless in a static reading view.
    unwanted_tags = ['script', 'style', 'iframe', 'video', 'nav', 'form', 'button']
    for node in soup(unwanted_tags):
        node.decompose()

    # Drop HTML comments.
    for node in soup.find_all(string=lambda t: isinstance(t, Comment)):
        node.extract()

    # Form inputs serve no purpose once forms are gone.
    for node in soup.find_all('input'):
        node.decompose()

    return soup
87+
88+
89+
def extract_plain_text(soup: BeautifulSoup) -> str:
    """Extract clean text for LLM/Search usage."""
    raw = soup.get_text(separator=' ')
    # Collapse every run of whitespace down to a single space.
    return ' '.join(raw.split())
94+
95+
96+
def _split_href(href):
    """Split 'file.html#anchor' into (file_href, anchor); anchor is '' if absent.

    Matches the original str.split('#') semantics exactly, including the
    (invalid) multi-'#' case where the second segment is taken.
    """
    parts = href.split('#')
    return parts[0], (parts[1] if len(parts) > 1 else "")


def parse_toc_recursive(toc_list, depth=0) -> List[TOCEntry]:
    """
    Recursively parses the TOC structure from ebooklib.

    Args:
        toc_list: ebooklib TOC items — each is either a `Link`, a bare
            `Section` (ebooklib sometimes returns these without children),
            or a `(Section, [children])` tuple.
        depth: current recursion depth (informational only).

    Returns:
        A tree of TOCEntry objects mirroring the navigation structure.
        Items of any other type are silently skipped.
    """
    result = []

    for item in toc_list:
        if isinstance(item, tuple):
            # (Section, [children]) pair: build the entry, then recurse.
            section, children = item
            file_href, anchor = _split_href(section.href)
            result.append(TOCEntry(
                title=section.title,
                href=section.href,
                file_href=file_href,
                anchor=anchor,
                children=parse_toc_recursive(children, depth + 1),
            ))
        elif isinstance(item, (epub.Link, epub.Section)):
            # Leaf entry. Link and Section both expose .title and .href,
            # so the two previously-duplicated branches are merged here.
            file_href, anchor = _split_href(item.href)
            result.append(TOCEntry(
                title=item.title,
                href=item.href,
                file_href=file_href,
                anchor=anchor,
            ))

    return result
133+
134+
135+
def get_fallback_toc(book_obj) -> List[TOCEntry]:
    """
    If TOC is missing, build a flat one from the Spine.
    """
    entries = []
    for doc in book_obj.get_items():
        if doc.get_type() != ebooklib.ITEM_DOCUMENT:
            continue
        href = doc.get_name()
        # Derive a human-readable title from the filename.
        guessed_title = doc.get_name().replace('.html', '').replace('.xhtml', '').replace('_', ' ').title()
        entries.append(TOCEntry(title=guessed_title, href=href, file_href=href, anchor=""))
    return entries
147+
148+
149+
def extract_metadata_robust(book_obj) -> BookMetadata:
    """
    Extracts metadata handling both single and list values.
    """
    def all_values(key):
        # DC metadata entries come back as (value, attributes) tuples.
        entries = book_obj.get_metadata('DC', key)
        return [entry[0] for entry in entries] if entries else []

    def first_value(key):
        entries = book_obj.get_metadata('DC', key)
        return entries[0][0] if entries else None

    return BookMetadata(
        title=first_value('title') or "Untitled",
        language=first_value('language') or "en",
        authors=all_values('creator'),
        description=first_value('description'),
        publisher=first_value('publisher'),
        date=first_value('date'),
        identifiers=all_values('identifier'),
        subjects=all_values('subject'),
    )
171+
172+
173+
# --- Main Conversion Logic ---
174+
175+
def process_epub(epub_path: str, output_dir: str) -> Book:
    """Parse an EPUB into a Book object and extract its images to disk.

    Destructive: if `output_dir` already exists it is deleted and rebuilt.
    Images are written to `<output_dir>/images/`; chapter HTML is held
    in memory on the returned Book (pickled separately by the caller).
    """

    # 1. Load Book
    print(f"Loading {epub_path}...")
    book = epub.read_epub(epub_path)

    # 2. Extract Metadata
    metadata = extract_metadata_robust(book)

    # 3. Prepare Output Directories (wipe any previous run's output)
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    images_dir = os.path.join(output_dir, 'images')
    os.makedirs(images_dir, exist_ok=True)

    # 4. Extract Images & Build Map
    print("Extracting images...")
    image_map = {} # Key: internal_path, Value: local_relative_path

    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_IMAGE:
            # Normalize filename
            original_fname = os.path.basename(item.get_name())
            # Sanitize filename for OS (keep only alnum, '.', '_', '-')
            # NOTE(review): two images in different EPUB folders that share a
            # basename will sanitize to the same file and overwrite each
            # other here — confirm acceptable for typical books.
            safe_fname = "".join([c for c in original_fname if c.isalpha() or c.isdigit() or c in '._-']).strip()

            # Save to disk
            local_path = os.path.join(images_dir, safe_fname)
            with open(local_path, 'wb') as f:
                f.write(item.get_content())

            # Map keys: We try both the full internal path and just the basename
            # to be robust against messy HTML src attributes
            rel_path = f"images/{safe_fname}"
            image_map[item.get_name()] = rel_path
            image_map[original_fname] = rel_path

    # 5. Process TOC (fall back to a flat spine-derived TOC when empty)
    print("Parsing Table of Contents...")
    toc_structure = parse_toc_recursive(book.toc)
    if not toc_structure:
        print("Warning: Empty TOC, building fallback from Spine...")
        toc_structure = get_fallback_toc(book)

    # 6. Process Content (Spine-based to preserve HTML validity)
    print("Processing chapters...")
    spine_chapters = []

    # We iterate over the spine (linear reading order)
    for i, spine_item in enumerate(book.spine):
        # spine entries unpack to (id, linear); `linear` is unused here
        item_id, linear = spine_item
        item = book.get_item_with_id(item_id)

        if not item:
            continue

        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            # Raw content (lenient decode: malformed bytes are dropped)
            raw_content = item.get_content().decode('utf-8', errors='ignore')
            soup = BeautifulSoup(raw_content, 'html.parser')

            # A. Fix Images: rewrite src attributes to the local copies
            for img in soup.find_all('img'):
                src = img.get('src', '')
                if not src: continue

                # Decode URL (part01/image%201.jpg -> part01/image 1.jpg)
                src_decoded = unquote(src)
                filename = os.path.basename(src_decoded)

                # Try to find in map: full path first, then basename
                if src_decoded in image_map:
                    img['src'] = image_map[src_decoded]
                elif filename in image_map:
                    img['src'] = image_map[filename]

            # B. Clean HTML (strip scripts/styles/forms/comments in place)
            soup = clean_html_content(soup)

            # C. Extract Body Content only
            body = soup.find('body')
            if body:
                # Extract inner HTML of body (serve-able fragment, no <html> wrapper)
                final_html = "".join([str(x) for x in body.contents])
            else:
                # No <body> tag: keep whatever we parsed
                final_html = str(soup)

            # D. Create Object
            chapter = ChapterContent(
                id=item_id,
                href=item.get_name(), # Important: This links TOC to Content
                title=f"Section {i+1}", # Fallback, real titles come from TOC
                content=final_html,
                text=extract_plain_text(soup),
                order=i
            )
            spine_chapters.append(chapter)

    # 7. Final Assembly
    final_book = Book(
        metadata=metadata,
        spine=spine_chapters,
        toc=toc_structure,
        images=image_map,
        source_file=os.path.basename(epub_path),
        processed_at=datetime.now().isoformat()
    )

    return final_book
284+
285+
286+
def save_to_pickle(book: Book, output_dir: str):
    """Serialize the processed Book into <output_dir>/book.pkl."""
    pickle_path = os.path.join(output_dir, 'book.pkl')
    with open(pickle_path, 'wb') as fh:
        pickle.dump(book, fh)
    print(f"Saved structured data to {pickle_path}")
291+
292+
293+
# --- CLI ---
294+
295+
if __name__ == "__main__":

    import sys
    if len(sys.argv) < 2:
        print("Usage: python reader3.py <file.epub>")
        sys.exit(1)

    epub_file = sys.argv[1]
    # Validate with an explicit check rather than `assert`: asserts are
    # stripped when Python runs with -O, which would silently skip this.
    if not os.path.exists(epub_file):
        print("File not found.")
        sys.exit(1)
    # Library convention: book "foo.epub" lives in directory "foo_data".
    out_dir = os.path.splitext(epub_file)[0] + "_data"

    book_obj = process_epub(epub_file, out_dir)
    save_to_pickle(book_obj, out_dir)
    print("\n--- Summary ---")
    print(f"Title: {book_obj.metadata.title}")
    print(f"Authors: {', '.join(book_obj.metadata.authors)}")
    print(f"Physical Files (Spine): {len(book_obj.spine)}")
    print(f"TOC Root Items: {len(book_obj.toc)}")
    print(f"Images extracted: {len(book_obj.images)}")

0 commit comments

Comments
 (0)