Skip to content

Commit 5669461

Browse files
committed
Create s1_data_downloader.py
1. combine the step 1 process.
1 parent aacdf83 commit 5669461

File tree

1 file changed

+135
-0
lines changed

1 file changed

+135
-0
lines changed

s1_data_downloader.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
#!/usr/bin/env python3
2+
"""
3+
s1_data_downloader.py
4+
5+
This script downloads YouTube transcripts and videos for video IDs specified in conf.ID.
6+
Transcripts are downloaded first (if not already saved) and then videos are downloaded
7+
(if not already present). Logging is used to provide detailed debugging information.
8+
"""
9+
10+
import os
11+
import time
12+
import logging
13+
from glob import glob
14+
from yt_dlp import YoutubeDL
15+
from youtube_transcript_api import YouTubeTranscriptApi
16+
from youtube_transcript_api import CouldNotRetrieveTranscript, TranscriptsDisabled, NoTranscriptFound, VideoUnavailable, TooManyRequests
17+
from youtube_transcript_api.formatters import JSONFormatter
18+
from tqdm import tqdm
19+
20+
import conf as c # Using 'c' for configuration
21+
22+
# Root logging setup: timestamped records at INFO level and above.
# basicConfig only has an effect if the root logger has no handlers yet.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger shared by every function in this script.
logger = logging.getLogger(__name__)
28+
29+
30+
def get_existing_ids(directory, ext):
    """Collect the base names (without extension) of ``*.<ext>`` files.

    Args:
        directory: Directory to scan (non-recursive).
        ext: File extension to match, given without the leading dot.

    Returns:
        A set of file names with the directory and the final extension
        stripped; empty if nothing matches.
    """
    pattern = os.path.join(directory, f"*.{ext}")
    stems = set()
    for path in glob(pattern):
        filename = os.path.basename(path)
        # splitext drops only the last suffix, so "a.b.json" -> "a.b".
        stem, _suffix = os.path.splitext(filename)
        stems.add(stem)
    return stems
34+
35+
36+
def download_transcripts():
    """Download transcripts for video IDs in conf.ID if not already saved.

    Reads one video ID per non-blank line from the file at ``c.ID``, skips
    IDs that already have a ``<id>.json`` file in ``c.TRANSCRIPT_DIR``, and
    fetches the remaining ones in the languages given by ``c.LANGUAGE``.
    Each transcript is written as JSON to ``c.TRANSCRIPT_DIR/<id>.json``.

    Per-video failures are logged and the loop continues with the next ID.

    Raises:
        OSError: If the ID file at ``c.ID`` cannot be opened, or the
            transcript directory cannot be created.
    """
    os.makedirs(c.TRANSCRIPT_DIR, exist_ok=True)
    existing_ids = get_existing_ids(c.TRANSCRIPT_DIR, "json")

    # Read target video IDs and remove those already downloaded
    with open(c.ID, "r", encoding="utf-8") as f:
        all_ids = {line.strip() for line in f if line.strip()}

    ids = all_ids - existing_ids

    if not ids:
        logger.info("All transcripts are already downloaded.")
        return

    formatter = JSONFormatter()
    sleep_time = 0.2

    # Sorted so that the processing order is reproducible between runs.
    for video_id in tqdm(sorted(ids), desc="Downloading transcripts"):
        time.sleep(sleep_time)  # Rate limiting pause
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=c.LANGUAGE)
            json_transcript = formatter.format_transcript(transcript)
            transcript_path = os.path.join(c.TRANSCRIPT_DIR, f"{video_id}.json")
            with open(transcript_path, "w", encoding="utf-8") as out_file:
                out_file.write(json_transcript)
            logger.info("SUCCESS: Transcript for %s saved.", video_id)
        # The specific handlers must come before CouldNotRetrieveTranscript:
        # it is their common base class, so listing it first would swallow
        # every case below and the back-off would never run.
        except TooManyRequests as e:
            sleep_time += 0.2  # Slightly increase delay on error
            logger.error("Too many requests for %s. Error: %s", video_id, e)
        except TranscriptsDisabled as e:
            logger.error("Transcripts are disabled for %s. Error: %s", video_id, e)
        except NoTranscriptFound as e:
            logger.error("No transcript found for %s in the specified languages. Error: %s", video_id, e)
        except VideoUnavailable as e:
            logger.error("Video %s is unavailable. Error: %s", video_id, e)
        except CouldNotRetrieveTranscript as e:
            logger.error("Could not retrieve transcript for %s. Error: %s", video_id, e)
        except Exception as e:
            # Best-effort batch job: keep going, but keep the traceback.
            logger.exception("An unexpected error occurred for %s. Error: %s", video_id, e)
77+
78+
79+
def process_youtube_video(video_id, download_options):
    """Fetch a single YouTube video through yt-dlp.

    Args:
        video_id: YouTube video ID; inserted into a standard watch URL.
        download_options: Options dict handed to ``YoutubeDL``.

    Any exception raised while downloading is logged and suppressed, so
    this function always returns ``None``.
    """
    watch_url = "https://www.youtube.com/watch?v={}".format(video_id)
    try:
        downloader = YoutubeDL(download_options)
        with downloader:
            downloader.extract_info(watch_url)
        logger.info("SUCCESS: Video %s downloaded.", video_id)
    except Exception as err:
        logger.error("FAILED: Video %s download failed. Error: %s", video_id, err)
88+
89+
90+
def download_videos():
    """Download videos for video IDs specified in conf.ID if not already downloaded.

    Reads one video ID per non-blank line from the file at ``c.ID`` and
    downloads each one that is not yet present, pausing one second before
    every download. Individual download failures are handled (logged) by
    ``process_youtube_video``.

    A video counts as present when ``c.OUTPUT_DIR`` holds ``<id>.mp4`` or
    ``c.VIDEO_DIR`` holds ``<id>.<any extension>``.

    Raises:
        OSError: If the ID file at ``c.ID`` cannot be opened, or a target
            directory cannot be created.
    """
    os.makedirs(c.OUTPUT_DIR, exist_ok=True)
    os.makedirs(c.VIDEO_DIR, exist_ok=True)

    # download_options["outtmpl"] writes into c.VIDEO_DIR, so that directory
    # must be checked too; looking only at c.OUTPUT_DIR re-downloads every
    # video whenever the two directories differ. Any extension is accepted
    # there because the format selector does not pin the container type.
    # Partial files such as "<id>.mp4.part" yield the stem "<id>.mp4" and
    # therefore do not count as finished.
    existing_ids = get_existing_ids(c.OUTPUT_DIR, "mp4") | get_existing_ids(c.VIDEO_DIR, "*")

    with open(c.ID, "r", encoding="utf-8") as f:
        all_ids = {line.strip() for line in f if line.strip()}

    ids = all_ids - existing_ids

    if not ids:
        logger.info("All videos are already downloaded.")
        return

    # Sorted so that the processing order is reproducible between runs.
    for video_id in sorted(ids):
        time.sleep(1)  # Rate limiting pause
        process_youtube_video(video_id, download_options)
104+
105+
106+
# Global YouTube download configuration.
#
# Keys must be YoutubeDL *API* parameter names (underscored), not CLI flag
# spellings: YoutubeDL silently ignores keys it does not know, so the former
# dashed entries ("concurrent-fragments", "http-chunk-size", "limit_rate", ...)
# never took effect.
download_options = {
    "format": "worstvideo[height>=720]/bestvideo[height<=480]",
    "writesubtitles": False,
    "outtmpl": os.path.join(c.VIDEO_DIR, "%(id)s.%(ext)s"),
    "nocheckcertificate": True,
    "noplaylist": True,
    # Replaces "no-metadata-json" / "no-metadata": do not write an info JSON.
    # NOTE(review): "no-metadata" has no direct API counterpart - confirm
    # that no metadata-embedding postprocessor is expected here.
    "writeinfojson": False,
    "concurrent_fragment_downloads": 5,
    # Replaces "hls-prefer-ffmpeg": False asks for ffmpeg over the native HLS
    # downloader. NOTE(review): yt-dlp marks this option as deprecated in
    # favour of "external_downloader" - confirm against the pinned version.
    "hls_prefer_native": False,
    "http_chunk_size": 10485760,  # 10MB chunks
    "sleep_interval": 0,
    "geo_bypass": True,
    # Replaces "limit_rate": "5M"; the API expects a number in bytes/second.
    "ratelimit": 5 * 1024 * 1024,
}
122+
123+
124+
def main():
    """Run the transcript stage and then the video stage, logging around each."""
    stages = (
        (
            "Starting transcript download...",
            download_transcripts,
            "Transcript download completed.\n",
        ),
        (
            "Starting video download...",
            download_videos,
            "Video download completed.",
        ),
    )
    for start_message, run_stage, done_message in stages:
        logger.info(start_message)
        run_stage()
        logger.info(done_message)


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)