Skip to content

Commit 7fed579

Browse files
committed
ver25.02.25
1. Make the data downloader clearer and more readable.
1 parent a3e9d98 commit 7fed579

File tree

1 file changed

+53
-39
lines changed

1 file changed

+53
-39
lines changed

s1_data_downloader.py

Lines changed: 53 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -44,47 +44,59 @@ def get_existing_ids(directory, ext):
4444
return {os.path.splitext(os.path.basename(f))[0] for f in files}
4545

4646

47+
def load_video_ids(file_path):
    """Read video IDs from a text file, one ID per line.

    Surrounding whitespace is stripped from every line, blank lines are
    skipped, and duplicate lines collapse because the result is a set.

    Args:
        file_path: Path to the text file listing the video IDs.

    Returns:
        A set containing the non-empty, stripped lines of the file.
    """
    # "utf-8-sig" decodes plain UTF-8 and additionally drops a leading
    # byte-order mark. str.strip() does not remove U+FEFF, so with plain
    # "utf-8" a BOM would stay glued to the first ID in the file.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        stripped_lines = (line.strip() for line in f)
        return {video_id for video_id in stripped_lines if video_id}
50+
51+
52+
def download_single_transcript(video_id, formatter, sleep_time):
    """Download a single transcript for a video ID and save it as JSON.

    The transcript is requested in the languages given by ``c.LANGUAGE``,
    rendered with ``formatter`` and stored as ``<video_id>.json`` inside
    ``c.TRANSCRIPT_DIR``. Errors are logged and reported through the return
    value; exceptions derived from ``Exception`` are not propagated.

    Args:
        video_id: ID of the video whose transcript is requested.
        formatter: Object whose ``format_transcript`` method turns the
            fetched transcript into the text written to disk.
        sleep_time: Delay value currently used by the caller between
            requests.

    Returns:
        A ``(success, sleep_time)`` tuple. ``success`` is True only when the
        transcript file was saved. ``sleep_time`` comes back unchanged,
        except after ``TooManyRequests``, where it is increased by 0.1.
    """
    try:
        transcript = YouTubeTranscriptApi.get_transcript(
            video_id, languages=c.LANGUAGE
        )
        json_transcript = formatter.format_transcript(transcript)
        transcript_path = os.path.join(c.TRANSCRIPT_DIR, f"{video_id}.json")
        # Write to a sibling temp file and rename it into place, so an
        # interrupted write never leaves a truncated "<id>.json" behind.
        # download_transcripts skips every ID that get_existing_ids reports
        # for this directory, so a partial file would not be retried.
        # NOTE(review): assumes get_existing_ids matches on the ".json"
        # suffix only, so a leftover ".json.tmp" is not counted -- confirm.
        tmp_path = f"{transcript_path}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as out_file:
            out_file.write(json_transcript)
        os.replace(tmp_path, transcript_path)
        logger.info("SUCCESS: Transcript for %s saved.", video_id)
        return True, sleep_time
    except TooManyRequests as e:
        sleep_time += 0.1  # Slightly increase delay on error
        logger.error("Too many requests for %s. Error: %s", video_id, e)
        return False, sleep_time
    except Exception as e:
        logger.error("An unexpected error occurred for %s. Error: %s", video_id, e)
        return False, sleep_time
69+
70+
4771
def download_transcripts():
4872
"""Download transcripts for video IDs in conf.ID if not already saved."""
4973
os.makedirs(c.TRANSCRIPT_DIR, exist_ok=True)
5074
existing_ids = get_existing_ids(c.TRANSCRIPT_DIR, "json")
5175

52-
# Read target video IDs and remove those already downloaded
53-
with open(c.ID, "r", encoding="utf-8") as f:
54-
all_ids = {line.strip() for line in f if line.strip()}
55-
56-
ids = all_ids - existing_ids
76+
all_ids = load_video_ids(c.ID)
77+
ids = list(all_ids - existing_ids)
5778

5879
if not ids:
5980
logger.info("All transcripts are already downloaded.")
6081
return
6182

6283
formatter = JSONFormatter()
6384
sleep_time = 0.2
85+
error_count = 0
6486

6587
# Use a progress bar to show download progress
66-
for video_id in tqdm(list(ids), desc="Downloading transcripts"):
67-
try:
88+
with tqdm(ids, desc="Downloading transcripts") as pbar:
89+
for video_id in pbar:
90+
sleep_time = min(sleep_time, 2)
6891
time.sleep(sleep_time) # Rate limiting pause
69-
transcript = YouTubeTranscriptApi.get_transcript(
70-
video_id, languages=c.LANGUAGE
92+
success, sleep_time = download_single_transcript(
93+
video_id, formatter, sleep_time
7194
)
72-
json_transcript = formatter.format_transcript(transcript)
73-
transcript_path = os.path.join(c.TRANSCRIPT_DIR, f"{video_id}.json")
74-
with open(transcript_path, "w", encoding="utf-8") as out_file:
75-
out_file.write(json_transcript)
76-
logger.info("SUCCESS: Transcript for %s saved.", video_id)
77-
except TranscriptsDisabled as e:
78-
logger.error("Transcripts are disabled for %s. Error: %s", video_id, e)
79-
except NoTranscriptFound as e:
80-
logger.error("No transcript %s in specified langs. Error: %s", video_id, e)
81-
except VideoUnavailable as e:
82-
logger.error("Video %s is unavailable. Error: %s", video_id, e)
83-
except TooManyRequests as e:
84-
sleep_time += 0.2 # Slightly increase delay on error
85-
logger.error("Too many requests for %s. Error: %s", video_id, e)
86-
except Exception as e:
87-
logger.error("An unexpected error occurred for %s. Error: %s", video_id, e)
95+
96+
if not success:
97+
error_count += 1
98+
99+
pbar.set_postfix(errors=error_count)
88100

89101

90102
def download_single_video(video_id, download_options):
@@ -94,26 +106,27 @@ def download_single_video(video_id, download_options):
94106
with YoutubeDL(download_options) as yt:
95107
yt.extract_info(video_url)
96108
logger.info("SUCCESS: Video %s downloaded.", video_id)
97-
except DownloadError as e:
98-
logger.error("Download error for video %s. Error: %s", video_id, e)
99-
except ExtractorError as e:
100-
logger.error("Extractor error for video %s. Error: %s", video_id, e)
101-
except PostProcessingError as e:
102-
logger.error("Post-processing error for video %s. Error: %s", video_id, e)
103-
except UnavailableVideoError as e:
104-
logger.error("Video %s is unavailable. Error: %s", video_id, e)
109+
return True
110+
except (
111+
DownloadError,
112+
ExtractorError,
113+
PostProcessingError,
114+
UnavailableVideoError,
115+
) as e:
116+
logger.error("Error downloading video %s. Error: %s", video_id, e)
117+
return False
105118
except Exception as e:
106119
logger.error("An unexpected error occurred for %s. Error: %s", video_id, e)
120+
return False
107121

108122

109123
def download_videos():
110124
"""Download videos for video IDs specified in conf.ID if not already downloaded."""
125+
os.makedirs(c.OUTPUT_DIR, exist_ok=True)
111126
os.makedirs(c.VIDEO_DIR, exist_ok=True)
112-
existing_ids = get_existing_ids(c.VIDEO_DIR, "mp4")
113-
114-
with open(c.ID, "r", encoding="utf-8") as f:
115-
all_ids = {line.strip() for line in f if line.strip()}
127+
existing_ids = get_existing_ids(c.OUTPUT_DIR, "mp4")
116128

129+
all_ids = load_video_ids(c.ID)
117130
ids = list(all_ids - existing_ids)
118131

119132
if not ids:
@@ -124,8 +137,9 @@ def download_videos():
124137
# Use tqdm progress bar to show progress
125138
with tqdm(ids, desc="Downloading videos", unit="video") as pbar:
126139
for video_id in pbar:
127-
time.sleep(1) # Rate limiting pause
128-
if not download_single_video(video_id, c.YT_CONFIG):
140+
time.sleep(0.2) # Rate limiting pause
141+
success = download_single_video(video_id, c.YT_CONFIG)
142+
if not success:
129143
error_count += 1
130144
pbar.set_postfix(errors=error_count)
131145

0 commit comments

Comments
 (0)