Skip to content

Commit a3e9d98

Browse files
committed
ver25.02.22
1. Rename the video download function in s1_data_downloader.py to make its purpose clear. 2. Move the YouTube download config to conf.py.
1 parent e0cf7df commit a3e9d98

File tree

2 files changed

+51
-30
lines changed

2 files changed

+51
-30
lines changed

conf.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,22 @@
11
import os
22

3+
# YouTube download configuration
4+
YT_CONFIG = {
5+
"format": "worstvideo[height>=720]/bestvideo[height<=480]",
6+
"writesubtitles": False,
7+
"outtmpl": os.path.join(c.VIDEO_DIR, "%(id)s.%(ext)s"),
8+
"nocheckcertificate": True,
9+
"noplaylist": True,
10+
"no-metadata-json": True,
11+
"no-metadata": True,
12+
"concurrent-fragments": 5,
13+
"hls-prefer-ffmpeg": True,
14+
"http-chunk-size": 10485760, # 10MB chunks
15+
"sleep-interval": 0,
16+
"geo-bypass": True,
17+
"limit_rate": "5M",
18+
}
19+
320
# the number of frames to skip when extracting frames from a video
421
FRAME_SKIP = 2
522

s1_data_downloader.py

Lines changed: 34 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,19 @@
1212
import logging
1313
from glob import glob
1414
from yt_dlp import YoutubeDL
15-
from yt_dlp.utils import DownloadError, ExtractorError, PostProcessingError, UnavailableVideoError
15+
from yt_dlp.utils import (
16+
DownloadError,
17+
ExtractorError,
18+
PostProcessingError,
19+
UnavailableVideoError,
20+
)
1621
from youtube_transcript_api import YouTubeTranscriptApi
17-
from youtube_transcript_api import TranscriptsDisabled, NoTranscriptFound, VideoUnavailable, TooManyRequests
22+
from youtube_transcript_api import (
23+
TranscriptsDisabled,
24+
NoTranscriptFound,
25+
VideoUnavailable,
26+
TooManyRequests,
27+
)
1828
from youtube_transcript_api.formatters import JSONFormatter
1929
from tqdm import tqdm
2030

@@ -56,7 +66,9 @@ def download_transcripts():
5666
for video_id in tqdm(list(ids), desc="Downloading transcripts"):
5767
try:
5868
time.sleep(sleep_time) # Rate limiting pause
59-
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=c.LANGUAGE)
69+
transcript = YouTubeTranscriptApi.get_transcript(
70+
video_id, languages=c.LANGUAGE
71+
)
6072
json_transcript = formatter.format_transcript(transcript)
6173
transcript_path = os.path.join(c.TRANSCRIPT_DIR, f"{video_id}.json")
6274
with open(transcript_path, "w", encoding="utf-8") as out_file:
@@ -65,7 +77,7 @@ def download_transcripts():
6577
except TranscriptsDisabled as e:
6678
logger.error("Transcripts are disabled for %s. Error: %s", video_id, e)
6779
except NoTranscriptFound as e:
68-
logger.error("No transcript found for %s in the specified languages. Error: %s", video_id, e)
80+
logger.error("No transcript %s in specified langs. Error: %s", video_id, e)
6981
except VideoUnavailable as e:
7082
logger.error("Video %s is unavailable. Error: %s", video_id, e)
7183
except TooManyRequests as e:
@@ -75,7 +87,7 @@ def download_transcripts():
7587
logger.error("An unexpected error occurred for %s. Error: %s", video_id, e)
7688

7789

78-
def process_youtube_video(video_id, download_options):
90+
def download_single_video(video_id, download_options):
7991
"""Download a YouTube video using specified options."""
8092
video_url = f"https://www.youtube.com/watch?v={video_id}"
8193
try:
@@ -91,41 +103,33 @@ def process_youtube_video(video_id, download_options):
91103
except UnavailableVideoError as e:
92104
logger.error("Video %s is unavailable. Error: %s", video_id, e)
93105
except Exception as e:
94-
logger.error("An unexpected error occurred for video %s. Error: %s", video_id, e)
106+
logger.error("An unexpected error occurred for %s. Error: %s", video_id, e)
95107

96108

97109
def download_videos():
98110
"""Download videos for video IDs specified in conf.ID if not already downloaded."""
99-
os.makedirs(c.OUTPUT_DIR, exist_ok=True)
100111
os.makedirs(c.VIDEO_DIR, exist_ok=True)
101-
existing_ids = get_existing_ids(c.OUTPUT_DIR, "mp4")
112+
existing_ids = get_existing_ids(c.VIDEO_DIR, "mp4")
102113

103114
with open(c.ID, "r", encoding="utf-8") as f:
104115
all_ids = {line.strip() for line in f if line.strip()}
105116

106-
ids = all_ids - existing_ids
117+
ids = list(all_ids - existing_ids)
118+
119+
if not ids:
120+
logger.info("All videos have already been downloaded.")
121+
return
122+
123+
error_count = 0
124+
# Use tqdm progress bar to show progress
125+
with tqdm(ids, desc="Downloading videos", unit="video") as pbar:
126+
for video_id in pbar:
127+
time.sleep(1) # Rate limiting pause
128+
if not download_single_video(video_id, c.YT_CONFIG):
129+
error_count += 1
130+
pbar.set_postfix(errors=error_count)
107131

108-
for video_id in ids:
109-
time.sleep(1) # Rate limiting pause
110-
process_youtube_video(video_id, download_options)
111-
112-
113-
# Global YouTube download configuration
114-
download_options = {
115-
"format": "worstvideo[height>=720]/bestvideo[height<=480]",
116-
"writesubtitles": False,
117-
"outtmpl": os.path.join(c.VIDEO_DIR, "%(id)s.%(ext)s"),
118-
"nocheckcertificate": True,
119-
"noplaylist": True,
120-
"no-metadata-json": True,
121-
"no-metadata": True,
122-
"concurrent-fragments": 5,
123-
"hls-prefer-ffmpeg": True,
124-
"http-chunk-size": 10485760, # 10MB chunks
125-
"sleep-interval": 0,
126-
"geo-bypass": True,
127-
"limit_rate": "5M",
128-
}
132+
# Summary line: the format string has two %d placeholders, so pass both the
# number of videos attempted in this run and the number that failed.
logger.info(
    "Video download completed: Total %d, Errors %d.", len(ids), error_count
)
129133

130134

131135
def main():

0 commit comments

Comments
 (0)