@@ -44,47 +44,59 @@ def get_existing_ids(directory, ext):
4444 return {os .path .splitext (os .path .basename (f ))[0 ] for f in files }
4545
4646
def load_video_ids(file_path):
    """Read the set of video IDs listed in a text file, one ID per line.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        set: The whitespace-stripped, non-empty lines of the file. Blank
        lines are skipped and duplicate IDs collapse into one entry.
    """
    # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also drops a
    # leading byte-order mark. str.strip() does not treat U+FEFF as
    # whitespace, so with plain "utf-8" a BOM would stay glued to the first ID.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        stripped_lines = (line.strip() for line in f)
        return {video_id for video_id in stripped_lines if video_id}
50+
51+
def download_single_transcript(video_id, formatter, sleep_time):
    """Download a single transcript for a video ID and save it as JSON.

    The transcript is first written to a ``.part`` staging file and renamed
    to ``<video_id>.json`` only after the write has completed, so a failed
    write does not leave a truncated ``.json`` file in ``c.TRANSCRIPT_DIR``.

    Args:
        video_id: ID of the video whose transcript is requested.
        formatter: Object whose ``format_transcript(transcript)`` returns the
            text that is written to disk.
        sleep_time: Delay in seconds the caller currently waits between
            requests.

    Returns:
        tuple: ``(success, sleep_time)``. ``success`` is True only when the
        transcript file was saved. ``sleep_time`` is the value passed in,
        increased by 0.1 when the API reported too many requests.
    """
    transcript_path = os.path.join(c.TRANSCRIPT_DIR, f"{video_id}.json")
    # NOTE(review): assumes get_existing_ids(..., "json") ignores "*.json.part"
    # files - confirm against its file pattern.
    partial_path = f"{transcript_path}.part"
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=c.LANGUAGE)
        json_transcript = formatter.format_transcript(transcript)
        with open(partial_path, "w", encoding="utf-8") as out_file:
            out_file.write(json_transcript)
        # Publish the finished file under its final name in a single step.
        os.replace(partial_path, transcript_path)
    except TooManyRequests as e:
        sleep_time += 0.1  # Slightly increase delay on error
        logger.error("Too many requests for %s. Error: %s", video_id, e)
        return False, sleep_time
    except Exception as e:
        logger.error("An unexpected error occurred for %s. Error: %s", video_id, e)
        return False, sleep_time
    else:
        logger.info("SUCCESS: Transcript for %s saved.", video_id)
        return True, sleep_time
    finally:
        # After a successful os.replace the staging file no longer exists;
        # anything still present is left over from a failed attempt.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError as cleanup_error:
                logger.warning(
                    "Could not remove partial file %s. Error: %s",
                    partial_path,
                    cleanup_error,
                )
69+
70+
def download_transcripts():
    """Fetch transcripts for every ID listed in conf.ID that is not saved yet.

    IDs that already have a ``.json`` file in ``c.TRANSCRIPT_DIR`` are
    skipped. The remaining IDs are downloaded one at a time behind a progress
    bar whose postfix shows the running number of failed downloads. The pause
    between requests starts at 0.2 seconds, follows the value returned by
    ``download_single_transcript``, and is capped at 2 seconds.
    """
    os.makedirs(c.TRANSCRIPT_DIR, exist_ok=True)

    already_saved = get_existing_ids(c.TRANSCRIPT_DIR, "json")
    pending = list(load_video_ids(c.ID).difference(already_saved))
    if not pending:
        logger.info("All transcripts are already downloaded.")
        return

    formatter = JSONFormatter()
    delay = 0.2
    failures = 0

    # The progress bar wraps the pending IDs and reports failures as we go.
    with tqdm(pending, desc="Downloading transcripts") as progress:
        for video_id in progress:
            # Cap the pause so repeated rate-limit errors cannot grow it
            # without bound.
            delay = min(delay, 2)
            time.sleep(delay)  # Rate limiting pause

            ok, delay = download_single_transcript(video_id, formatter, delay)
            failures += 0 if ok else 1

            progress.set_postfix(errors=failures)
88100
89101
90102def download_single_video (video_id , download_options ):
@@ -94,26 +106,27 @@ def download_single_video(video_id, download_options):
94106 with YoutubeDL (download_options ) as yt :
95107 yt .extract_info (video_url )
96108 logger .info ("SUCCESS: Video %s downloaded." , video_id )
97- except DownloadError as e :
98- logger .error ("Download error for video %s. Error: %s" , video_id , e )
99- except ExtractorError as e :
100- logger .error ("Extractor error for video %s. Error: %s" , video_id , e )
101- except PostProcessingError as e :
102- logger .error ("Post-processing error for video %s. Error: %s" , video_id , e )
103- except UnavailableVideoError as e :
104- logger .error ("Video %s is unavailable. Error: %s" , video_id , e )
109+ return True
110+ except (
111+ DownloadError ,
112+ ExtractorError ,
113+ PostProcessingError ,
114+ UnavailableVideoError ,
115+ ) as e :
116+ logger .error ("Error downloading video %s. Error: %s" , video_id , e )
117+ return False
105118 except Exception as e :
106119 logger .error ("An unexpected error occurred for %s. Error: %s" , video_id , e )
120+ return False
107121
108122
109123def download_videos ():
110124 """Download videos for video IDs specified in conf.ID if not already downloaded."""
125+ os .makedirs (c .OUTPUT_DIR , exist_ok = True )
111126 os .makedirs (c .VIDEO_DIR , exist_ok = True )
112- existing_ids = get_existing_ids (c .VIDEO_DIR , "mp4" )
113-
114- with open (c .ID , "r" , encoding = "utf-8" ) as f :
115- all_ids = {line .strip () for line in f if line .strip ()}
127+ existing_ids = get_existing_ids (c .OUTPUT_DIR , "mp4" )
116128
129+ all_ids = load_video_ids (c .ID )
117130 ids = list (all_ids - existing_ids )
118131
119132 if not ids :
@@ -124,8 +137,9 @@ def download_videos():
124137 # Use tqdm progress bar to show progress
125138 with tqdm (ids , desc = "Downloading videos" , unit = "video" ) as pbar :
126139 for video_id in pbar :
127- time .sleep (1 ) # Rate limiting pause
128- if not download_single_video (video_id , c .YT_CONFIG ):
140+ time .sleep (0.2 ) # Rate limiting pause
141+ success = download_single_video (video_id , c .YT_CONFIG )
142+ if not success :
129143 error_count += 1
130144 pbar .set_postfix (errors = error_count )
131145
0 commit comments