Skip to content

Commit 50a6c59

Browse files
committed
Update youtube-dl v2021.01.03
1 parent 47ef04c commit 50a6c59

File tree

13 files changed

+949
-426
lines changed

13 files changed

+949
-426
lines changed

youtube_dl/extractor/acast.py

Lines changed: 53 additions & 63 deletions
Original file line number | Diff line number | Diff line change
@@ -2,21 +2,47 @@
22
from __future__ import unicode_literals
33

44
import re
5-
import functools
65

76
from .common import InfoExtractor
8-
from ..compat import compat_str
97
from ..utils import (
108
clean_html,
11-
float_or_none,
129
int_or_none,
13-
try_get,
14-
unified_timestamp,
15-
OnDemandPagedList,
10+
parse_iso8601,
1611
)
1712

1813

19-
class ACastIE(InfoExtractor):
14+
class ACastBaseIE(InfoExtractor):
    """Helpers shared by the acast episode and channel extractors."""

    def _extract_episode(self, episode, show_info):
        """Build the info dict for a single episode.

        ``episode`` is the episode JSON object; ``title``, ``id`` and ``url``
        are mandatory (a missing key raises ``KeyError``), everything else is
        optional.  ``show_info`` is merged in last, so its keys override any
        episode-level keys of the same name.
        """
        episode_title = episode['title']
        # Prefer the full description, fall back to the short summary.
        raw_description = episode.get('description') or episode.get('summary')
        result = {
            'id': episode['id'],
            'display_id': episode.get('episodeUrl'),
            'url': episode['url'],
            'title': episode_title,
            'description': clean_html(raw_description),
            'thumbnail': episode.get('image'),
            'timestamp': parse_iso8601(episode.get('publishDate')),
        }
        # Optional numeric fields that map one-to-one onto a JSON key.
        numeric_fields = (
            ('duration', 'duration'),
            ('filesize', 'contentLength'),
            ('season_number', 'season'),
        )
        for field, json_key in numeric_fields:
            result[field] = int_or_none(episode.get(json_key))
        result['episode'] = episode_title
        result['episode_number'] = int_or_none(episode.get('episode'))
        result.update(show_info)
        return result

    def _extract_show_info(self, show):
        """Return the show-level metadata (creator and series name)."""
        creator = show.get('author')
        series = show.get('title')
        return {
            'creator': creator,
            'series': series,
        }

    def _call_api(self, path, video_id, query=None):
        """Download JSON for ``path`` below the feeder.acast.com shows API."""
        api_url = 'https://feeder.acast.com/api/v1/shows/' + path
        return self._download_json(api_url, video_id, query=query)
43+
44+
45+
class ACastIE(ACastBaseIE):
2046
IE_NAME = 'acast'
2147
_VALID_URL = r'''(?x)
2248
https?://
@@ -28,15 +54,15 @@ class ACastIE(InfoExtractor):
2854
'''
2955
_TESTS = [{
3056
'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna',
31-
'md5': '16d936099ec5ca2d5869e3a813ee8dc4',
57+
'md5': 'f5598f3ad1e4776fed12ec1407153e4b',
3258
'info_dict': {
3359
'id': '2a92b283-1a75-4ad8-8396-499c641de0d9',
3460
'ext': 'mp3',
3561
'title': '2. Raggarmordet - Röster ur det förflutna',
36-
'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4',
62+
'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67',
3763
'timestamp': 1477346700,
3864
'upload_date': '20161024',
39-
'duration': 2766.602563,
65+
'duration': 2766,
4066
'creator': 'Anton Berg & Martin Johnson',
4167
'series': 'Spår',
4268
'episode': '2. Raggarmordet - Röster ur det förflutna',
@@ -45,7 +71,7 @@ class ACastIE(InfoExtractor):
4571
'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015',
4672
'only_matching': True,
4773
}, {
48-
'url': 'https://play.acast.com/s/rattegangspodden/s04e09-styckmordet-i-helenelund-del-22',
74+
'url': 'https://play.acast.com/s/rattegangspodden/s04e09styckmordetihelenelund-del2-2',
4975
'only_matching': True,
5076
}, {
5177
'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9',
@@ -54,40 +80,14 @@ class ACastIE(InfoExtractor):
5480

5581
def _real_extract(self, url):
5682
channel, display_id = re.match(self._VALID_URL, url).groups()
57-
s = self._download_json(
58-
'https://feeder.acast.com/api/v1/shows/%s/episodes/%s' % (channel, display_id),
59-
display_id)
60-
media_url = s['url']
61-
if re.search(r'[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}', display_id):
62-
episode_url = s.get('episodeUrl')
63-
if episode_url:
64-
display_id = episode_url
65-
else:
66-
channel, display_id = re.match(self._VALID_URL, s['link']).groups()
67-
cast_data = self._download_json(
68-
'https://play-api.acast.com/splash/%s/%s' % (channel, display_id),
69-
display_id)['result']
70-
e = cast_data['episode']
71-
title = e.get('name') or s['title']
72-
return {
73-
'id': compat_str(e['id']),
74-
'display_id': display_id,
75-
'url': media_url,
76-
'title': title,
77-
'description': e.get('summary') or clean_html(e.get('description') or s.get('description')),
78-
'thumbnail': e.get('image'),
79-
'timestamp': unified_timestamp(e.get('publishingDate') or s.get('publishDate')),
80-
'duration': float_or_none(e.get('duration') or s.get('duration')),
81-
'filesize': int_or_none(e.get('contentLength')),
82-
'creator': try_get(cast_data, lambda x: x['show']['author'], compat_str),
83-
'series': try_get(cast_data, lambda x: x['show']['name'], compat_str),
84-
'season_number': int_or_none(e.get('seasonNumber')),
85-
'episode': title,
86-
'episode_number': int_or_none(e.get('episodeNumber')),
87-
}
83+
episode = self._call_api(
84+
'%s/episodes/%s' % (channel, display_id),
85+
display_id, {'showInfo': 'true'})
86+
return self._extract_episode(
87+
episode, self._extract_show_info(episode.get('show') or {}))
8888

8989

90-
class ACastChannelIE(InfoExtractor):
90+
class ACastChannelIE(ACastBaseIE):
9191
IE_NAME = 'acast:channel'
9292
_VALID_URL = r'''(?x)
9393
https?://
@@ -102,34 +102,24 @@ class ACastChannelIE(InfoExtractor):
102102
'info_dict': {
103103
'id': '4efc5294-5385-4847-98bd-519799ce5786',
104104
'title': 'Today in Focus',
105-
'description': 'md5:9ba5564de5ce897faeb12963f4537a64',
105+
'description': 'md5:c09ce28c91002ce4ffce71d6504abaae',
106106
},
107-
'playlist_mincount': 35,
107+
'playlist_mincount': 200,
108108
}, {
109109
'url': 'http://play.acast.com/s/ft-banking-weekly',
110110
'only_matching': True,
111111
}]
112-
_API_BASE_URL = 'https://play.acast.com/api/'
113-
_PAGE_SIZE = 10
114112

115113
@classmethod
116114
def suitable(cls, url):
117115
return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url)
118116

119-
def _fetch_page(self, channel_slug, page):
120-
casts = self._download_json(
121-
self._API_BASE_URL + 'channels/%s/acasts?page=%s' % (channel_slug, page),
122-
channel_slug, note='Download page %d of channel data' % page)
123-
for cast in casts:
124-
yield self.url_result(
125-
'https://play.acast.com/s/%s/%s' % (channel_slug, cast['url']),
126-
'ACast', cast['id'])
127-
128117
def _real_extract(self, url):
129-
channel_slug = self._match_id(url)
130-
channel_data = self._download_json(
131-
self._API_BASE_URL + 'channels/%s' % channel_slug, channel_slug)
132-
entries = OnDemandPagedList(functools.partial(
133-
self._fetch_page, channel_slug), self._PAGE_SIZE)
134-
return self.playlist_result(entries, compat_str(
135-
channel_data['id']), channel_data['name'], channel_data.get('description'))
118+
show_slug = self._match_id(url)
119+
show = self._call_api(show_slug, show_slug)
120+
show_info = self._extract_show_info(show)
121+
entries = []
122+
for episode in (show.get('episodes') or []):
123+
entries.append(self._extract_episode(episode, show_info))
124+
return self.playlist_result(
125+
entries, show.get('id'), show.get('title'), show.get('description'))

youtube_dl/extractor/arcpublishing.py

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
# coding: utf-8
2+
from __future__ import unicode_literals
3+
4+
import re
5+
6+
from .common import InfoExtractor
7+
from ..utils import (
8+
extract_attributes,
9+
int_or_none,
10+
parse_iso8601,
11+
try_get,
12+
)
13+
14+
15+
class ArcPublishingIE(InfoExtractor):
    """Extractor for videos addressed as ``arcpublishing:<org>:<uuid>``.

    The internal URL scheme is produced by ``_extract_urls`` from "powa"
    player elements embedded in a web page and resolved against the
    organisation's Arc video API in ``_real_extract``.
    """

    _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}'
    _VALID_URL = r'arcpublishing:(?P<org>[a-z]+):(?P<id>%s)' % _UUID_REGEX
    _TESTS = [{
        # https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/
        'url': 'arcpublishing:adn:8c99cb6e-b29c-4bc9-9173-7bf9979225ab',
        'only_matching': True,
    }, {
        # https://www.bostonglobe.com/video/2020/12/30/metro/footage-released-showing-officer-talking-about-striking-protesters-with-car/
        'url': 'arcpublishing:bostonglobe:232b7ae6-7d73-432d-bc0a-85dbf0119ab1',
        'only_matching': True,
    }, {
        # https://www.actionnewsjax.com/video/live-stream/
        'url': 'arcpublishing:cmg:cfb1cf1b-3ab5-4d1b-86c5-a5515d311f2a',
        'only_matching': True,
    }, {
        # https://elcomercio.pe/videos/deportes/deporte-total-futbol-peruano-seleccion-peruana-la-valorizacion-de-los-peruanos-en-el-exterior-tras-un-2020-atipico-nnav-vr-video-noticia/
        'url': 'arcpublishing:elcomercio:27a7e1f8-2ec7-4177-874f-a4feed2885b3',
        'only_matching': True,
    }, {
        # https://www.clickondetroit.com/video/community/2020/05/15/events-surrounding-woodward-dream-cruise-being-canceled/
        'url': 'arcpublishing:gmg:c8793fb2-8d44-4242-881e-2db31da2d9fe',
        'only_matching': True,
    }, {
        # https://www.wabi.tv/video/2020/12/30/trenton-company-making-equipment-pfizer-covid-vaccine/
        'url': 'arcpublishing:gray:0b0ba30e-032a-4598-8810-901d70e6033e',
        'only_matching': True,
    }, {
        # https://www.lateja.cr/el-mundo/video-china-aprueba-con-condiciones-su-primera/dfcbfa57-527f-45ff-a69b-35fe71054143/video/
        'url': 'arcpublishing:gruponacion:dfcbfa57-527f-45ff-a69b-35fe71054143',
        'only_matching': True,
    }, {
        # https://www.fifthdomain.com/video/2018/03/09/is-america-vulnerable-to-a-cyber-attack/
        'url': 'arcpublishing:mco:aa0ca6fe-1127-46d4-b32c-be0d6fdb8055',
        'only_matching': True,
    }, {
        # https://www.vl.no/kultur/2020/12/09/en-melding-fra-en-lytter-endret-julelista-til-lewi-bergrud/
        'url': 'arcpublishing:mentormedier:47a12084-650b-4011-bfd0-3699b6947b2d',
        'only_matching': True,
    }, {
        # https://www.14news.com/2020/12/30/whiskey-theft-caught-camera-henderson-liquor-store/
        'url': 'arcpublishing:raycom:b89f61f8-79fa-4c09-8255-e64237119bf7',
        'only_matching': True,
    }, {
        # https://www.theglobeandmail.com/world/video-ethiopian-woman-who-became-symbol-of-integration-in-italy-killed-on/
        'url': 'arcpublishing:tgam:411b34c1-8701-4036-9831-26964711664b',
        'only_matching': True,
    }, {
        # https://www.pilotonline.com/460f2931-8130-4719-8ea1-ffcb2d7cb685-132.html
        'url': 'arcpublishing:tronc:460f2931-8130-4719-8ea1-ffcb2d7cb685',
        'only_matching': True,
    }]
    # (organisations, API host template) pairs; organisations not listed here
    # fall back to the default template used in _real_extract.
    _POWA_DEFAULTS = [
        (['cmg', 'prisa'], '%s-config-prod.api.cdn.arcpublishing.com/video'),
        ([
            'adn', 'advancelocal', 'answers', 'bonnier', 'bostonglobe', 'demo',
            'gmg', 'gruponacion', 'infobae', 'mco', 'nzme', 'pmn', 'raycom',
            'spectator', 'tbt', 'tgam', 'tronc', 'wapo', 'wweek',
        ], 'video-api-cdn.%s.arcpublishing.com/api'),
    ]

    @staticmethod
    def _extract_urls(webpage):
        """Return ``arcpublishing:<org>:<uuid>`` URLs for powa players in ``webpage``.

        Only ``<div>`` elements whose class list contains ``powa`` and that
        carry both ``data-org`` and ``data-uuid`` attributes are reported.
        """
        entries = []
        # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview
        powa_re = (
            r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)'
            % ArcPublishingIE._UUID_REGEX)
        for powa_el in re.findall(powa_re, webpage):
            powa = extract_attributes(powa_el) or {}
            org = powa.get('data-org')
            uuid = powa.get('data-uuid')
            if org and uuid:
                entries.append('arcpublishing:%s:%s' % (org, uuid))
        return entries

    def _real_extract(self, url):
        """Resolve an ``arcpublishing:`` URL into an info dict with formats."""
        org, uuid = re.match(self._VALID_URL, url).groups()
        for orgs, tmpl in self._POWA_DEFAULTS:
            if org in orgs:
                base_api_tmpl = tmpl
                break
        else:
            base_api_tmpl = '%s-prod-cdn.video-api.arcpublishing.com/api'
        # 'wapo' selects its template above but is substituted as 'washpost'.
        if org == 'wapo':
            org = 'washpost'
        videos = self._download_json(
            'https://%s/v1/ansvideos/findByUuid' % (base_api_tmpl % org),
            uuid, query={'uuid': uuid})
        if not videos:
            # An empty result would otherwise surface as a bare IndexError.
            from ..utils import ExtractorError
            raise ExtractorError(
                'No video found for UUID %s' % uuid, expected=True)
        video = videos[0]
        title = video['headlines']['basic']
        is_live = video.get('status') == 'live'

        urls = []  # stream URLs already handled, to skip duplicates
        formats = []
        # `or []` also covers an explicit null "streams" value.
        for s in video.get('streams') or []:
            s_url = s.get('url')
            if not s_url or s_url in urls:
                continue
            urls.append(s_url)
            stream_type = s.get('stream_type')
            if stream_type == 'smil':
                smil_formats = self._extract_smil_formats(
                    s_url, uuid, fatal=False)
                for f in smil_formats:
                    if f['url'].endswith('/cfx/st'):
                        f['app'] = 'cfx/st'
                        # Tolerate formats lacking play_path/tbr instead of
                        # raising KeyError.
                        play_path = f.get('play_path')
                        if play_path and not play_path.startswith('mp4:'):
                            f['play_path'] = 'mp4:' + play_path
                        if isinstance(f.get('tbr'), float):
                            f['vbr'] = f['tbr'] * 1000
                            del f['tbr']
                            f['format_id'] = 'rtmp-%d' % f['vbr']
                formats.extend(smil_formats)
            elif stream_type in ('ts', 'hls'):
                m3u8_formats = self._extract_m3u8_formats(
                    s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native',
                    m3u8_id='hls', fatal=False)
                # Skip playlists with no audio at all (this also skips an
                # empty format list, as all() of nothing is True).
                if all(f.get('acodec') == 'none' for f in m3u8_formats):
                    continue
                for f in m3u8_formats:
                    if f.get('acodec') == 'none':
                        f['preference'] = -40
                    elif f.get('vcodec') == 'none':
                        f['preference'] = -50
                    height = f.get('height')
                    if not height:
                        continue
                    # The number following the height in the URL is taken
                    # as the video bitrate.
                    vbr = self._search_regex(
                        r'[_x]%d[_-](\d+)' % height, f['url'], 'vbr', default=None)
                    if vbr:
                        f['vbr'] = int(vbr)
                formats.extend(m3u8_formats)
            else:
                vbr = int_or_none(s.get('bitrate'))
                formats.append({
                    'format_id': '%s-%d' % (stream_type, vbr) if vbr else stream_type,
                    'vbr': vbr,
                    'width': int_or_none(s.get('width')),
                    'height': int_or_none(s.get('height')),
                    'filesize': int_or_none(s.get('filesize')),
                    'url': s_url,
                    'preference': -1,
                })
        self._sort_formats(
            formats, ('preference', 'width', 'height', 'vbr', 'filesize', 'tbr', 'ext', 'format_id'))

        subtitles = {}
        # All subtitle tracks are filed under 'en'.
        for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []):
            subtitle_url = subtitle.get('url')
            if subtitle_url:
                subtitles.setdefault('en', []).append({'url': subtitle_url})

        return {
            'id': uuid,
            'title': self._live_title(title) if is_live else title,
            'thumbnail': try_get(video, lambda x: x['promo_image']['url']),
            'description': try_get(video, lambda x: x['subheadlines']['basic']),
            'formats': formats,
            # NOTE(review): 100 is presumably int_or_none's scale divisor;
            # confirm the unit of the API's "duration" field.
            'duration': int_or_none(video.get('duration'), 100),
            'timestamp': parse_iso8601(video.get('created_date')),
            'subtitles': subtitles,
            'is_live': is_live,
        }

youtube_dl/extractor/common.py

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2135,17 +2135,15 @@ def extract_common(source):
21352135
def extract_Initialization(source):
21362136
initialization = source.find(_add_ns('Initialization'))
21372137
if initialization is not None:
2138-
ms_info['initialization_url'] = initialization.get('sourceURL')
2138+
ms_info['initialization_url'] = initialization.attrib['sourceURL']
21392139

21402140
segment_list = element.find(_add_ns('SegmentList'))
21412141
if segment_list is not None:
21422142
extract_common(segment_list)
21432143
extract_Initialization(segment_list)
21442144
segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
21452145
if segment_urls_e:
2146-
segment_urls = [segment.get('media') for segment in segment_urls_e if segment.get('media') is not None]
2147-
if len(segment_urls) > 0:
2148-
ms_info['segment_urls'] = segment_urls
2146+
ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
21492147
else:
21502148
segment_template = element.find(_add_ns('SegmentTemplate'))
21512149
if segment_template is not None:

youtube_dl/extractor/extractors.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,7 @@
5656
AppleTrailersSectionIE,
5757
)
5858
from .archiveorg import ArchiveOrgIE
59+
from .arcpublishing import ArcPublishingIE
5960
from .arkena import ArkenaIE
6061
from .ard import (
6162
ARDBetaMediathekIE,
@@ -789,6 +790,7 @@
789790
NRKSkoleIE,
790791
NRKTVIE,
791792
NRKTVDirekteIE,
793+
NRKRadioPodkastIE,
792794
NRKTVEpisodeIE,
793795
NRKTVEpisodesIE,
794796
NRKTVSeasonIE,
@@ -1057,6 +1059,7 @@
10571059
from .sky import (
10581060
SkyNewsIE,
10591061
SkySportsIE,
1062+
SkySportsNewsIE,
10601063
)
10611064
from .slideshare import SlideshareIE
10621065
from .slideslive import SlidesLiveIE

0 commit comments

Comments
 (0)