# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    extract_attributes,
    int_or_none,
    parse_iso8601,
    try_get,
)


class ArcPublishingIE(InfoExtractor):
    """Extractor for internal ``arcpublishing:<org>:<uuid>`` URLs.

    The URL names an organisation and a video UUID; the video's metadata
    and streams are fetched from that organisation's video API host (see
    ``_POWA_DEFAULTS``) through the ``/v1/ansvideos/findByUuid`` endpoint.
    """

    _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}'
    _VALID_URL = r'arcpublishing:(?P<org>[a-z]+):(?P<id>%s)' % _UUID_REGEX
    _TESTS = [{
        # https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/
        'url': 'arcpublishing:adn:8c99cb6e-b29c-4bc9-9173-7bf9979225ab',
        'only_matching': True,
    }, {
        # https://www.bostonglobe.com/video/2020/12/30/metro/footage-released-showing-officer-talking-about-striking-protesters-with-car/
        'url': 'arcpublishing:bostonglobe:232b7ae6-7d73-432d-bc0a-85dbf0119ab1',
        'only_matching': True,
    }, {
        # https://www.actionnewsjax.com/video/live-stream/
        'url': 'arcpublishing:cmg:cfb1cf1b-3ab5-4d1b-86c5-a5515d311f2a',
        'only_matching': True,
    }, {
        # https://elcomercio.pe/videos/deportes/deporte-total-futbol-peruano-seleccion-peruana-la-valorizacion-de-los-peruanos-en-el-exterior-tras-un-2020-atipico-nnav-vr-video-noticia/
        'url': 'arcpublishing:elcomercio:27a7e1f8-2ec7-4177-874f-a4feed2885b3',
        'only_matching': True,
    }, {
        # https://www.clickondetroit.com/video/community/2020/05/15/events-surrounding-woodward-dream-cruise-being-canceled/
        'url': 'arcpublishing:gmg:c8793fb2-8d44-4242-881e-2db31da2d9fe',
        'only_matching': True,
    }, {
        # https://www.wabi.tv/video/2020/12/30/trenton-company-making-equipment-pfizer-covid-vaccine/
        'url': 'arcpublishing:gray:0b0ba30e-032a-4598-8810-901d70e6033e',
        'only_matching': True,
    }, {
        # https://www.lateja.cr/el-mundo/video-china-aprueba-con-condiciones-su-primera/dfcbfa57-527f-45ff-a69b-35fe71054143/video/
        'url': 'arcpublishing:gruponacion:dfcbfa57-527f-45ff-a69b-35fe71054143',
        'only_matching': True,
    }, {
        # https://www.fifthdomain.com/video/2018/03/09/is-america-vulnerable-to-a-cyber-attack/
        'url': 'arcpublishing:mco:aa0ca6fe-1127-46d4-b32c-be0d6fdb8055',
        'only_matching': True,
    }, {
        # https://www.vl.no/kultur/2020/12/09/en-melding-fra-en-lytter-endret-julelista-til-lewi-bergrud/
        'url': 'arcpublishing:mentormedier:47a12084-650b-4011-bfd0-3699b6947b2d',
        'only_matching': True,
    }, {
        # https://www.14news.com/2020/12/30/whiskey-theft-caught-camera-henderson-liquor-store/
        'url': 'arcpublishing:raycom:b89f61f8-79fa-4c09-8255-e64237119bf7',
        'only_matching': True,
    }, {
        # https://www.theglobeandmail.com/world/video-ethiopian-woman-who-became-symbol-of-integration-in-italy-killed-on/
        'url': 'arcpublishing:tgam:411b34c1-8701-4036-9831-26964711664b',
        'only_matching': True,
    }, {
        # https://www.pilotonline.com/460f2931-8130-4719-8ea1-ffcb2d7cb685-132.html
        'url': 'arcpublishing:tronc:460f2931-8130-4719-8ea1-ffcb2d7cb685',
        'only_matching': True,
    }]
    # Pairs of (organisation ids, API host template). The template is filled
    # with the organisation id; organisations not listed here fall back to
    # the default template chosen in _real_extract.
    _POWA_DEFAULTS = [
        (['cmg', 'prisa'], '%s-config-prod.api.cdn.arcpublishing.com/video'),
        ([
            'adn', 'advancelocal', 'answers', 'bonnier', 'bostonglobe', 'demo',
            'gmg', 'gruponacion', 'infobae', 'mco', 'nzme', 'pmn', 'raycom',
            'spectator', 'tbt', 'tgam', 'tronc', 'wapo', 'wweek',
        ], 'video-api-cdn.%s.arcpublishing.com/api'),
    ]

    @staticmethod
    def _extract_urls(webpage):
        """Return ``arcpublishing:<org>:<uuid>`` URLs for players in *webpage*.

        A player is a ``<div>`` whose ``class`` contains the word ``powa``
        and which carries a ``data-uuid`` attribute; elements lacking a
        ``data-org`` attribute are skipped.
        """
        entries = []
        # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview
        for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage):
            powa = extract_attributes(powa_el) or {}
            org = powa.get('data-org')
            uuid = powa.get('data-uuid')
            if org and uuid:
                entries.append('arcpublishing:%s:%s' % (org, uuid))
        return entries

    def _real_extract(self, url):
        """Fetch the video record for *url* and build its info dict.

        The first element of the ``findByUuid`` JSON response is used as
        the video record. Its ``streams`` are turned into formats according
        to their ``stream_type`` (``smil``, ``ts``/``hls``, or anything else
        as a direct URL), and ``subtitles.urls`` entries are collected
        under the ``en`` key.
        """
        org, uuid = self._match_valid_url(url).groups()
        for orgs, tmpl in self._POWA_DEFAULTS:
            if org in orgs:
                base_api_tmpl = tmpl
                break
        else:
            base_api_tmpl = '%s-prod-cdn.video-api.arcpublishing.com/api'
        # The 'wapo' organisation id is substituted into the host template
        # as 'washpost'.
        if org == 'wapo':
            org = 'washpost'
        video = self._download_json(
            'https://%s/v1/ansvideos/findByUuid' % (base_api_tmpl % org),
            uuid, query={'uuid': uuid})[0]
        title = video['headlines']['basic']
        is_live = video.get('status') == 'live'

        seen_urls = set()
        formats = []
        # `or []` also covers an explicit null value for "streams".
        for s in video.get('streams') or []:
            s_url = s.get('url')
            if not s_url or s_url in seen_urls:
                continue
            seen_urls.add(s_url)
            stream_type = s.get('stream_type')
            if stream_type == 'smil':
                smil_formats = self._extract_smil_formats(
                    s_url, uuid, fatal=False)
                for f in smil_formats:
                    if not f['url'].endswith('/cfx/st'):
                        continue
                    f['app'] = 'cfx/st'
                    # Guarded lookups: a format without these keys is left
                    # untouched instead of raising KeyError.
                    play_path = f.get('play_path')
                    if play_path is not None and not play_path.startswith('mp4:'):
                        f['play_path'] = 'mp4:' + play_path
                    tbr = f.get('tbr')
                    if isinstance(tbr, float):
                        # A float tbr is moved to vbr, multiplied by 1000.
                        f['vbr'] = tbr * 1000
                        del f['tbr']
                        f['format_id'] = 'rtmp-%d' % f['vbr']
                formats.extend(smil_formats)
            elif stream_type in ('ts', 'hls'):
                m3u8_formats = self._extract_m3u8_formats(
                    s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native',
                    m3u8_id='hls', fatal=False)
                # Skip playlists in which every format reports no audio
                # codec; this also skips an empty result.
                if all(f.get('acodec') == 'none' for f in m3u8_formats):
                    continue
                for f in m3u8_formats:
                    height = f.get('height')
                    if not height:
                        continue
                    # Take the number that follows the height in the format
                    # URL as the video bitrate.
                    vbr = self._search_regex(
                        r'[_x]%d[_-](\d+)' % height, f['url'], 'vbr', default=None)
                    if vbr:
                        f['vbr'] = int(vbr)
                formats.extend(m3u8_formats)
            else:
                vbr = int_or_none(s.get('bitrate'))
                formats.append({
                    'format_id': '%s-%d' % (stream_type, vbr) if vbr else stream_type,
                    'vbr': vbr,
                    'width': int_or_none(s.get('width')),
                    'height': int_or_none(s.get('height')),
                    'filesize': int_or_none(s.get('filesize')),
                    'url': s_url,
                    # NOTE(review): presumably ranks direct streams below the
                    # HLS/SMIL formats - confirm against the format sorter.
                    'quality': -10,
                })
        self._sort_formats(formats)

        subtitles = {}
        for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []):
            subtitle_url = subtitle.get('url')
            if subtitle_url:
                subtitles.setdefault('en', []).append({'url': subtitle_url})

        return {
            'id': uuid,
            'title': title,
            'thumbnail': try_get(video, lambda x: x['promo_image']['url']),
            'description': try_get(video, lambda x: x['subheadlines']['basic']),
            'formats': formats,
            # NOTE(review): the scale of 100 implies the API reports duration
            # in 1/100 s units - confirm against the Arc ANS video schema.
            'duration': int_or_none(video.get('duration'), 100),
            'timestamp': parse_iso8601(video.get('created_date')),
            'subtitles': subtitles,
            'is_live': is_live,
        }