Fix bbc.co.uk playlist support #23438

djcsdy commented Dec 17, 2019

This PR fixes two problems:

* Downloading playlists from BBC iPlayer did not work at all due to the page layout changing completely since the extractor was written.
* Downloading playlists from other parts of bbc.co.uk was partially broken – only the first page would be downloaded due to some whitespace being inserted between elements since the extractor was written.

Example URLs:

    https://www.bbc.co.uk/programmes/b00mfl7n/clips (All 142 clips should be downloaded, not just the first 24)
    https://www.bbc.co.uk/programmes/b00mfl7n/clips?page=3 (Only the 24 clips on the specified page should be downloaded)
    https://www.bbc.co.uk/iplayer/episodes/b07xd230/josh (All 18 episodes should be downloaded)
    https://www.bbc.co.uk/iplayer/episodes/b07xd230/josh?seriesId=p047lvcs (Only the six episodes of season 2 should be downloaded)
    https://www.bbc.co.uk/iplayer/episodes/b06kw5fq/scot-squad (Should download whatever episodes are currently available, time sensitive)

I pushed an additional change that fixes a crash when downloading shows that have multiple pages of episodes but only a single season, for example: https://www.bbc.co.uk/iplayer/episodes/b05qzmgd/louis-theroux

Fixes:
    https://github.com/ytdl-org/youtube-dl/issues/23438
    https://github.com/ytdl-org/youtube-dl/issues/23270
This commit is contained in:
df 2020-10-17 00:15:54 +00:00
parent d65d89183f
commit 133008c73e
1 changed file with 69 additions and 31 deletions

View File

@@ -1247,31 +1247,14 @@ class BBCCoUkArticleIE(InfoExtractor):
class BBCCoUkPlaylistBaseIE(InfoExtractor):
def _entries(self, webpage, url, playlist_id):
    """Yield url_results for every video in the playlist, following pagination.

    If the request URL already pins a page (a ``?page=`` query parameter),
    only the videos on that page are yielded; otherwise "next" links are
    followed until the last page.
    """
    single_page = 'page' in compat_urlparse.parse_qs(
        compat_urlparse.urlparse(url).query)
    for page_num in itertools.count(2):
        for video_id in re.findall(
                self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
            yield self.url_result(
                self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
        if single_page:
            return
        # \s* between the <li> and the <a>: the site now emits whitespace
        # there, which made the old pattern miss the "next" link and stop
        # after the first page.
        next_page = self._search_regex(
            r'<li[^>]+class=(["\'])pagination_+next\1[^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
            webpage, 'next page url', default=None, group='url')
        if not next_page:
            break
        webpage = self._download_webpage(
            compat_urlparse.urljoin(url, next_page), playlist_id,
            'Downloading page %d' % page_num, page_num)
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
title, description = self._extract_title_and_description(webpage)
title = self._og_search_title(webpage, fatal=False)
description = self._og_search_description(webpage)
return self.playlist_result(
self._entries(webpage, url, playlist_id),
@@ -1282,7 +1265,6 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
IE_NAME = 'bbc.co.uk:iplayer:playlist'
_VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
_URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
_VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
_TESTS = [{
'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
'info_dict': {
@@ -1303,13 +1285,55 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
'playlist_mincount': 10,
}]
def _extract_title_and_description(self, webpage):
    """Scrape the playlist title (page <h1>) and description (subtitle <p>)."""
    return (
        self._search_regex(
            r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False),
        self._search_regex(
            r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
            webpage, 'description', fatal=False, group='value'),
    )
def _entries(self, webpage, url, playlist_id):
    """Yield episode url_results, walking every page of every season.

    iPlayer episode listings embed their data in a JSON "redux state"
    blob in the page.  Each season ("slice") is paginated separately;
    a ``?seriesId=`` query pins a single season and ``?page=`` pins a
    single page, mirroring what the user selected in the browser.
    """
    query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
    single_season = 'seriesId' in query
    single_page = 'page' in query
    redux_state = self._redux_state(webpage, playlist_id)
    # Shows with only one season expose no slices; fall back to an empty
    # list so the season loop terminates after the first pass instead of
    # crashing (see the Louis Theroux example in the PR description).
    slices = redux_state.get('header', {}).get('availableSlices') or []
    season_ids = [s.get('id') for s in slices]
    for season in itertools.count(1):
        while True:
            # Be defensive about partially-populated state: a missing
            # "pagination" object is treated as a single page rather than
            # raising AttributeError/TypeError.
            pagination = redux_state.get('pagination') or {}
            page_num = pagination.get('currentPage') or 1
            total_pages = pagination.get('totalPages') or 1
            for entity in redux_state.get('entities') or []:
                video_id = entity.get('id')
                yield self.url_result(
                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
            if single_page or page_num >= total_pages:
                break
            next_page_num = page_num + 1
            page_url_template = pagination.get('pageUrl') or '?page=%s'
            next_page_href = page_url_template % next_page_num
            url = compat_urlparse.urljoin(url, next_page_href)
            webpage = self._download_webpage(
                url, playlist_id,
                'Downloading season %d page %d' % (season, next_page_num),
                'season %d page %d' % (season, next_page_num))
            redux_state = self._redux_state(webpage, playlist_id)
        if single_season or season >= len(season_ids):
            break
        # `season` is 1-based, so season_ids[season] is the *next* season.
        next_season_id = season_ids[season]
        url = compat_urlparse.urljoin(url, '?seriesId=' + next_season_id)
        webpage = self._download_webpage(
            url, playlist_id,
            'Downloading season %d page 1' % (season + 1),
            'season %d page 1' % (season + 1))
        redux_state = self._redux_state(webpage, playlist_id)
def _redux_state(self, webpage, playlist_id):
    """Extract and parse the ``window.__IPLAYER_REDUX_STATE__`` JSON blob.

    Returns the parsed state dict; HTML entities in the embedded JSON are
    unescaped before parsing.
    """
    # Escape the dot so the pattern matches only a literal "window." —
    # an unescaped "." matches any character.
    redux_state_regex = r'<script[^>]*>\s*window\.__IPLAYER_REDUX_STATE__\s*=\s*(.*?);?\s*</script>'
    redux_state_json = self._search_regex(
        redux_state_regex, webpage, 'redux_state')
    return self._parse_json(
        redux_state_json, playlist_id, transform_source=unescapeHTML)
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
IE_NAME = 'bbc.co.uk:playlist'
@@ -1353,7 +1377,21 @@ class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
'only_matching': True,
}]
def _extract_title_and_description(self, webpage):
    """Return (title, description) taken from the page's Open Graph metadata."""
    og_title = self._og_search_title(webpage, fatal=False)
    og_description = self._og_search_description(webpage)
    return og_title, og_description
def _entries(self, webpage, url, playlist_id):
    """Yield url_results for each video, following "next" pagination links.

    When the request URL already pins a page (``?page=`` query parameter),
    only that single page is emitted.
    """
    parsed_query = compat_urlparse.urlparse(url).query
    single_page = 'page' in compat_urlparse.parse_qs(parsed_query)
    page_num = 1
    while True:
        page_num += 1
        video_ids = re.findall(
            self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage)
        for video_id in video_ids:
            yield self.url_result(
                self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
        if single_page:
            return
        # Optional whitespace is tolerated between the <li> and its <a>.
        next_page = self._search_regex(
            r'<li[^>]+class=(["\'])pagination_+next\1[^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
            webpage, 'next page url', default=None, group='url')
        if not next_page:
            break
        webpage = self._download_webpage(
            compat_urlparse.urljoin(url, next_page), playlist_id,
            'Downloading page %d' % page_num, page_num)