Allow for page_suitable() extraction in tests

Support Invidious YT front-end with any host name
* implement page-based detection and redirection using ytdlie://
* add support for playlists and channels
* support extracting videos from further pages
2021-09-11 10:36:05 +01:00 · 2021-09-11 09:05:10 +01:00
5 changed files with 242 additions and 92 deletions
--- a/test/helper.py
+++ b/test/helper.py
@ -89,8 +89,10 @@ class FakeYDL(YoutubeDL):
        self.report_warning = types.MethodType(report_warning, self)
-def gettestcases(include_onlymatching=False):
+def gettestcases(include_onlymatching=False, include_pagesuitable=True):
    for ie in youtube_dl.extractor.gen_extractors():
        if not include_pagesuitable and callable(getattr(ie, 'page_suitable', None)):
            continue
        for tc in ie.get_testcases(include_onlymatching):
            yield tc
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@ -76,7 +76,7 @@ class TestAllURLsMatching(unittest.TestCase):
    def test_no_duplicates(self):
        ies = gen_extractors()
-        for tc in gettestcases(include_onlymatching=True):
+        for tc in gettestcases(include_onlymatching=True, include_pagesuitable=False):
            url = tc['url']
            for ie in ies:
                if type(ie).__name__ in ('GenericIE', tc['name'] + 'IE'):
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -1613,6 +1613,7 @@ from .youtube import (
    YoutubeYtBeIE,
    YoutubeYtUserIE,
    YoutubeWatchLaterIE,
    InvidiousIE,
 )
 from .zapiks import ZapiksIE
 from .zattoo import (
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -7,7 +7,11 @@ import re
 import sys
 from .common import InfoExtractor
-from .youtube import YoutubeIE
+from .youtube import (
    YoutubeIE,
    InvidiousIE,
 )
 from ..compat import (
    compat_etree_fromstring,
    compat_str,
@ -2633,6 +2637,10 @@ class GenericIE(InfoExtractor):
            return self.playlist_from_matches(
                youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key())
        # Invidious YT front-end
        if InvidiousIE.page_suitable(self, url, webpage):
            return InvidiousIE.page_url_result(url, video_id, video_title, webpage)
        matches = DailymotionIE._extract_urls(webpage)
        if matches:
            return self.playlist_from_matches(matches, video_id, video_title)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@ -63,7 +63,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False
-    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)'
+    _VIDEO_ID_RE = r'[0-9A-Za-z_-]{11}'
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z_-]{10,}|RDMM)'
    _CHANNEL_ID_RE = r'(?:UC[0-9A-Za-z_-]{10,})'
    def _login(self):
        """
@ -342,65 +344,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
 class YoutubeIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com'
-    _INVIDIOUS_SITES = (
+    _CANONICAL_VIDEO_RE = r'https://www\.youtube\.com/watch\?v=%s' % YoutubeBaseInfoExtractor._VIDEO_ID_RE
        # invidious-redirect websites
        r'(?:www\.)?redirect\.invidious\.io',
        r'(?:(?:www|dev)\.)?invidio\.us',
        # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
        r'(?:(?:www|no)\.)?invidiou\.sh',
        r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
        r'(?:www\.)?invidious\.kabi\.tk',
        r'(?:www\.)?invidious\.13ad\.de',
        r'(?:www\.)?invidious\.mastodon\.host',
        r'(?:www\.)?invidious\.zapashcanon\.fr',
        r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks',
        r'(?:www\.)?invidious\.tinfoil-hat\.net',
        r'(?:www\.)?invidious\.himiko\.cloud',
        r'(?:www\.)?invidious\.reallyancient\.tech',
        r'(?:www\.)?invidious\.tube',
        r'(?:www\.)?invidiou\.site',
        r'(?:www\.)?invidious\.site',
        r'(?:www\.)?invidious\.xyz',
        r'(?:www\.)?invidious\.nixnet\.xyz',
        r'(?:www\.)?invidious\.048596\.xyz',
        r'(?:www\.)?invidious\.drycat\.fr',
        r'(?:www\.)?inv\.skyn3t\.in',
        r'(?:www\.)?tube\.poal\.co',
        r'(?:www\.)?tube\.connect\.cafe',
        r'(?:www\.)?vid\.wxzm\.sx',
        r'(?:www\.)?vid\.mint\.lgbt',
        r'(?:www\.)?vid\.puffyan\.us',
        r'(?:www\.)?yewtu\.be',
        r'(?:www\.)?yt\.elukerio\.org',
        r'(?:www\.)?yt\.lelux\.fi',
        r'(?:www\.)?invidious\.ggc-project\.de',
        r'(?:www\.)?yt\.maisputain\.ovh',
        r'(?:www\.)?ytprivate\.com',
        r'(?:www\.)?invidious\.13ad\.de',
        r'(?:www\.)?invidious\.toot\.koeln',
        r'(?:www\.)?invidious\.fdn\.fr',
        r'(?:www\.)?watch\.nettohikari\.com',
        r'(?:www\.)?invidious\.namazso\.eu',
        r'(?:www\.)?invidious\.silkky\.cloud',
        r'(?:www\.)?invidious\.exonip\.de',
        r'(?:www\.)?invidious\.riverside\.rocks',
        r'(?:www\.)?invidious\.blamefran\.net',
        r'(?:www\.)?invidious\.moomoo\.de',
        r'(?:www\.)?ytb\.trom\.tf',
        r'(?:www\.)?yt\.cyberhost\.uk',
        r'(?:www\.)?kgg2m7yk5aybusll\.onion',
        r'(?:www\.)?qklhadlycap4cnod\.onion',
        r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
        r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
        r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
        r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
        r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
        r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
        r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
        r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
        r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
        r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
    )
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
@ -410,7 +354,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                            (?:www\.)?hooktube\.com|
                            (?:www\.)?yourepeat\.com|
                            tube\.majestyc\.net|
                            %(invidious)s|
                            youtube\.googleapis\.com)/                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
@ -425,16 +368,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                         |(?:
                            youtu\.be|                                        # just youtu.be/xxxx
                            vid\.plus|                                        # or vid.plus/xxxx
-                            zwearz\.com/watch|                                # or zwearz.com/watch/xxxx
+                            zwearz\.com/watch                                # or zwearz.com/watch/xxxx
                            %(invidious)s
                         )/
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
-                     (?P<id>[0-9A-Za-z_-]{11})                                # here is it! the YouTube video ID
+                     (?P<id>%(video_id)s)                                     # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $""" % {
-        'invidious': '|'.join(_INVIDIOUS_SITES),
+        'video_id': YoutubeBaseInfoExtractor._VIDEO_ID_RE,
    }
    _PLAYER_INFO_RE = (
        r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
@ -944,19 +886,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            'url': 'sJL6WA-aGkQ',
            'only_matching': True,
        },
        {
            'url': 'https://invidio.us/watch?v=BaW_jenozKc',
            'only_matching': True,
        },
        {
            'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc',
            'only_matching': True,
        },
        {
            # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m
            'url': 'https://redirect.invidious.io/Yh0AhrY9GjA',
            'only_matching': True,
        },
        {
            # DRM protected
            'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
@ -2013,10 +1942,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
    _VALID_URL = r'''(?x)
                    https?://
                        (?:\w+\.)?
-                        (?:
+                        youtube(?:kids)?\.com
-                            youtube(?:kids)?\.com|
+                        /
                            invidio\.us
                        )/
                        (?:
                            (?:channel|c|user|feed|hashtag)/|
                            (?:playlist|watch)\?.*?\blist=|
@ -2139,9 +2066,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
            'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
        },
        'playlist_mincount': 138,
    }, {
        'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
        'only_matching': True,
@ -2192,9 +2116,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
            'uploader': 'Computerphile',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
        'only_matching': True,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
@ -2870,7 +2791,6 @@ class YoutubePlaylistIE(InfoExtractor):
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
@ -3255,3 +3175,222 @@ class YoutubeTruncatedIDIE(InfoExtractor):
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
            expected=True)
 class InvidiousIE(YoutubeIE):
    # Invidious Instances (#29885) per https://github.com/iv-org/invidious/pull/1730
    # Thanks: https://github.com/yt-dlp/yt-dlp/commit/df0c81513e0bb37986d00c532a5ad8cef31a24ea
    IE_NAME = 'invidious'
    IE_DESC = 'Invidious YT front-end videos, playlists, channels, searches'
    _VALID_URL = r'ytdlie://Invidious#(?P<id>.+)'
    _REAL_VALID_URL = r"""(?x)^
                     (?:(
                         (?:https?:)?//                                       # http(s):// or protocol-independent URL
                         (?:[a-zA-Z\d-]+\.)+[a-zA-Z\d-]+/                                        # any domain
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/(?!videoseries))?               # v/ or embed/ or e/, or nothing
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?[&;])??                                # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
                                 v=
                             )
                         )
                       )?|(?:                                                     # all until now is optional -> you can pass the naked ID
                         (?:https?:)?//                                       # http(s):// or protocol-independent URL
                         (?:[a-zA-Z\d-]+\.)+[a-zA-Z\d-]+/                                        # any domain
                         (?:(
                             (?:playlist|watch)\?.*?\blist=
                           )|(
                             (?:feed/)?(?:channel|c|user)/
                           )|(
                             (?:feed|hashtag)/
                           )|(search\?(?=q=)
                           )
                         )
                       )
                     )
                     (?(2)IV)?(?P<id>(?(2)%(playlist_id)s|
                                         (?(3)%(channel_id)s|
                                              (?(4)%(feed_id)s|
                                                  (?(5).+|
                                                      %(video_id)s))))
                              )
                     (?(1).*)                                                 # if we found the ID, everything can follow
                     $""" % {
        'video_id': YoutubeBaseInfoExtractor._VIDEO_ID_RE,
        'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE,
        'channel_id': YoutubeBaseInfoExtractor._CHANNEL_ID_RE,
        'feed_id': r'[0-9a-zA-Z_-]{10,}',
    }
    _LINK_RE = r'<link\s[^>]*?%s[^>]*>'
    # <link title="Invidious">
    _LINK_TITLE_RE = _LINK_RE % r'title\s*=\s*(?P<q>"|\'|\b)Invidious(?P=q)'
    _TITLE_RE = r'<title\b[^>]*?>(.+)\s*-\s*Invidious</title>'
    _TESTS = [{
        # Invidious video page with standard link to YT
        'url': 'https://invidious.snopyta.org/watch?v=aU_jWooBxzI',
        'md5': 'fad656e510b491dcbefba0b0065ceb37',
        'info_dict': {
            'id': 'aU_jWooBxzI',
            'ext': 'mp4',
            'title': 'PCs are TOO Powerful… and it’s a problem',
            'thumbnail': r're:https?://i.ytimg.com/.+\.jpg',
            'upload_date': '20210818',
            'uploader': 'Linus Tech Tips',
            'uploader_id': 'LinusTechTips',
            'description': 'md5:749b04d3931048628191889dfb14c5ba',
            'duration': 708,
        },
    }, {
        # Invidious video page with standard link to YT
        'url': 'https://invidious-us.kavin.rocks/watch?v=15TvLqK29PU&list=IVPLxy40xZSaui6mZCrEUbd-MeMQD41-k6D',
        'md5': '7a7ab808f6cee434361463161c046d25',
        'info_dict': {
            'id': '15TvLqK29PU',
            'ext': 'mp4',
            'title': 'md5:5130b529083cd4a692c4917beb059428',
            'thumbnail': 're:https?://i.ytimg.com/.+',
            'upload_date': '20120921',
            'uploader': 'md5:42326ad7441688122b035175a51de385',
            'uploader_id': 'UCuel_9Lg9WH9P5dFnXZ0zKQ',
            'description': 'md5:541ed05829043b077d920029641ad831',
            'duration': 366,
        },
        'params': {
            # Cloudflare breaks HTTP if Chrome is mentioned in the UA (2021-08)
            'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/6533.18.5 (KHTML, like Gecko) Safari/6533.18.5',
        },
        'skip': 'test_download doesn\'t respect the user_agent option',
    }, {
        # Invidious playlist, more than one page
        # Results may be fewer than YoutubePlaylistIE because of global deduplication
        'url': 'https://invidious.snopyta.org/playlist?list=PL143B3D4078ECAD35',
        'info_dict': {
            'id': 'PL143B3D4078ECAD35',
            'title': 'yum yum gimme sum',
        },
        'playlist_mincount': 95,
    }, {
        # Invidious channel, more than one page
        'url': 'https://invidious.snopyta.org/channel/UCFEFodsnfvA2diJOn7xxd0g',
        'info_dict': {
            'id': 'UCFEFodsnfvA2diJOn7xxd0g',
            'title': 'waaghalsrecords',
        },
        'playlist_mincount': 90,
    }, {
        # Invidious search, more than one page
        'url': 'https://invidious.snopyta.org/search?q=dale+"hawkins"',
        'info_dict': {
            'id': 'q=dale+"hawkins"',
            'title': 'dale "hawkins"',
        },
        'playlist_mincount': 410,
    }]
    @classmethod
    def page_suitable(cls, caller, url, webpage):
        '''Return truthy iff the webpage at the URL is suitable for the extractor

        A page is taken to be suitable when it contains a <link> element
        whose title attribute is "Invidious" (see _LINK_TITLE_RE).

        Arguments:
        caller -- the calling InfoExtractor instance; its _search_regex()
                  is used to scan the page
        url -- compat_str url to test against the class (not consulted by
               this implementation)
        webpage -- compat_str text of the page at the url

        Returns the text of the matching <link> tag, or False if there
        is none.
        '''
        # The only capture group in _LINK_TITLE_RE is the quote character
        # `q`, which is the empty string when the attribute is unquoted
        # (title=Invidious). Returning that group would make a genuine match
        # look falsy, so ask for the whole match instead.
        # NOTE(review): assumes _search_regex(group=0) returns the whole
        # match, consistent with the other `group=` uses in this class.
        return caller._search_regex(
            cls._LINK_TITLE_RE, webpage, 'Invidious title',
            default=False, group=0)
    @classmethod
    def page_url_result(cls, url, video_id=None, video_title=None, webpage=None):
        '''Build a url_result that hands `url` over to this extractor

        The real page URL is smuggled into a pseudo-URL made of the custom
        ytdlie:// scheme followed by this class's ie_key(). `webpage` is
        accepted but not used.
        '''
        key = cls.ie_key()
        payload = {'url': url}
        pseudo_url = smuggle_url('ytdlie://' + key, payload)
        return cls.url_result(
            pseudo_url,
            ie=key,
            video_id=video_id,
            video_title=video_title)
    @classmethod
    def _real_match_id(cls, url):
        '''Return ID from url matched against _REAL_VALID_URL

        The compiled pattern is cached as _REAL_VALID_URL_RE on the class
        itself; the cache is looked up in cls.__dict__, so a cache stored
        on a parent class is not reused.

        Arguments:
        url -- compat_str real (unsmuggled) page URL

        Raises ExtractorError (expected) if the url does not match.
        '''
        if '_REAL_VALID_URL_RE' not in cls.__dict__:
            cls._REAL_VALID_URL_RE = re.compile(cls._REAL_VALID_URL)
        m = cls._REAL_VALID_URL_RE.match(url)
        if m is None:
            # Pages are selected by their content (page_suitable()), not by
            # their URL, so a page whose URL has no recognisable ID can get
            # here. Report it as an ordinary extraction failure: an assert
            # would vanish under `python -O` and otherwise surface as a bare
            # AssertionError.
            raise ExtractorError(
                'Unsupported Invidious URL: %s' % url, expected=True)
        return compat_str(m.group('id'))
    def _real_extract(self, url):
        '''Extract a video or a playlist from an Invidious page

        If the page has a <link rel="alternate"> whose href is a canonical
        YouTube watch URL, return a url_result for YoutubeIE with that URL.
        Otherwise treat the page as a listing (playlist, channel, search,
        ...) and return a playlist of YouTube watch URLs built from the
        /watch?v= links on the page and on the further pages reached by
        following its page=N links.

        Arguments:
        url -- the ytdlie:// pseudo-URL built by page_url_result(), with the
               real page URL carried as smuggled data under the key 'url'

        Raises ExtractorError (expected) if no page URL was smuggled in.
        '''
        smuggled_data = unsmuggle_url(url, {})[1]
        if 'url' not in smuggled_data:
            # e.g. the pseudo-URL was supplied directly rather than built by
            # page_url_result(); fail cleanly instead of with a KeyError
            raise ExtractorError(
                'No page URL was passed with %s' % url, expected=True)
        url = smuggled_data['url']
        video_id = self._real_match_id(url)
        webpage = self._download_webpage(url, video_id)
        # single video page?
        # rel="alternate" with either quote style or none; %(n)d keeps the
        # group names distinct when the fragment is used twice in one pattern
        REL_ALT_RE = r'(?P<rel%(n)d>rel\s*=\s*(?P<q%(n)d>"|\'|\b)alternate(?P=q%(n)d))'
        LINK_REL_ALT_TEMPL = (
            r'''
                %(rel_alt1)s                               # rel="alternate"
                \s[^>]*?
                href\s*=\s*(?P<q0>"|\'|\b)                 # href="invid URL"
                    (?P<invid_url>%(canonical_video_url)s)(?P=q0)
                (?(rel1)|\s[^>]*?%(rel_alt2)s)             # rel="alternate" if following
            ''')
        # wrap the template in the generic <link ...> pattern (verbose mode)
        LINK_REL_ALT_RE = '(?x)' + self._LINK_RE % LINK_REL_ALT_TEMPL
        yt_url = self._search_regex(
            LINK_REL_ALT_RE
            % {
                'rel_alt1': REL_ALT_RE % {'n': 1, },
                'canonical_video_url': YoutubeIE._CANONICAL_VIDEO_RE,
                'rel_alt2': REL_ALT_RE % {'n': 2, },
            },
            webpage, 'youtube link', default=None, group='invid_url')
        if yt_url:
            return self.url_result(yt_url, ie=YoutubeIE.ie_key(), video_id=video_id)
        # perhaps it's a playlist or a channel?
        title = self._html_search_regex(self._TITLE_RE, webpage, 'page title', default=None)
        # link to another page of results; the %s is filled first with \d+
        # (to discover a page number) and later with each specific number
        NEXT_PAGE_RE = r'''(?x)
                           <a\s[^>]*?href\s*=\s*(?P<q>"|\'|\b)
                               (?P<next_page>.+?[&?]page=(?P<page_num>%s))
                           (?P=q)>
                       '''
        # generate all video links from page and further pages
        def gen_extract(url, webpage, video_id):
            next_page_re = NEXT_PAGE_RE
            # number of the first page=N link found on the page, or 2 if
            # there is none (or it is not an integer)
            next_page = (int_or_none(
                self._search_regex(
                    next_page_re % r'\d+',
                    webpage, 'next page num', default=None, group='page_num'))
                or 2)
            # turn the %s placeholder into %d so each page number can be
            # substituted in the loop below
            next_page_re = next_page_re % '%d'
            # links to /watch?v=<id>, skipping those that carry listen=1
            VIDEO_LINK_RE = r'''(?x)
                                <a\s[^>]*?
                                    href\s*=\s*(?P<q>"|\'|\b)
                                    /watch\?v=(%s)(?!.+\blisten=1.*).*?
                                (?P=q)
                            ''' % self._VIDEO_ID_RE
            for n in itertools.count(next_page):
                # the pattern has two groups, so findall() yields
                # (quote, video id) pairs
                video_ids = re.findall(VIDEO_LINK_RE, webpage)
                for _, vid in video_ids:
                    yield 'https://www.youtube.com/watch?v=%s' % vid
                # look only for the link to page n, then fetch that page
                next_page = self._search_regex(
                    next_page_re % n,
                    webpage, 'next page', default=None, group='next_page')
                webpage = (
                    next_page
                    and self._download_webpage(urljoin(url, next_page), video_id, fatal=False))
                # stop when there is no link to page n or the download failed
                if not webpage:
                    break
        return self.playlist_from_matches(
            gen_extract(url, webpage, video_id),
            playlist_id=video_id, playlist_title=title,
            ie=YoutubeIE.ie_key())
Author	SHA1	Message	Date
df	88b767d8e2	Allow for page_suitable() extraction in tests	2021-09-11 10:36:05 +01:00
df	fc92c793d2	Support Invidious YT front-end with any host name * implement page-based detection and redirection using ytdlie:// * add support for playlists and channels * support extracting videos from further pages	2021-09-11 09:05:10 +01:00