Allow for page_suitable() extraction in tests

Support Invidious YT front-end with any host name
* implement page-based detection and redirection using ytdlie:// * add support for playlists and channels * support extracting videos from further pages
2021-09-11 10:36:05 +01:00 · 2021-09-11 09:05:10 +01:00
5 changed files with 242 additions and 92 deletions
--- a/test/helper.py
+++ b/test/helper.py
@ -89,8 +89,10 @@ class FakeYDL(YoutubeDL):
        self.report_warning = types.MethodType(report_warning, self)


-def gettestcases(include_onlymatching=False):
+def gettestcases(include_onlymatching=False, include_pagesuitable=True):
+    """Yield the test cases of every extractor returned by gen_extractors().
+
+    include_onlymatching -- passed through unchanged to each extractor's
+                            get_testcases()
+    include_pagesuitable -- when falsy, extractors that expose a callable
+                            page_suitable attribute are skipped entirely
+    """
    for ie in youtube_dl.extractor.gen_extractors():
+        if not include_pagesuitable and callable(getattr(ie, 'page_suitable', None)):
+            continue
        for tc in ie.get_testcases(include_onlymatching):
            yield tc

--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@ -76,7 +76,7 @@ class TestAllURLsMatching(unittest.TestCase):

    def test_no_duplicates(self):
        ies = gen_extractors()
-        for tc in gettestcases(include_onlymatching=True):
+        for tc in gettestcases(include_onlymatching=True, include_pagesuitable=False):
            url = tc['url']
            for ie in ies:
                if type(ie).__name__ in ('GenericIE', tc['name'] + 'IE'):
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -1613,6 +1613,7 @@ from .youtube import (
    YoutubeYtBeIE,
    YoutubeYtUserIE,
    YoutubeWatchLaterIE,
+    InvidiousIE,
 )
 from .zapiks import ZapiksIE
 from .zattoo import (
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -7,7 +7,11 @@ import re
 import sys

 from .common import InfoExtractor
-from .youtube import YoutubeIE
+from .youtube import (
+    YoutubeIE,
+    InvidiousIE,
+)
+
 from ..compat import (
    compat_etree_fromstring,
    compat_str,
@ -2633,6 +2637,10 @@ class GenericIE(InfoExtractor):
            return self.playlist_from_matches(
                youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key())

+        # Invidious YT front-end
+        if InvidiousIE.page_suitable(self, url, webpage):
+            return InvidiousIE.page_url_result(url, video_id, video_title, webpage)
+
        matches = DailymotionIE._extract_urls(webpage)
        if matches:
            return self.playlist_from_matches(matches, video_id, video_title)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@ -63,7 +63,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

-    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)'
+    _VIDEO_ID_RE = r'[0-9A-Za-z_-]{11}'
+    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z_-]{10,}|RDMM)'
+    _CHANNEL_ID_RE = r'(?:UC[0-9A-Za-z_-]{10,})'

    def _login(self):
        """
@ -342,65 +344,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):

 class YoutubeIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com'
-    _INVIDIOUS_SITES = (
-        # invidious-redirect websites
-        r'(?:www\.)?redirect\.invidious\.io',
-        r'(?:(?:www|dev)\.)?invidio\.us',
-        # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
-        r'(?:(?:www|no)\.)?invidiou\.sh',
-        r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
-        r'(?:www\.)?invidious\.kabi\.tk',
-        r'(?:www\.)?invidious\.13ad\.de',
-        r'(?:www\.)?invidious\.mastodon\.host',
-        r'(?:www\.)?invidious\.zapashcanon\.fr',
-        r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks',
-        r'(?:www\.)?invidious\.tinfoil-hat\.net',
-        r'(?:www\.)?invidious\.himiko\.cloud',
-        r'(?:www\.)?invidious\.reallyancient\.tech',
-        r'(?:www\.)?invidious\.tube',
-        r'(?:www\.)?invidiou\.site',
-        r'(?:www\.)?invidious\.site',
-        r'(?:www\.)?invidious\.xyz',
-        r'(?:www\.)?invidious\.nixnet\.xyz',
-        r'(?:www\.)?invidious\.048596\.xyz',
-        r'(?:www\.)?invidious\.drycat\.fr',
-        r'(?:www\.)?inv\.skyn3t\.in',
-        r'(?:www\.)?tube\.poal\.co',
-        r'(?:www\.)?tube\.connect\.cafe',
-        r'(?:www\.)?vid\.wxzm\.sx',
-        r'(?:www\.)?vid\.mint\.lgbt',
-        r'(?:www\.)?vid\.puffyan\.us',
-        r'(?:www\.)?yewtu\.be',
-        r'(?:www\.)?yt\.elukerio\.org',
-        r'(?:www\.)?yt\.lelux\.fi',
-        r'(?:www\.)?invidious\.ggc-project\.de',
-        r'(?:www\.)?yt\.maisputain\.ovh',
-        r'(?:www\.)?ytprivate\.com',
-        r'(?:www\.)?invidious\.13ad\.de',
-        r'(?:www\.)?invidious\.toot\.koeln',
-        r'(?:www\.)?invidious\.fdn\.fr',
-        r'(?:www\.)?watch\.nettohikari\.com',
-        r'(?:www\.)?invidious\.namazso\.eu',
-        r'(?:www\.)?invidious\.silkky\.cloud',
-        r'(?:www\.)?invidious\.exonip\.de',
-        r'(?:www\.)?invidious\.riverside\.rocks',
-        r'(?:www\.)?invidious\.blamefran\.net',
-        r'(?:www\.)?invidious\.moomoo\.de',
-        r'(?:www\.)?ytb\.trom\.tf',
-        r'(?:www\.)?yt\.cyberhost\.uk',
-        r'(?:www\.)?kgg2m7yk5aybusll\.onion',
-        r'(?:www\.)?qklhadlycap4cnod\.onion',
-        r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
-        r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
-        r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
-        r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
-        r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
-        r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
-        r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
-        r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
-        r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
-        r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
-    )
+    _CANONICAL_VIDEO_RE = r'https://www\.youtube\.com/watch\?v=%s' % YoutubeBaseInfoExtractor._VIDEO_ID_RE
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
@ -410,7 +354,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                            (?:www\.)?hooktube\.com|
                            (?:www\.)?yourepeat\.com|
                            tube\.majestyc\.net|
-                            %(invidious)s|
                            youtube\.googleapis\.com)/                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
@ -425,16 +368,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                         |(?:
                            youtu\.be|                                        # just youtu.be/xxxx
                            vid\.plus|                                        # or vid.plus/xxxx
-                            zwearz\.com/watch|                                # or zwearz.com/watch/xxxx
-                            %(invidious)s
+                            zwearz\.com/watch                                # or zwearz.com/watch/xxxx
                         )/
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
-                     (?P<id>[0-9A-Za-z_-]{11})                                # here is it! the YouTube video ID
+                     (?P<id>%(video_id)s)                                     # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $""" % {
-        'invidious': '|'.join(_INVIDIOUS_SITES),
+        'video_id': YoutubeBaseInfoExtractor._VIDEO_ID_RE,
    }
    _PLAYER_INFO_RE = (
        r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
@ -944,19 +886,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            'url': 'sJL6WA-aGkQ',
            'only_matching': True,
        },
-        {
-            'url': 'https://invidio.us/watch?v=BaW_jenozKc',
-            'only_matching': True,
-        },
-        {
-            'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc',
-            'only_matching': True,
-        },
-        {
-            # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m
-            'url': 'https://redirect.invidious.io/Yh0AhrY9GjA',
-            'only_matching': True,
-        },
        {
            # DRM protected
            'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
@ -2013,10 +1942,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
    _VALID_URL = r'''(?x)
                    https?://
                        (?:\w+\.)?
-                        (?:
-                            youtube(?:kids)?\.com|
-                            invidio\.us
-                        )/
+                        youtube(?:kids)?\.com
+                        /
                        (?:
                            (?:channel|c|user|feed|hashtag)/|
                            (?:playlist|watch)\?.*?\blist=|
@ -2139,9 +2066,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
            'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
        },
        'playlist_mincount': 138,
-    }, {
-        'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
-        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
        'only_matching': True,
@ -2192,9 +2116,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
            'uploader': 'Computerphile',
        },
        'playlist_mincount': 11,
-    }, {
-        'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
-        'only_matching': True,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
@ -2870,7 +2791,6 @@ class YoutubePlaylistIE(InfoExtractor):
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
-                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
@ -3255,3 +3175,222 @@ class YoutubeTruncatedIDIE(InfoExtractor):
        raise ExtractorError(
            'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
            expected=True)
+
+
+class InvidiousIE(YoutubeIE):
+    # Invidious Instances (#29885) per https://github.com/iv-org/invidious/pull/1730
+    # Thanks: https://github.com/yt-dlp/yt-dlp/commit/df0c81513e0bb37986d00c532a5ad8cef31a24ea
+    IE_NAME = 'invidious'
+    IE_DESC = 'Invidious YT front-end videos, playlists, channels, searches'
+    _VALID_URL = r'ytdlie://Invidious#(?P<id>.+)'
+    _REAL_VALID_URL = r"""(?x)^
+                     (?:(
+                         (?:https?:)?//                                       # http(s):// or protocol-independent URL
+                         (?:[a-zA-Z\d-]+\.)+[a-zA-Z\d-]+/                                        # any domain
+                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
+                         (?:                                                  # the various things that can precede the ID:
+                             (?:(?:v|embed|e)/(?!videoseries))?               # v/ or embed/ or e/, or nothing
+                             |(?:                                             # or the v= param in all its forms
+                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
+                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
+                                 (?:.*?[&;])??                                # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
+                                 v=
+                             )
+                         )
+                       )?|(?:                                                     # all until now is optional -> you can pass the naked ID
+                         (?:https?:)?//                                       # http(s):// or protocol-independent URL
+                         (?:[a-zA-Z\d-]+\.)+[a-zA-Z\d-]+/                                        # any domain
+                         (?:(
+                             (?:playlist|watch)\?.*?\blist=
+                           )|(
+                             (?:feed/)?(?:channel|c|user)/
+                           )|(
+                             (?:feed|hashtag)/
+                           )|(search\?(?=q=)
+                           )
+                         )
+                       )
+                     )
+                     (?(2)IV)?(?P<id>(?(2)%(playlist_id)s|
+                                         (?(3)%(channel_id)s|
+                                              (?(4)%(feed_id)s|
+                                                  (?(5).+|
+                                                      %(video_id)s))))
+                              )
+                     (?(1).*)                                                 # if we found the ID, everything can follow
+                     $""" % {
+        'video_id': YoutubeBaseInfoExtractor._VIDEO_ID_RE,
+        'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE,
+        'channel_id': YoutubeBaseInfoExtractor._CHANNEL_ID_RE,
+        'feed_id': r'[0-9a-zA-Z_-]{10,}',
+    }
+    _LINK_RE = r'<link\s[^>]*?%s[^>]*>'
+    # <link title="Invidious">
+    _LINK_TITLE_RE = _LINK_RE % r'title\s*=\s*(?P<q>"|\'|\b)Invidious(?P=q)'
+    _TITLE_RE = r'<title\b[^>]*?>(.+)\s*-\s*Invidious</title>'
+    _TESTS = [{
+        # Invidious video page with standard link to YT
+        'url': 'https://invidious.snopyta.org/watch?v=aU_jWooBxzI',
+        'md5': 'fad656e510b491dcbefba0b0065ceb37',
+        'info_dict': {
+            'id': 'aU_jWooBxzI',
+            'ext': 'mp4',
+            'title': 'PCs are TOO Powerful… and it’s a problem',
+            'thumbnail': r're:https?://i.ytimg.com/.+\.jpg',
+            'upload_date': '20210818',
+            'uploader': 'Linus Tech Tips',
+            'uploader_id': 'LinusTechTips',
+            'description': 'md5:749b04d3931048628191889dfb14c5ba',
+            'duration': 708,
+        },
+    }, {
+        # Invidious video page with standard link to YT, URL also carrying a list= parameter
+        'url': 'https://invidious-us.kavin.rocks/watch?v=15TvLqK29PU&list=IVPLxy40xZSaui6mZCrEUbd-MeMQD41-k6D',
+        'md5': '7a7ab808f6cee434361463161c046d25',
+        'info_dict': {
+            'id': '15TvLqK29PU',
+            'ext': 'mp4',
+            'title': 'md5:5130b529083cd4a692c4917beb059428',
+            'thumbnail': 're:https?://i.ytimg.com/.+',
+            'upload_date': '20120921',
+            'uploader': 'md5:42326ad7441688122b035175a51de385',
+            'uploader_id': 'UCuel_9Lg9WH9P5dFnXZ0zKQ',
+            'description': 'md5:541ed05829043b077d920029641ad831',
+            'duration': 366,
+        },
+        'params': {
+            # Cloudflare breaks HTTP if Chrome is mentioned in the UA (2021-08)
+            'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/6533.18.5 (KHTML, like Gecko) Safari/6533.18.5',
+        },
+        'skip': 'test_download doesn\'t respect the user_agent option',
+    }, {
+        # Invidious playlist, more than one page
+        # Results may be fewer than YoutubePlaylistIE because of global deduplication
+        'url': 'https://invidious.snopyta.org/playlist?list=PL143B3D4078ECAD35',
+        'info_dict': {
+            'id': 'PL143B3D4078ECAD35',
+            'title': 'yum yum gimme sum',
+        },
+        'playlist_mincount': 95,
+    }, {
+        # Invidious channel, more than one page
+        'url': 'https://invidious.snopyta.org/channel/UCFEFodsnfvA2diJOn7xxd0g',
+        'info_dict': {
+            'id': 'UCFEFodsnfvA2diJOn7xxd0g',
+            'title': 'waaghalsrecords',
+        },
+        'playlist_mincount': 90,
+    }, {
+        # Invidious search, more than one page
+        'url': 'https://invidious.snopyta.org/search?q=dale+"hawkins"',
+        'info_dict': {
+            'id': 'q=dale+"hawkins"',
+            'title': 'dale "hawkins"',
+        },
+        'playlist_mincount': 410,
+    }]
+
+    @classmethod
+    def page_suitable(cls, caller, url, webpage):
+        '''Return truthy iff the webpage at the URL is suitable for the extractor
+
+        Arguments:
+        cls -- the InfoExtractor class being tested
+        caller -- the calling InfoExtractor instance
+        url -- compat_str url to test against the class
+        webpage -- compat_str text of the page at the url
+        '''
+
+        return caller._search_regex(cls._LINK_TITLE_RE, webpage, 'Invidious title', default=False)
+
+    @classmethod
+    def page_url_result(cls, url, video_id=None, video_title=None, webpage=None):
+        '''Pass the URL to an extractor using the custom ytdlie:// scheme'''
+        ie_key = cls.ie_key()
+        url = smuggle_url('ytdlie://' + ie_key, {'url': url, })
+        return cls.url_result(url, ie=ie_key, video_id=video_id, video_title=video_title)
+
+    @classmethod
+    def _real_match_id(cls, url):
+        '''Return ID from url matched against _REAL_VALID_URL'''
+        if '_REAL_VALID_URL_RE' not in cls.__dict__:
+            cls._REAL_VALID_URL_RE = re.compile(cls._REAL_VALID_URL)
+        m = re.match(cls._REAL_VALID_URL_RE, url)
+        assert m
+        return compat_str(m.group('id'))
+
+    def _real_extract(self, url):
+        '''Return YT URL of video in an Invidious single video page
+
+        Arguments:
+        caller -- an InfoExtractor
+        webpage -- compat_str text of the video page
+        '''
+        url = unsmuggle_url(url, {})[1]['url']
+        video_id = self._real_match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # single video page?
+        REL_ALT_RE = r'(?P<rel%(n)d>rel\s*=\s*(?P<q%(n)d>"|\'|\b)alternate(?P=q%(n)d))'
+        LINK_REL_ALT_TEMPL = (
+            r'''
+                %(rel_alt1)s                               # rel="alternate"
+                \s[^>]*?
+                href\s*=\s*(?P<q0>"|\'|\b)                 # href="invid URL"
+                    (?P<invid_url>%(canonical_video_url)s)(?P=q0)
+                (?(rel1)|\s[^>]*?%(rel_alt2)s)             # rel="alternate" if following
+            ''')
+        LINK_REL_ALT_RE = '(?x)' + self._LINK_RE % LINK_REL_ALT_TEMPL
+        yt_url = self._search_regex(
+            LINK_REL_ALT_RE
+            % {
+                'rel_alt1': REL_ALT_RE % {'n': 1, },
+                'canonical_video_url': YoutubeIE._CANONICAL_VIDEO_RE,
+                'rel_alt2': REL_ALT_RE % {'n': 2, },
+            },
+            webpage, 'youtube link', default=None, group='invid_url')
+        if yt_url:
+            return self.url_result(yt_url, ie=YoutubeIE.ie_key(), video_id=video_id)
+
+        # perhaps it's a playlist or a channel?
+        title = self._html_search_regex(self._TITLE_RE, webpage, 'page title', default=None)
+
+        NEXT_PAGE_RE = r'''(?x)
+                           <a\s[^>]*?href\s*=\s*(?P<q>"|\'|\b)
+                               (?P<next_page>.+?[&?]page=(?P<page_num>%s))
+                           (?P=q)>
+                       '''
+
+        # generate all video links from page and further pages
+        def gen_extract(url, webpage, video_id):
+            next_page_re = NEXT_PAGE_RE
+            next_page = (int_or_none(
+                self._search_regex(
+                    next_page_re % r'\d+',
+                    webpage, 'next page num', default=None, group='page_num'))
+                or 2)
+            next_page_re = next_page_re % '%d'
+            VIDEO_LINK_RE = r'''(?x)
+                                <a\s[^>]*?
+                                    href\s*=\s*(?P<q>"|\'|\b)
+                                    /watch\?v=(%s)(?!.+\blisten=1.*).*?
+                                (?P=q)
+                            ''' % self._VIDEO_ID_RE
+            for n in itertools.count(next_page):
+                video_ids = re.findall(VIDEO_LINK_RE, webpage)
+                for _, vid in video_ids:
+                    yield 'https://www.youtube.com/watch?v=%s' % vid
+                next_page = self._search_regex(
+                    next_page_re % n,
+                    webpage, 'next page', default=None, group='next_page')
+                webpage = (
+                    next_page
+                    and self._download_webpage(urljoin(url, next_page), video_id, fatal=False))
+
+                if not webpage:
+                    break
+
+        return self.playlist_from_matches(
+            gen_extract(url, webpage, video_id),
+            playlist_id=video_id, playlist_title=title,
+            ie=YoutubeIE.ie_key())
Author	SHA1	Message	Date
df	88b767d8e2	Allow for page_suitable() extraction in tests	2021-09-11 10:36:05 +01:00
df	fc92c793d2	Support Invidious YT front-end with any host name * implement page-based detection and redirection using ytdlie:// * add support for playlists and channels * support extracting videos from further pages	2021-09-11 09:05:10 +01:00