Compare commits

...

3 Commits

Author SHA1 Message Date
df f798b40cf3 Disambiguate 4-digit year and time-zone suffix 2021-09-13 01:15:48 +01:00
   Restore check omitted from extract_timezone(); adjust DATE_FORMATS_DAY/MONTH_FIRST; add tests.
df 1e222005ba Fix urlhandle_detect_ext() non-ASCII error in Py2, with test 2021-08-29 06:27:54 +01:00
df 197215782b Small fixes to utils and compat and test 2021-08-23 18:02:26 +01:00
4 changed files with 76 additions and 16 deletions
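
For orientation, the first commit deals with date strings such as '11:31 17-Jun-2021-0000', where the trailing '-0000' has to be read as a UTC offset rather than as part of the year. A minimal sketch of the intended behaviour, mirroring the assertions added to test_utils.py below (illustration only, not part of the change):

    from youtube_dl.utils import unified_timestamp

    # '2021' is the year; a following '-0000' is an explicit +00:00 offset
    assert unified_timestamp('11:31 17-Jun-2021') == 1623929460
    assert unified_timestamp('11:31 17-Jun-2021-0000') == 1623929460  # same instant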

test/test_compat.py    View File

@@ -39,8 +39,12 @@ class TestCompat(unittest.TestCase):
         self.assertEqual(compat_getenv(test_var), test_str)

     def test_compat_expanduser(self):
+        from youtube_dl.compat import compat_os_name
         old_home = os.environ.get('HOME')
-        test_str = r'C:\Documents and Settings\тест\Application Data'
+        if compat_os_name in ('nt', 'ce'):
+            test_str = r'C:\Documents and Settings\тест\Application Data'
+        else:
+            test_str = '/home/тест'
         compat_setenv('HOME', test_str)
         self.assertEqual(compat_expanduser('~'), test_str)
         compat_setenv('HOME', old_home or '')

test/test_utils.py    View File

@@ -105,6 +105,7 @@ from youtube_dl.utils import (
     cli_valueless_option,
     cli_bool_option,
     parse_codecs,
+    urlhandle_detect_ext,
 )
 from youtube_dl.compat import (
     compat_chr,
@@ -370,6 +371,12 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540)
         self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140)
         self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363)
+        self.assertEqual(unified_timestamp('11:31 17-Jun-2021'), 1623929460)
+        self.assertEqual(unified_timestamp('11:31 17-Jun-2021-0000'), 1623929460)
+        from youtube_dl.utils import DATE_FORMATS_DAY_FIRST
+        DATE_FORMATS_DAY_FIRST.append('%H:%M %d-%m-%Y')
+        self.assertEqual(unified_timestamp('17:30 27-02-2016'), 1456594200)
+        self.assertEqual(unified_timestamp('17:30 27-02-2016-0000'), 1456594200)

     def test_determine_ext(self):
         self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4')
@@ -480,7 +487,7 @@ class TestUtil(unittest.TestCase):
         args = ['ffmpeg', '-i', encodeFilename('ñ€ß\'.mp4')]
         self.assertEqual(
             shell_quote(args),
-            """ffmpeg -i 'ñ€ß'"'"'.mp4'""" if compat_os_name != 'nt' else '''ffmpeg -i "ñ€ß'.mp4"''')
+            """ffmpeg -i 'ñ€ß'"'"'.mp4'""" if not(compat_os_name in ('nt', 'ce')) else '''ffmpeg -i "ñ€ß'.mp4"''')

     def test_float_or_none(self):
         self.assertEqual(float_or_none('42.42'), 42.42)
@@ -1085,7 +1092,7 @@ class TestUtil(unittest.TestCase):
     def test_args_to_str(self):
         self.assertEqual(
             args_to_str(['foo', 'ba/r', '-baz', '2 be', '']),
-            'foo ba/r -baz \'2 be\' \'\'' if compat_os_name != 'nt' else 'foo ba/r -baz "2 be" ""'
+            'foo ba/r -baz \'2 be\' \'\'' if not(compat_os_name in ('nt', 'ce')) else 'foo ba/r -baz "2 be" ""'
         )

     def test_parse_filesize(self):
@@ -1475,6 +1482,30 @@ Line 1
         self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3')
         self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3')
+
+    def test_urlhandle_detect_ext(self):
+        class UrlHandle(object):
+            _info = {}
+
+            def __init__(self, info):
+                self._info = info
+
+            @property
+            def headers(self):
+                return self._info
+
+        # header with non-ASCII character and contradictory Content-Type
+        urlh = UrlHandle({
+            'Content-Disposition': b'attachment; filename="Epis\xf3dio contains non-ASCI ISO 8859-1 character.mp3"',
+            'Content-Type': b'audio/aac',
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
+
+        # header with no Content-Disposition
+        urlh = UrlHandle({
+            'Content-Type': b'audio/mp3',
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')

 if __name__ == '__main__':
     unittest.main()
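
The test above exercises the Py2 failure mode fixed by 1e222005ba: header values arrive as byte strings, and a Latin-1 byte such as \xf3 breaks once it meets a unicode regular expression. A rough sketch of the problem and the work-around (illustration only, not the library code):

    import re

    cd = b'attachment; filename="Epis\xf3dio.mp3"'  # raw Content-Disposition bytes
    # Under Python 2, matching a unicode pattern against these bytes triggers an
    # implicit ASCII decode and raises UnicodeDecodeError on \xf3.  Decoding the
    # header as ISO-8859-1 first, as urlhandle_detect_ext() now does, avoids that:
    cd = cd.decode('iso-8859-1')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    assert m.group('filename') == u'Epis\xf3dio.mp3'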

youtube_dl/compat.py    View File

@@ -2700,16 +2700,19 @@ else:
     # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918)

     def compat_getenv(key, default=None):
-        from .utils import get_filesystem_encoding
         env = os.getenv(key, default)
         if env:
-            env = env.decode(get_filesystem_encoding())
+            from .utils import get_filesystem_encoding
+            encoding = get_filesystem_encoding()
+            env = env.decode(encoding)
+            if not encoding.lower().startswith('ut'):
+                env = env.encode('utf-8').decode('unicode-escape')
         return env

     def compat_setenv(key, value, env=os.environ):
         def encode(v):
             from .utils import get_filesystem_encoding
-            return v.encode(get_filesystem_encoding()) if isinstance(v, compat_str) else v
+            return v.encode(get_filesystem_encoding(), 'backslashreplace') if isinstance(v, compat_str) else v
         env[encode(key)] = encode(value)

 # HACK: The default implementations of os.path.expanduser from cpython do not decode
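
The intent of the paired changes to compat_setenv()/compat_getenv() is that a non-ASCII value survives a round trip through the environment even when the filesystem encoding cannot represent it (for example ASCII under Py2). A sketch of the escape/unescape dance, assuming an ASCII filesystem encoding (illustration only):

    value = u'/home/тест'
    stored = value.encode('ascii', 'backslashreplace')              # what compat_setenv() would store
    restored = stored.decode('ascii')
    restored = restored.encode('utf-8').decode('unicode-escape')    # what compat_getenv() now undoes
    assert restored == value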

youtube_dl/utils.py    View File

@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding: utf-8
 from __future__ import unicode_literals
@@ -1717,8 +1716,6 @@ ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙ
                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

 DATE_FORMATS = (
-    '%d %B %Y',
-    '%d %b %Y',
     '%B %d %Y',
     '%B %dst %Y',
     '%B %dnd %Y',
@@ -1763,6 +1760,11 @@ DATE_FORMATS_DAY_FIRST.extend([
     '%d/%m/%Y',
     '%d/%m/%y',
     '%d/%m/%Y %H:%M:%S',
+    '%d %B %Y',
+    '%d %b %Y',
+    '%d-%b-%Y',
+    '%H:%M %d-%b-%Y',
+    '%H:%M:%S %d-%b-%Y',
 ])

 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
@@ -1772,6 +1774,11 @@ DATE_FORMATS_MONTH_FIRST.extend([
     '%m/%d/%Y',
     '%m/%d/%y',
     '%m/%d/%Y %H:%M:%S',
+    '%B %d %Y',
+    '%b %d %Y',
+    '%b-%d-%Y',
+    '%H:%M %b-%d-%Y',
+    '%H:%M:%S %b-%d-%Y',
 ])

 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
@@ -2245,7 +2252,8 @@ def encodeFilename(s, for_subprocess=False):
     if sys.platform.startswith('java'):
         return s

-    return s.encode(get_subprocess_encoding(), 'ignore')
+    # If encoding is (eg) 'ascii', use escape sequences (allows round-trip test)
+    return s.encode(get_subprocess_encoding(), 'backslashreplace')


 def decodeFilename(b, for_subprocess=False):
@@ -2938,7 +2946,16 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):

 def extract_timezone(date_str):
     m = re.search(
-        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
+        r'''(?x)
+            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
+            (?P<tz>Z|                                            # just the UTC Z, or
+               (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                    # preceded by 4 digits or hh:mm or
+                  (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))      # not preceded by 3 alpha word or >= 4 alpha or 2 digits
+               [ ]?                                              # optional space
+               (?P<sign>\+|-)                                    # +/-
+               (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})        # hh[:]mm
+            $)
+        ''',
         date_str)
     if not m:
         timezone = datetime.timedelta()
@@ -3354,8 +3371,7 @@ class locked_file(object):

 def get_filesystem_encoding():
-    encoding = sys.getfilesystemencoding()
-    return encoding if encoding is not None else 'utf-8'
+    return sys.getfilesystemencoding() or sys.getdefaultencoding() or 'utf-8'


 def shell_quote(args):
@@ -3365,6 +3381,8 @@ def shell_quote(args):
         if isinstance(a, bytes):
             # We may get a filename encoded with 'encodeFilename'
             a = a.decode(encoding)
+            if not encoding.lower().startswith('ut'):
+                a = a.encode('utf-8').decode('unicode-escape')
         quoted_args.append(compat_shlex_quote(a))
     return ' '.join(quoted_args)
@@ -4286,7 +4304,10 @@ def parse_codecs(codecs_str):

 def urlhandle_detect_ext(url_handle):
     getheader = url_handle.headers.get

-    cd = getheader('Content-Disposition')
+    def encode_compat_str_or_none(x, encoding='iso-8859-1', errors='ignore'):
+        return encode_compat_str(x, encoding=encoding, errors=errors) if x else None
+
+    cd = encode_compat_str_or_none(getheader('Content-Disposition'))
     if cd:
         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
         if m:
@@ -4294,7 +4315,8 @@ def urlhandle_detect_ext(url_handle):
             if e:
                 return e

-    return mimetype2ext(getheader('Content-Type'))
+    ct = encode_compat_str_or_none(getheader('Content-Type'))
+    return mimetype2ext(ct)


 def encode_data_uri(data, mime_type):
@@ -4610,7 +4632,7 @@ def dfxp2srt(dfxp_data):
                 continue
             default_style.update(style)

-    for para, index in zip(paras, itertools.count(1)):
+    for index, para in enumerate(paras, 1):
         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
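
Taken together, the reworked look-behind assertions mean extract_timezone() (a module-level helper in youtube_dl.utils) accepts a trailing ±HHMM after a 4-digit year or an hh:mm time, but no longer mistakes endings such as '-2016' in '27-02-2016' for a -20:16 offset. A small sketch of the expected behaviour, inferred from the tests above rather than additional committed code:

    from datetime import timedelta
    from youtube_dl.utils import extract_timezone

    tz, rest = extract_timezone('11:31 17-Jun-2021-0000')
    assert tz == timedelta(0) and rest == '11:31 17-Jun-2021'   # '-0000' read as +00:00 and stripped

    tz, rest = extract_timezone('17:30 27-02-2016')
    assert rest == '17:30 27-02-2016'                           # '-2016' is the year, not an offset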