Compare commits

...

3 Commits

Author SHA1 Message Date
df f798b40cf3 Disambiguate 4-digit year and time-zone suffix 2021-09-13 01:15:48 +01:00
   Restore check omitted from extract_timezone(); adjust DATE_FORMATS_DAY/MONTH_FIRST; add tests.
df 1e222005ba Fix urlhandle_detect_ext() non-ASCII error in Py2, with test 2021-08-29 06:27:54 +01:00
df 197215782b Small fixes to utils and compat and test 2021-08-23 18:02:26 +01:00
4 changed files with 76 additions and 16 deletions
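
For orientation, the first commit deals with date strings such as '11:31 17-Jun-2021-0000', where the trailing '-0000' has to be read as a UTC offset rather than as part of the year. A minimal sketch of the intended behaviour, mirroring the assertions added to test_utils.py below (illustration only, not part of the change):

    from youtube_dl.utils import unified_timestamp

    # '2021' is the year; a following '-0000' is an explicit +00:00 offset
    assert unified_timestamp('11:31 17-Jun-2021') == 1623929460
    assert unified_timestamp('11:31 17-Jun-2021-0000') == 1623929460  # same instant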

test/test_compat.py    View File

@@ -39,8 +39,12 @@ class TestCompat(unittest.TestCase):
         self.assertEqual(compat_getenv(test_var), test_str)

     def test_compat_expanduser(self):
+        from youtube_dl.compat import compat_os_name
         old_home = os.environ.get('HOME')
-        test_str = r'C:\Documents and Settings\тест\Application Data'
+        if compat_os_name in ('nt', 'ce'):
+            test_str = r'C:\Documents and Settings\тест\Application Data'
+        else:
+            test_str = '/home/тест'
         compat_setenv('HOME', test_str)
         self.assertEqual(compat_expanduser('~'), test_str)
         compat_setenv('HOME', old_home or '')

test/test_utils.py    View File

@@ -105,6 +105,7 @@ from youtube_dl.utils import (
     cli_valueless_option,
     cli_bool_option,
     parse_codecs,
+    urlhandle_detect_ext,
 )
 from youtube_dl.compat import (
     compat_chr,
@@ -370,6 +371,12 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540)
         self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140)
         self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363)
+        self.assertEqual(unified_timestamp('11:31 17-Jun-2021'), 1623929460)
+        self.assertEqual(unified_timestamp('11:31 17-Jun-2021-0000'), 1623929460)
+        from youtube_dl.utils import DATE_FORMATS_DAY_FIRST
+        DATE_FORMATS_DAY_FIRST.append('%H:%M %d-%m-%Y')
+        self.assertEqual(unified_timestamp('17:30 27-02-2016'), 1456594200)
+        self.assertEqual(unified_timestamp('17:30 27-02-2016-0000'), 1456594200)

     def test_determine_ext(self):
         self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4')
@@ -480,7 +487,7 @@ class TestUtil(unittest.TestCase):
         args = ['ffmpeg', '-i', encodeFilename('ñ€ß\'.mp4')]
         self.assertEqual(
             shell_quote(args),
-            """ffmpeg -i 'ñ€ß'"'"'.mp4'""" if compat_os_name != 'nt' else '''ffmpeg -i "ñ€ß'.mp4"''')
+            """ffmpeg -i 'ñ€ß'"'"'.mp4'""" if not(compat_os_name in ('nt', 'ce')) else '''ffmpeg -i "ñ€ß'.mp4"''')

     def test_float_or_none(self):
         self.assertEqual(float_or_none('42.42'), 42.42)
@@ -1085,7 +1092,7 @@ class TestUtil(unittest.TestCase):
     def test_args_to_str(self):
         self.assertEqual(
             args_to_str(['foo', 'ba/r', '-baz', '2 be', '']),
-            'foo ba/r -baz \'2 be\' \'\'' if compat_os_name != 'nt' else 'foo ba/r -baz "2 be" ""'
+            'foo ba/r -baz \'2 be\' \'\'' if not(compat_os_name in ('nt', 'ce')) else 'foo ba/r -baz "2 be" ""'
         )

     def test_parse_filesize(self):
@@ -1475,6 +1482,30 @@ Line 1
         self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3')
         self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3')
+
+    def test_urlhandle_detect_ext(self):
+        class UrlHandle(object):
+            _info = {}
+
+            def __init__(self, info):
+                self._info = info
+
+            @property
+            def headers(self):
+                return self._info
+
+        # header with non-ASCII character and contradictory Content-Type
+        urlh = UrlHandle({
+            'Content-Disposition': b'attachment; filename="Epis\xf3dio contains non-ASCI ISO 8859-1 character.mp3"',
+            'Content-Type': b'audio/aac',
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
+
+        # header with no Content-Disposition
+        urlh = UrlHandle({
+            'Content-Type': b'audio/mp3',
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')

 if __name__ == '__main__':
     unittest.main()
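
The test above exercises the Py2 failure mode fixed by 1e222005ba: header values arrive as byte strings, and a Latin-1 byte such as \xf3 breaks once it meets a unicode regular expression. A rough sketch of the problem and the work-around (illustration only, not the library code):

    import re

    cd = b'attachment; filename="Epis\xf3dio.mp3"'  # raw Content-Disposition bytes
    # Under Python 2, matching a unicode pattern against these bytes triggers an
    # implicit ASCII decode and raises UnicodeDecodeError on \xf3.  Decoding the
    # header as ISO-8859-1 first, as urlhandle_detect_ext() now does, avoids that:
    cd = cd.decode('iso-8859-1')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    assert m.group('filename') == u'Epis\xf3dio.mp3'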

youtube_dl/compat.py    View File

@@ -2700,16 +2700,19 @@ else:
     # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918)

     def compat_getenv(key, default=None):
-        from .utils import get_filesystem_encoding
         env = os.getenv(key, default)
         if env:
-            env = env.decode(get_filesystem_encoding())
+            from .utils import get_filesystem_encoding
+            encoding = get_filesystem_encoding()
+            env = env.decode(encoding)
+            if not encoding.lower().startswith('ut'):
+                env = env.encode('utf-8').decode('unicode-escape')
         return env

     def compat_setenv(key, value, env=os.environ):
         def encode(v):
             from .utils import get_filesystem_encoding
-            return v.encode(get_filesystem_encoding()) if isinstance(v, compat_str) else v
+            return v.encode(get_filesystem_encoding(), 'backslashreplace') if isinstance(v, compat_str) else v
         env[encode(key)] = encode(value)

 # HACK: The default implementations of os.path.expanduser from cpython do not decode
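
The intent of the paired changes to compat_setenv()/compat_getenv() is that a non-ASCII value survives a round trip through the environment even when the filesystem encoding cannot represent it (for example ASCII under Py2). A sketch of the escape/unescape dance, assuming an ASCII filesystem encoding (illustration only):

    value = u'/home/тест'
    stored = value.encode('ascii', 'backslashreplace')              # what compat_setenv() would store
    restored = stored.decode('ascii')
    restored = restored.encode('utf-8').decode('unicode-escape')    # what compat_getenv() now undoes
    assert restored == value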

youtube_dl/utils.py    View File

@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding: utf-8
 from __future__ import unicode_literals
@@ -1717,8 +1716,6 @@ ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙ
                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

 DATE_FORMATS = (
-    '%d %B %Y',
-    '%d %b %Y',
     '%B %d %Y',
     '%B %dst %Y',
     '%B %dnd %Y',
@@ -1763,6 +1760,11 @@ DATE_FORMATS_DAY_FIRST.extend([
     '%d/%m/%Y',
     '%d/%m/%y',
     '%d/%m/%Y %H:%M:%S',
+    '%d %B %Y',
+    '%d %b %Y',
+    '%d-%b-%Y',
+    '%H:%M %d-%b-%Y',
+    '%H:%M:%S %d-%b-%Y',
 ])

 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
@@ -1772,6 +1774,11 @@ DATE_FORMATS_MONTH_FIRST.extend([
     '%m/%d/%Y',
     '%m/%d/%y',
     '%m/%d/%Y %H:%M:%S',
+    '%B %d %Y',
+    '%b %d %Y',
+    '%b-%d-%Y',
+    '%H:%M %b-%d-%Y',
+    '%H:%M:%S %b-%d-%Y',
 ])

 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
@@ -2245,7 +2252,8 @@ def encodeFilename(s, for_subprocess=False):
     if sys.platform.startswith('java'):
         return s

-    return s.encode(get_subprocess_encoding(), 'ignore')
+    # If encoding is (eg) 'ascii', use escape sequences (allows round-trip test)
+    return s.encode(get_subprocess_encoding(), 'backslashreplace')


 def decodeFilename(b, for_subprocess=False):
@@ -2938,7 +2946,16 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):

 def extract_timezone(date_str):
     m = re.search(
-        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
+        r'''(?x)
+            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
+            (?P<tz>Z|                                            # just the UTC Z, or
+               (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                    # preceded by 4 digits or hh:mm or
+                  (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))      # not preceded by 3 alpha word or >= 4 alpha or 2 digits
+               [ ]?                                              # optional space
+               (?P<sign>\+|-)                                    # +/-
+               (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})        # hh[:]mm
+            $)
+        ''',
         date_str)
     if not m:
         timezone = datetime.timedelta()
@@ -3354,8 +3371,7 @@ class locked_file(object):

 def get_filesystem_encoding():
-    encoding = sys.getfilesystemencoding()
-    return encoding if encoding is not None else 'utf-8'
+    return sys.getfilesystemencoding() or sys.getdefaultencoding() or 'utf-8'


 def shell_quote(args):
@@ -3365,6 +3381,8 @@ def shell_quote(args):
         if isinstance(a, bytes):
             # We may get a filename encoded with 'encodeFilename'
             a = a.decode(encoding)
+            if not encoding.lower().startswith('ut'):
+                a = a.encode('utf-8').decode('unicode-escape')
         quoted_args.append(compat_shlex_quote(a))
     return ' '.join(quoted_args)
@@ -4286,7 +4304,10 @@ def parse_codecs(codecs_str):

 def urlhandle_detect_ext(url_handle):
     getheader = url_handle.headers.get

-    cd = getheader('Content-Disposition')
+    def encode_compat_str_or_none(x, encoding='iso-8859-1', errors='ignore'):
+        return encode_compat_str(x, encoding=encoding, errors=errors) if x else None
+
+    cd = encode_compat_str_or_none(getheader('Content-Disposition'))
     if cd:
         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
         if m:
@@ -4294,7 +4315,8 @@ def urlhandle_detect_ext(url_handle):
             if e:
                 return e

-    return mimetype2ext(getheader('Content-Type'))
+    ct = encode_compat_str_or_none(getheader('Content-Type'))
+    return mimetype2ext(ct)


 def encode_data_uri(data, mime_type):
@@ -4610,7 +4632,7 @@ def dfxp2srt(dfxp_data):
                 continue
             default_style.update(style)

-    for para, index in zip(paras, itertools.count(1)):
+    for index, para in enumerate(paras, 1):
         begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
         end_time = parse_dfxp_time_expr(para.attrib.get('end'))
         dur = parse_dfxp_time_expr(para.attrib.get('dur'))
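
Taken together, the reworked look-behind assertions mean extract_timezone() (a module-level helper in youtube_dl.utils) accepts a trailing ±HHMM after a 4-digit year or an hh:mm time, but no longer mistakes endings such as '-2016' in '27-02-2016' for a -20:16 offset. A small sketch of the expected behaviour, inferred from the tests above rather than additional committed code:

    from datetime import timedelta
    from youtube_dl.utils import extract_timezone

    tz, rest = extract_timezone('11:31 17-Jun-2021-0000')
    assert tz == timedelta(0) and rest == '11:31 17-Jun-2021'   # '-0000' read as +00:00 and stripped

    tz, rest = extract_timezone('17:30 27-02-2016')
    assert rest == '17:30 27-02-2016'                           # '-2016' is the year, not an offset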