Disambiguate 4-digit year and time-zone suffix

Restore the check omitted from extract_timezone(); adjust DATE_FORMATS_DAY_FIRST and DATE_FORMATS_MONTH_FIRST; add tests.
This commit is contained in:
df 2021-09-13 01:00:04 +01:00
parent 1e222005ba
commit f798b40cf3
2 changed files with 26 additions and 4 deletions

View File

@ -371,6 +371,12 @@ class TestUtil(unittest.TestCase):
self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540)
self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140)
self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363)
self.assertEqual(unified_timestamp('11:31 17-Jun-2021'), 1623929460)
self.assertEqual(unified_timestamp('11:31 17-Jun-2021-0000'), 1623929460)
from youtube_dl.utils import DATE_FORMATS_DAY_FIRST
DATE_FORMATS_DAY_FIRST.append('%H:%M %d-%m-%Y')
self.assertEqual(unified_timestamp('17:30 27-02-2016'), 1456594200)
self.assertEqual(unified_timestamp('17:30 27-02-2016-0000'), 1456594200)
def test_determine_ext(self):
self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4')

View File

@ -1,4 +1,3 @@
#!/usr/bin/env python
# coding: utf-8
from __future__ import unicode_literals
@ -1717,8 +1716,6 @@ ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙ
'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
DATE_FORMATS = (
'%d %B %Y',
'%d %b %Y',
'%B %d %Y',
'%B %dst %Y',
'%B %dnd %Y',
@ -1763,6 +1760,11 @@ DATE_FORMATS_DAY_FIRST.extend([
'%d/%m/%Y',
'%d/%m/%y',
'%d/%m/%Y %H:%M:%S',
'%d %B %Y',
'%d %b %Y',
'%d-%b-%Y',
'%H:%M %d-%b-%Y',
'%H:%M:%S %d-%b-%Y',
])
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
@ -1772,6 +1774,11 @@ DATE_FORMATS_MONTH_FIRST.extend([
'%m/%d/%Y',
'%m/%d/%y',
'%m/%d/%Y %H:%M:%S',
'%B %d %Y',
'%b %d %Y',
'%b-%d-%Y',
'%H:%M %b-%d-%Y',
'%H:%M:%S %b-%d-%Y',
])
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
@ -2939,7 +2946,16 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
def extract_timezone(date_str):
m = re.search(
r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
r'''(?x)
^.{8,}? # >=8 char non-TZ prefix, if present
(?P<tz>Z| # just the UTC Z, or
(?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
(?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
[ ]? # optional space
(?P<sign>\+|-) # +/-
(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
$)
''',
date_str)
if not m:
timezone = datetime.timedelta()