Shorten proposed file name on create if too long

2021-09-19 05:06:21 +01:00 · 2021-09-19 05:06:21 +01:00 · 4af599143a
commit 4af599143a
parent a803582717
4 changed files with 113 additions and 8 deletions
--- a/test/test_compat.py
+++ b/test/test_compat.py
@ -19,6 +19,7 @@ from youtube_dl.compat import (
    compat_shlex_split,
    compat_str,
    compat_struct_unpack,
+    compat_textwrap_shorten,
    compat_urllib_parse_unquote,
    compat_urllib_parse_unquote_plus,
    compat_urllib_parse_urlencode,
@ -121,6 +122,9 @@ class TestCompat(unittest.TestCase):
    def test_struct_unpack(self):
        self.assertEqual(compat_struct_unpack('!B', b'\x00'), (0,))

+    def test_compat_textwrap_shorten(self):
+        # 13-character input against a limit of 11: the expected result is
+        # the leading word followed by the default ' [...]' placeholder
+        self.assertEqual(compat_textwrap_shorten('Hello  world!', width=11), 'Hello [...]')
+

 if __name__ == '__main__':
    unittest.main()
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -63,6 +63,7 @@ from youtube_dl.utils import (
    pkcs1pad,
    read_batch_urls,
    sanitize_filename,
+    sanitize_open,
    sanitize_path,
    sanitize_url,
    expand_path,
@ -118,6 +119,16 @@ from youtube_dl.compat import (


 class TestUtil(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        """Run the class-level clean-up once before any test in this class."""
+        cls.tearDown()
+
+    @classmethod
+    def tearDown(cls):
+        """Delete every entry of the current directory whose extension is '.test'."""
+        for entry in os.listdir('.'):
+            extension = os.path.splitext(entry)[1]
+            if extension != '.test':
+                continue
+            os.remove(entry)
+
    def test_timeconvert(self):
        self.assertTrue(timeconvert('') is None)
        self.assertTrue(timeconvert('bougrg') is None)
@ -231,6 +242,21 @@ class TestUtil(unittest.TestCase):
        self.assertEqual(sanitize_path('./abc'), 'abc')
        self.assertEqual(sanitize_path('./../abc'), '..\\abc')

+    def test_sanitize_open(self):
+        """Opening an over-long name for writing yields a shortened name ending in '[...]'."""
+        verses = ['I sleep all night and I work all day %d' % n for n in range(50)]
+        long_name = " I'm a lumberjack ".join(verses)
+        shortened = "I sleep all night and I work all day 0 I'm a lumberjack I sleep all night and I work all day 1 I'm a lumberjack I sleep all night and I work all day 2 I'm a lumberjack[...].test"
+        # the two platforms differ only in the leading directory component
+        # passed in and in whether it is expected back in the final name
+        if sys.platform == 'win32':
+            requested = '.\\' + long_name + '.test'
+            expected = shortened
+        else:
+            requested = './' + long_name + '.test'
+            expected = './' + shortened
+        stream, actual = sanitize_open(requested, open_mode='w')
+        stream.close()
+        self.assertEqual(actual, expected)
+
    def test_sanitize_url(self):
        self.assertEqual(sanitize_url('//foo.bar'), 'http://foo.bar')
        self.assertEqual(sanitize_url('httpss://foo.bar'), 'https://foo.bar')
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@ -2997,6 +2997,29 @@ else:
    def compat_ctypes_WINFUNCTYPE(*args, **kwargs):
        return ctypes.WINFUNCTYPE(*args, **kwargs)

+# Compat version of textwrap.shorten(), not in Py2 textwrap
+# Extractors can use this to précis a long metadata field, eg
+# to make a title from a description
+try:
+    from textwrap import shorten as compat_textwrap_shorten
+except ImportError:  # Python 2
+    def compat_textwrap_shorten(
+            text, width, fix_sentence_endings=False, break_long_words=True,
+            break_on_hyphens=True, placeholder=' [...]'):
+        """Approximate textwrap.shorten() where textwrap lacks it
+
+        Arguments:
+        text -- the text to shorten
+        width -- maximum length of the result
+        fix_sentence_endings, break_long_words, break_on_hyphens --
+            passed through to textwrap.wrap()
+        placeholder -- appended when text had to be dropped
+
+        Returns text with whitespace runs collapsed if that fits within
+        width, otherwise its first wrapped line followed by placeholder
+
+        Raises ValueError if the stripped placeholder is longer than width
+        """
+        import textwrap
+        # textwrap.shorten() collapses every run of whitespace first
+        text = ' '.join(text.strip().split())
+        if len(placeholder.lstrip()) > width:
+            raise ValueError('placeholder too large for max width')
+        wrap_kwargs = {
+            'fix_sentence_endings': fix_sentence_endings,
+            'break_long_words': break_long_words,
+            'break_on_hyphens': break_on_hyphens,
+        }
+        lines = textwrap.wrap(text, width, **wrap_kwargs)
+        if len(lines) <= 1:
+            # already fits; wrap() gives [] for empty text
+            return lines[0] if lines else ''
+        room = width - len(placeholder)
+        if room < 1:
+            # no space for any text in front of the placeholder
+            return placeholder.lstrip()
+        return textwrap.wrap(text, room, **wrap_kwargs)[0] + placeholder
+

 __all__ = [
    'compat_HTMLParseError',
@ -3040,6 +3063,7 @@ __all__ = [
    'compat_struct_pack',
    'compat_struct_unpack',
    'compat_subprocess_get_DEVNULL',
+    'compat_textwrap_shorten',
    'compat_tokenize_tokenize',
    'compat_urllib_error',
    'compat_urllib_parse',
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -57,6 +57,7 @@ from .compat import (
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
+    compat_textwrap_shorten,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
@ -2036,6 +2037,28 @@ def clean_html(html):
    return html.strip()


+def reduce_filename(path, reduction=0.5, min_length=20, ellipsis='[...]'):
+    """Try to reduce the filename by a specified reduction factor
+
+    Arguments:
+    path -- the path name to reduce
+    reduction -- factor by which to reduce its filename component
+    min_length -- give up when the filename, less its extension and any
+        trailing ellipsis, is shorter than this
+    ellipsis -- placeholder for removed text
+
+    Returns path name with reduced filename, or None
+    """
+
+    dirname, basename = os.path.split(path)
+    stem, ext = os.path.splitext(basename)
+    # drop the marker left by an earlier reduction so that it is neither
+    # counted in the length nor duplicated in the result
+    stem = remove_end(stem, ellipsis)
+    flen = len(stem)
+    if flen < min_length:
+        # give up
+        return None
+    width = int(1 + reduction * flen)
+    reduced = compat_textwrap_shorten(stem, width, placeholder=ellipsis)
+    if reduced.strip() in ('', ellipsis.strip()):
+        # textwrap.shorten() only drops whole words, so a name with no
+        # usable whitespace (eg, many CJK titles) collapses to the bare
+        # placeholder: cut by characters instead to keep part of the name
+        keep = max(width - len(ellipsis), 1)
+        reduced = stem.strip()[:keep].rstrip() + ellipsis
+    return os.path.join(dirname, reduced + ext)
+
+
 def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

@ -2046,26 +2069,54 @@ def sanitize_open(filename, open_mode):

    It returns the tuple (stream, definitive_file_name).
    """
+    def openfile(filename, open_mode):
+        stream = open(encodeFilename(filename), open_mode)
+        return (stream, filename)
+
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
-        stream = open(encodeFilename(filename), open_mode)
-        return (stream, filename)
+        return openfile(filename, open_mode)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

-        # In case of error, try to remove win32 forbidden chars
-        alt_filename = sanitize_path(filename)
-        if alt_filename == filename:
+        if 'w' not in open_mode or '+' in open_mode:
+            # only mung filename when creating the file
            raise
+
+        org_err = err
+
+        # In case of error, try to remove win32 forbidden chars
+        if err.errno in (errno.EINVAL, ):
+            alt_filename = sanitize_path(filename)
+            if alt_filename != filename:
+                try:
+                    return openfile(alt_filename, open_mode)
+                except (IOError, OSError) as new_err:
+                    err = new_err
        else:
-            # An exception here should be caught in the caller
-            stream = open(encodeFilename(alt_filename), open_mode)
-            return (stream, alt_filename)
+            alt_filename = filename
+
+        # Windows: an over-long file name can be detected by the CreateFile()
+        # API, and then get EINVAL, or by the filesystem, and then perhaps
+        # ENAMETOOLONG
+        # POSIX: ENAMETOOLONG in general
+        while err.errno in (errno.ENAMETOOLONG, errno.EINVAL, ):
+            alt_filename = reduce_filename(alt_filename)
+            if not alt_filename:
+                break
+            try:
+                return openfile(alt_filename, open_mode)
+            except (IOError, OSError) as new_err:
+                err = new_err
+
+        # Reduction didn't help; give up and report what initially went wrong
+        # This exception should be caught in the caller
+        raise org_err


 def timeconvert(timestr):