posix: use getutf8char to handle OS X filename percent-escaping
author Matt Mackall <mpm@selenic.com>
Thu, 05 Nov 2015 17:09:00 -0600
changeset 28071 b8381832ce2b4e791ba7bb6d0822e82203475d8e
parent 28070 cf47bdb2183caaaa5daa2acb0d7ab7e88c12b9f9
child 28072 cb467a9d759321b114a388005491ba9667642025
push id 52
push user gszorc@mozilla.com
push date Tue, 10 Nov 2015 20:57:45 +0000
posix: use getutf8char to handle OS X filename percent-escaping
This replaces an open-coded utf-8 parser that was ignoring subtle issues like overlong encodings.
mercurial/posix.py
--- a/mercurial/posix.py
+++ b/mercurial/posix.py
@@ -250,50 +250,27 @@ if sys.platform == 'darwin':
     normcasespec = encoding.normcasespecs.lower
 
     def normcasefallback(path):
         try:
             u = path.decode('utf-8')
         except UnicodeDecodeError:
             # OS X percent-encodes any bytes that aren't valid utf-8
             s = ''
-            g = ''
-            l = 0
-            for c in path:
-                o = ord(c)
-                if l and o < 128 or o >= 192:
-                    # we want a continuation byte, but didn't get one
-                    s += ''.join(["%%%02X" % ord(x) for x in g])
-                    g = ''
-                    l = 0
-                if l == 0 and o < 128:
-                    # ascii
-                    s += c
-                elif l == 0 and 194 <= o < 245:
-                    # valid leading bytes
-                    if o < 224:
-                        l = 1
-                    elif o < 240:
-                        l = 2
-                    else:
-                        l = 3
-                    g = c
-                elif l > 0 and 128 <= o < 192:
-                    # valid continuations
-                    g += c
-                    l -= 1
-                    if not l:
-                        s += g
-                        g = ''
-                else:
-                    # invalid
-                    s += "%%%02X" % o
+            pos = 0
+            l = len(path)  # scan the input; s is still empty here
+            while pos < l:
+                try:
+                    c = encoding.getutf8char(path, pos)
+                    pos += len(c)
+                except ValueError:
+                    c = '%%%02X' % ord(path[pos])  # escape one bad byte
+                    pos += 1
+                s += c
 
-            # any remaining partial characters
-            s += ''.join(["%%%02X" % ord(x) for x in g])
             u = s.decode('utf-8')
 
         # Decompose then lowercase (HFS+ technote specifies lower)
         enc = unicodedata.normalize('NFD', u).lower().encode('utf-8')
         # drop HFS+ ignored characters
         return encoding.hfsignoreclean(enc)
 
 if sys.platform == 'cygwin':