Bug 368516: UTF-8 encoded scripts that contain a BOM result in an "illegal character" error, r=mrbkap, r=brendan, a=schrep
author crowder@fiverocks.com
Wed, 05 Dec 2007 21:09:38 -0800
changeset 8786 b715fa5f6f3ef53c048cc3bf376bbeb1cb2b433b
parent 8785 ca7d556a4ed1a0ade1560c733d5c29459142e399
child 8787 3610b4419c52c0c28214c29f0e0f08f9aa3f7e72
push id unknown
push user unknown
push date unknown
reviewers mrbkap, brendan, schrep
bugs 368516
milestone 1.9b2pre
Bug 368516: UTF-8 encoded scripts that contain a BOM result in an "illegal character" error, r=mrbkap, r=brendan, a=schrep
js/src/jsscan.c
--- a/js/src/jsscan.c
+++ b/js/src/jsscan.c
@@ -293,154 +293,160 @@ GetChar(JSTokenStream *ts)
     ptrdiff_t i, j, len, olen;
     JSBool crflag;
     char cbuf[JS_LINE_LIMIT];
     jschar *ubuf, *nl;
 
     if (ts->ungetpos != 0) {
         c = ts->ungetbuf[--ts->ungetpos];
     } else {
-        if (ts->linebuf.ptr == ts->linebuf.limit) {
-            len = PTRDIFF(ts->userbuf.limit, ts->userbuf.ptr, jschar);
-            if (len <= 0) {
-                if (!ts->file) {
-                    ts->flags |= TSF_EOF;
-                    return EOF;
-                }
-
-                /* Fill ts->userbuf so that \r and \r\n convert to \n. */
-                crflag = (ts->flags & TSF_CRFLAG) != 0;
-                len = js_fgets(cbuf, JS_LINE_LIMIT - crflag, ts->file);
+        do {
+            if (ts->linebuf.ptr == ts->linebuf.limit) {
+                len = PTRDIFF(ts->userbuf.limit, ts->userbuf.ptr, jschar);
                 if (len <= 0) {
-                    ts->flags |= TSF_EOF;
-                    return EOF;
-                }
-                olen = len;
-                ubuf = ts->userbuf.base;
-                i = 0;
-                if (crflag) {
-                    ts->flags &= ~TSF_CRFLAG;
-                    if (cbuf[0] != '\n') {
-                        ubuf[i++] = '\n';
-                        len++;
-                        ts->linepos--;
+                    if (!ts->file) {
+                        ts->flags |= TSF_EOF;
+                        return EOF;
+                    }
+            
+                    /* Fill ts->userbuf so that \r and \r\n convert to \n. */
+                    crflag = (ts->flags & TSF_CRFLAG) != 0;
+                    len = js_fgets(cbuf, JS_LINE_LIMIT - crflag, ts->file);
+                    if (len <= 0) {
+                        ts->flags |= TSF_EOF;
+                        return EOF;
                     }
+                    olen = len;
+                    ubuf = ts->userbuf.base;
+                    i = 0;
+                    if (crflag) {
+                        ts->flags &= ~TSF_CRFLAG;
+                        if (cbuf[0] != '\n') {
+                            ubuf[i++] = '\n';
+                            len++;
+                            ts->linepos--;
+                        }
+                    }
+                    for (j = 0; i < len; i++, j++)
+                        ubuf[i] = (jschar) (unsigned char) cbuf[j];
+                    ts->userbuf.limit = ubuf + len;
+                    ts->userbuf.ptr = ubuf;
                 }
-                for (j = 0; i < len; i++, j++)
-                    ubuf[i] = (jschar) (unsigned char) cbuf[j];
-                ts->userbuf.limit = ubuf + len;
-                ts->userbuf.ptr = ubuf;
-            }
-            if (ts->listener) {
-                ts->listener(ts->filename, ts->lineno, ts->userbuf.ptr, len,
-                             &ts->listenerTSData, ts->listenerData);
-            }
-
-            nl = ts->saveEOL;
-            if (!nl) {
-                /*
-                 * Any one of \n, \r, or \r\n ends a line (the longest
-                 * match wins).  Also allow the Unicode line and paragraph
-                 * separators.
-                 */
-                for (nl = ts->userbuf.ptr; nl < ts->userbuf.limit; nl++) {
+                if (ts->listener) {
+                    ts->listener(ts->filename, ts->lineno, ts->userbuf.ptr, len,
+                                 &ts->listenerTSData, ts->listenerData);
+                }
+            
+                nl = ts->saveEOL;
+                if (!nl) {
                     /*
-                     * Try to prevent value-testing on most characters by
-                     * filtering out characters that aren't 000x or 202x.
+                     * Any one of \n, \r, or \r\n ends a line (the longest
+                     * match wins).  Also allow the Unicode line and paragraph
+                     * separators.
                      */
-                    if ((*nl & 0xDFD0) == 0) {
-                        if (*nl == '\n')
-                            break;
-                        if (*nl == '\r') {
-                            if (nl + 1 < ts->userbuf.limit && nl[1] == '\n')
-                                nl++;
-                            break;
+                    for (nl = ts->userbuf.ptr; nl < ts->userbuf.limit; nl++) {
+                        /*
+                         * Try to prevent value-testing on most characters by
+                         * filtering out characters that aren't 000x or 202x.
+                         */
+                        if ((*nl & 0xDFD0) == 0) {
+                            if (*nl == '\n')
+                                break;
+                            if (*nl == '\r') {
+                                if (nl + 1 < ts->userbuf.limit && nl[1] == '\n')
+                                    nl++;
+                                break;
+                            }
+                            if (*nl == LINE_SEPARATOR || *nl == PARA_SEPARATOR)
+                                break;
                         }
-                        if (*nl == LINE_SEPARATOR || *nl == PARA_SEPARATOR)
-                            break;
                     }
                 }
-            }
-
-            /*
-             * If there was a line terminator, copy thru it into linebuf.
-             * Else copy JS_LINE_LIMIT-1 bytes into linebuf.
-             */
-            if (nl < ts->userbuf.limit)
-                len = PTRDIFF(nl, ts->userbuf.ptr, jschar) + 1;
-            if (len >= JS_LINE_LIMIT) {
-                len = JS_LINE_LIMIT - 1;
-                ts->saveEOL = nl;
-            } else {
-                ts->saveEOL = NULL;
-            }
-            js_strncpy(ts->linebuf.base, ts->userbuf.ptr, len);
-            ts->userbuf.ptr += len;
-            olen = len;
-
-            /*
-             * Make sure linebuf contains \n for EOL (don't do this in
-             * userbuf because the user's string might be readonly).
-             */
-            if (nl < ts->userbuf.limit) {
-                if (*nl == '\r') {
-                    if (ts->linebuf.base[len-1] == '\r') {
-                        /*
-                         * Does the line segment end in \r?  We must check
-                         * for a \n at the front of the next segment before
-                         * storing a \n into linebuf.  This case matters
-                         * only when we're reading from a file.
-                         */
-                        if (nl + 1 == ts->userbuf.limit && ts->file) {
+            
+                /*
+                 * If there was a line terminator, copy thru it into linebuf.
+                 * Else copy JS_LINE_LIMIT-1 bytes into linebuf.
+                 */
+                if (nl < ts->userbuf.limit)
+                    len = PTRDIFF(nl, ts->userbuf.ptr, jschar) + 1;
+                if (len >= JS_LINE_LIMIT) {
+                    len = JS_LINE_LIMIT - 1;
+                    ts->saveEOL = nl;
+                } else {
+                    ts->saveEOL = NULL;
+                }
+                js_strncpy(ts->linebuf.base, ts->userbuf.ptr, len);
+                ts->userbuf.ptr += len;
+                olen = len;
+            
+                /*
+                 * Make sure linebuf contains \n for EOL (don't do this in
+                 * userbuf because the user's string might be readonly).
+                 */
+                if (nl < ts->userbuf.limit) {
+                    if (*nl == '\r') {
+                        if (ts->linebuf.base[len-1] == '\r') {
+                            /*
+                             * Does the line segment end in \r?  We must check
+                             * for a \n at the front of the next segment before
+                             * storing a \n into linebuf.  This case matters
+                             * only when we're reading from a file.
+                             */
+                            if (nl + 1 == ts->userbuf.limit && ts->file) {
+                                len--;
+                                ts->flags |= TSF_CRFLAG; /* clear NLFLAG? */
+                                if (len == 0) {
+                                    /*
+                                     * This can happen when a segment ends in
+                                     * \r\r.  Start over.  ptr == limit in this
+                                     * case, so we'll fall into buffer-filling
+                                     * code.
+                                     */
+                                    return GetChar(ts);
+                                }
+                            } else {
+                                ts->linebuf.base[len-1] = '\n';
+                            }
+                        }
+                    } else if (*nl == '\n') {
+                        if (nl > ts->userbuf.base &&
+                            nl[-1] == '\r' &&
+                            ts->linebuf.base[len-2] == '\r') {
                             len--;
-                            ts->flags |= TSF_CRFLAG; /* clear NLFLAG? */
-                            if (len == 0) {
-                                /*
-                                 * This can happen when a segment ends in
-                                 * \r\r.  Start over.  ptr == limit in this
-                                 * case, so we'll fall into buffer-filling
-                                 * code.
-                                 */
-                                return GetChar(ts);
-                            }
-                        } else {
+                            JS_ASSERT(ts->linebuf.base[len] == '\n');
                             ts->linebuf.base[len-1] = '\n';
                         }
-                    }
-                } else if (*nl == '\n') {
-                    if (nl > ts->userbuf.base &&
-                        nl[-1] == '\r' &&
-                        ts->linebuf.base[len-2] == '\r') {
-                        len--;
-                        JS_ASSERT(ts->linebuf.base[len] == '\n');
+                    } else if (*nl == LINE_SEPARATOR || *nl == PARA_SEPARATOR) {
                         ts->linebuf.base[len-1] = '\n';
                     }
-                } else if (*nl == LINE_SEPARATOR || *nl == PARA_SEPARATOR) {
-                    ts->linebuf.base[len-1] = '\n';
                 }
+            
+                /* Reset linebuf based on adjusted segment length. */
+                ts->linebuf.limit = ts->linebuf.base + len;
+                ts->linebuf.ptr = ts->linebuf.base;
+            
+                /* Update position of linebuf within physical userbuf line. */
+                if (!(ts->flags & TSF_NLFLAG))
+                    ts->linepos += ts->linelen;
+                else
+                    ts->linepos = 0;
+                if (ts->linebuf.limit[-1] == '\n')
+                    ts->flags |= TSF_NLFLAG;
+                else
+                    ts->flags &= ~TSF_NLFLAG;
+            
+                /* Update linelen from original segment length. */
+                ts->linelen = olen;
             }
-
-            /* Reset linebuf based on adjusted segment length. */
-            ts->linebuf.limit = ts->linebuf.base + len;
-            ts->linebuf.ptr = ts->linebuf.base;
-
-            /* Update position of linebuf within physical userbuf line. */
-            if (!(ts->flags & TSF_NLFLAG))
-                ts->linepos += ts->linelen;
-            else
-                ts->linepos = 0;
-            if (ts->linebuf.limit[-1] == '\n')
-                ts->flags |= TSF_NLFLAG;
-            else
-                ts->flags &= ~TSF_NLFLAG;
-
-            /* Update linelen from original segment length. */
-            ts->linelen = olen;
-        }
-        c = *ts->linebuf.ptr++;
+            c = *ts->linebuf.ptr++;
+        /*
+         * In the hopes of being liberal in what we accept, we toss out little-
+         * and big-endian byte order markers here, see bug 368516.
+         */
+        } while (c == 0xfffe || c == 0xfeff);
     }
     if (c == '\n')
         ts->lineno++;
     return c;
 }
 
 static void
 UngetChar(JSTokenStream *ts, int32 c)