Bug 1176668 - Fix overflow avoidance in numeric character reference handling. r=wchen.
authorHenri Sivonen <hsivonen@hsivonen.fi>
Tue, 25 Aug 2015 18:05:45 +0300
changeset 259264 bb56d50195c4835545282c518ce7571807ee5883
parent 259263 7b3c5dd7ad30fde4bf6dad4a850ebf90a6f2511e
child 259265 c5c201250f2a2d9774b802b6d7345b31a74fa96a
push id64177
push userhsivonen@mozilla.com
push dateTue, 25 Aug 2015 15:06:52 +0000
treeherdermozilla-inbound@470bee720c97 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewerswchen
bugs1176668
milestone43.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1176668 - Fix overflow avoidance in numeric character reference handling. r=wchen.
parser/html/javasrc/Tokenizer.java
parser/html/nsHtml5Tokenizer.cpp
parser/html/nsHtml5Tokenizer.h
parser/htmlparser/tests/mochitest/html5lib_tree_construction/entities01.dat
--- a/parser/html/javasrc/Tokenizer.java
+++ b/parser/html/javasrc/Tokenizer.java
@@ -352,18 +352,16 @@ public class Tokenizer implements Locato
     private int lo;
 
     private int hi;
 
     private int candidate;
 
     private int charRefBufMark;
 
-    private int prevValue;
-
     protected int value;
 
     private boolean seenDigits;
 
     protected int cstart;
 
     /**
      * The SAX public id for the resource being tokenized. (Only passed to back
@@ -3212,17 +3210,16 @@ public class Tokenizer implements Locato
                          */
                     }
                     // XXX reorder point
                 case CONSUME_NCR:
                     if (++pos == endPos) {
                         break stateloop;
                     }
                     c = checkChar(buf, pos);
-                    prevValue = -1;
                     value = 0;
                     seenDigits = false;
                     /*
                      * The behavior further depends on the character after the
                      * U+0023 NUMBER SIGN:
                      */
                     switch (c) {
                         case 'x':
@@ -3264,31 +3261,28 @@ public class Tokenizer implements Locato
                         if (reconsume) {
                             reconsume = false;
                         } else {
                             if (++pos == endPos) {
                                 break stateloop;
                             }
                             c = checkChar(buf, pos);
                         }
-                        // Deal with overflow gracefully
-                        if (value < prevValue) {
-                            value = 0x110000; // Value above Unicode range but
-                            // within int
-                            // range
-                        }
-                        prevValue = value;
                         /*
                          * Consume as many characters as match the range of
                          * characters given above.
                          */
+                        assert value >= 0: "value must not become negative.";
                         if (c >= '0' && c <= '9') {
                             seenDigits = true;
-                            value *= 10;
-                            value += c - '0';
+                            // Avoid overflow
+                            if (value <= 0x10FFFF) {
+                                value *= 10;
+                                value += c - '0';
+                            }
                             continue;
                         } else if (c == ';') {
                             if (seenDigits) {
                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
                                     cstart = pos + 1;
                                 }
                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
                                 // FALL THROUGH continue stateloop;
@@ -3345,41 +3339,44 @@ public class Tokenizer implements Locato
                     continue stateloop;
                     // XXX reorder point
                 case HEX_NCR_LOOP:
                     for (;;) {
                         if (++pos == endPos) {
                             break stateloop;
                         }
                         c = checkChar(buf, pos);
-                        // Deal with overflow gracefully
-                        if (value < prevValue) {
-                            value = 0x110000; // Value above Unicode range but
-                            // within int
-                            // range
-                        }
-                        prevValue = value;
                         /*
                          * Consume as many characters as match the range of
                          * characters given above.
                          */
+                        assert value >= 0: "value must not become negative.";
                         if (c >= '0' && c <= '9') {
                             seenDigits = true;
-                            value *= 16;
-                            value += c - '0';
+                            // Avoid overflow
+                            if (value <= 0x10FFFF) {
+                                value *= 16;
+                                value += c - '0';
+                            }
                             continue;
                         } else if (c >= 'A' && c <= 'F') {
                             seenDigits = true;
-                            value *= 16;
-                            value += c - 'A' + 10;
+                            // Avoid overflow
+                            if (value <= 0x10FFFF) {
+                                value *= 16;
+                                value += c - 'A' + 10;
+                            }
                             continue;
                         } else if (c >= 'a' && c <= 'f') {
                             seenDigits = true;
-                            value *= 16;
-                            value += c - 'a' + 10;
+                            // Avoid overflow
+                            if (value <= 0x10FFFF) {
+                                value *= 16;
+                                value += c - 'a' + 10;
+                            }
                             continue;
                         } else if (c == ';') {
                             if (seenDigits) {
                                 if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
                                     cstart = pos + 1;
                                 }
                                 state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos);
                                 continue stateloop;
@@ -6608,17 +6605,16 @@ public class Tokenizer implements Locato
         forceQuirks = false;
         additional = '\u0000';
         entCol = -1;
         firstCharKey = -1;
         lo = 0;
         hi = 0; // will always be overwritten before use anyway
         candidate = -1;
         charRefBufMark = 0;
-        prevValue = -1;
         value = 0;
         seenDigits = false;
         endTag = false;
         shouldSuspend = false;
         initDoctypeFields();
         if (tagName != null) {
             tagName.release();
             tagName = null;
@@ -6658,17 +6654,16 @@ public class Tokenizer implements Locato
         forceQuirks = other.forceQuirks;
         additional = other.additional;
         entCol = other.entCol;
         firstCharKey = other.firstCharKey;
         lo = other.lo;
         hi = other.hi;
         candidate = other.candidate;
         charRefBufMark = other.charRefBufMark;
-        prevValue = other.prevValue;
         value = other.value;
         seenDigits = other.seenDigits;
         endTag = other.endTag;
         shouldSuspend = false;
 
         if (other.doctypeName == null) {
             doctypeName = null;
         } else {
--- a/parser/html/nsHtml5Tokenizer.cpp
+++ b/parser/html/nsHtml5Tokenizer.cpp
@@ -1653,17 +1653,16 @@ nsHtml5Tokenizer::stateLoop(int32_t stat
           NS_HTML5_CONTINUE(stateloop);
         }
       }
       case NS_HTML5TOKENIZER_CONSUME_NCR: {
         if (++pos == endPos) {
           NS_HTML5_BREAK(stateloop);
         }
         c = checkChar(buf, pos);
-        prevValue = -1;
         value = 0;
         seenDigits = false;
         switch(c) {
           case 'x':
           case 'X': {
             appendCharRefBuf(c);
             state = P::transition(mViewSource, NS_HTML5TOKENIZER_HEX_NCR_LOOP, reconsume, pos);
             NS_HTML5_CONTINUE(stateloop);
@@ -1679,24 +1678,23 @@ nsHtml5Tokenizer::stateLoop(int32_t stat
           if (reconsume) {
             reconsume = false;
           } else {
             if (++pos == endPos) {
               NS_HTML5_BREAK(stateloop);
             }
             c = checkChar(buf, pos);
           }
-          if (value < prevValue) {
-            value = 0x110000;
-          }
-          prevValue = value;
+          MOZ_ASSERT(value >= 0, "value must not become negative.");
           if (c >= '0' && c <= '9') {
             seenDigits = true;
-            value *= 10;
-            value += c - '0';
+            if (value <= 0x10FFFF) {
+              value *= 10;
+              value += c - '0';
+            }
             continue;
           } else if (c == ';') {
             if (seenDigits) {
               if (!(returnState & NS_HTML5TOKENIZER_DATA_AND_RCDATA_MASK)) {
                 cstart = pos + 1;
               }
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_HANDLE_NCR_VALUE, reconsume, pos);
               NS_HTML5_BREAK(decimalloop);
@@ -1745,34 +1743,37 @@ nsHtml5Tokenizer::stateLoop(int32_t stat
         NS_HTML5_CONTINUE(stateloop);
       }
       case NS_HTML5TOKENIZER_HEX_NCR_LOOP: {
         for (; ; ) {
           if (++pos == endPos) {
             NS_HTML5_BREAK(stateloop);
           }
           c = checkChar(buf, pos);
-          if (value < prevValue) {
-            value = 0x110000;
-          }
-          prevValue = value;
+          MOZ_ASSERT(value >= 0, "value must not become negative.");
           if (c >= '0' && c <= '9') {
             seenDigits = true;
-            value *= 16;
-            value += c - '0';
+            if (value <= 0x10FFFF) {
+              value *= 16;
+              value += c - '0';
+            }
             continue;
           } else if (c >= 'A' && c <= 'F') {
             seenDigits = true;
-            value *= 16;
-            value += c - 'A' + 10;
+            if (value <= 0x10FFFF) {
+              value *= 16;
+              value += c - 'A' + 10;
+            }
             continue;
           } else if (c >= 'a' && c <= 'f') {
             seenDigits = true;
-            value *= 16;
-            value += c - 'a' + 10;
+            if (value <= 0x10FFFF) {
+              value *= 16;
+              value += c - 'a' + 10;
+            }
             continue;
           } else if (c == ';') {
             if (seenDigits) {
               if (!(returnState & NS_HTML5TOKENIZER_DATA_AND_RCDATA_MASK)) {
                 cstart = pos + 1;
               }
               state = P::transition(mViewSource, NS_HTML5TOKENIZER_HANDLE_NCR_VALUE, reconsume, pos);
               NS_HTML5_CONTINUE(stateloop);
@@ -3945,17 +3946,16 @@ nsHtml5Tokenizer::resetToDataState()
   forceQuirks = false;
   additional = '\0';
   entCol = -1;
   firstCharKey = -1;
   lo = 0;
   hi = 0;
   candidate = -1;
   charRefBufMark = 0;
-  prevValue = -1;
   value = 0;
   seenDigits = false;
   endTag = false;
   shouldSuspend = false;
   initDoctypeFields();
   if (tagName) {
     tagName->release();
     tagName = nullptr;
@@ -3994,17 +3994,16 @@ nsHtml5Tokenizer::loadState(nsHtml5Token
   forceQuirks = other->forceQuirks;
   additional = other->additional;
   entCol = other->entCol;
   firstCharKey = other->firstCharKey;
   lo = other->lo;
   hi = other->hi;
   candidate = other->candidate;
   charRefBufMark = other->charRefBufMark;
-  prevValue = other->prevValue;
   value = other->value;
   seenDigits = other->seenDigits;
   endTag = other->endTag;
   shouldSuspend = false;
   if (!other->doctypeName) {
     doctypeName = nullptr;
   } else {
     doctypeName = nsHtml5Portability::newLocalFromLocal(other->doctypeName, interner);
--- a/parser/html/nsHtml5Tokenizer.h
+++ b/parser/html/nsHtml5Tokenizer.h
@@ -94,17 +94,16 @@ class nsHtml5Tokenizer
     bool forceQuirks;
     char16_t additional;
     int32_t entCol;
     int32_t firstCharKey;
     int32_t lo;
     int32_t hi;
     int32_t candidate;
     int32_t charRefBufMark;
-    int32_t prevValue;
   protected:
     int32_t value;
   private:
     bool seenDigits;
   protected:
     int32_t cstart;
   private:
     nsString* publicId;
--- a/parser/htmlparser/tests/mochitest/html5lib_tree_construction/entities01.dat
+++ b/parser/htmlparser/tests/mochitest/html5lib_tree_construction/entities01.dat
@@ -716,8 +716,77 @@ FOO&#xFFFFFF;ZOO
 #errors
 (1,3): expected-doctype-but-got-chars
 (1,13): illegal-codepoint-for-numeric-entity
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO�ZOO"
+
+#data
+FOO&#11111111111
+#errors
+(1,3): expected-doctype-but-got-chars
+(1,13): illegal-codepoint-for-numeric-entity
+(1,13): eof-in-numeric-entity
+#document
+| <html>
+|   <head>
+|   <body>
+|     "FOO�"
+
+#data
+FOO&#1111111111
+#errors
+(1,3): expected-doctype-but-got-chars
+(1,13): illegal-codepoint-for-numeric-entity
+(1,13): eof-in-numeric-entity
+#document
+| <html>
+|   <head>
+|   <body>
+|     "FOO�"
+
+#data
+FOO&#111111111111
+#errors
+(1,3): expected-doctype-but-got-chars
+(1,13): illegal-codepoint-for-numeric-entity
+(1,13): eof-in-numeric-entity
+#document
+| <html>
+|   <head>
+|   <body>
+|     "FOO�"
+
+#data
+FOO&#11111111111ZOO
+#errors
+(1,3): expected-doctype-but-got-chars
+(1,13): illegal-codepoint-for-numeric-entity
+#document
+| <html>
+|   <head>
+|   <body>
+|     "FOO�ZOO"
+
+#data
+FOO&#1111111111ZOO
+#errors
+(1,3): expected-doctype-but-got-chars
+(1,13): illegal-codepoint-for-numeric-entity
+#document
+| <html>
+|   <head>
+|   <body>
+|     "FOO�ZOO"
+
+#data
+FOO&#111111111111ZOO
+#errors
+(1,3): expected-doctype-but-got-chars
+(1,13): illegal-codepoint-for-numeric-entity
+#document
+| <html>
+|   <head>
+|   <body>
+|     "FOO�ZOO"