Fix unicode conversion error recovery in HTML5 parser
author Henri Sivonen <hsivonen@iki.fi>
Tue, 21 Apr 2009 14:35:02 +0300
changeset 26758 fdbedef2a8c2
parent 26757 3ab996dae85f
child 26759 435227e1a5a9
push id 48
push user hsivonen@iki.fi
push date Tue, 21 Apr 2009 11:35:25 +0000
milestone 1.9.2a1pre
Fix unicode conversion error recovery in HTML5 parser
content/html/parser/src/nsHtml5Parser.cpp
--- a/content/html/parser/src/nsHtml5Parser.cpp
+++ b/content/html/parser/src/nsHtml5Parser.cpp
@@ -594,16 +594,17 @@ nsHtml5Parser::OnStartRequest(nsIRequest
 //    }
 //  }
 
   if (mCharsetSource >= kCharsetFromChannel) {
     nsCOMPtr<nsICharsetConverterManager> convManager = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
     NS_ENSURE_SUCCESS(rv, rv);
     rv = convManager->GetUnicodeDecoder(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
     NS_ENSURE_SUCCESS(rv, rv);
+    mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Recover);
   }
 
   return rv;
 }
 
 /**
  *  This is called by the networking library once the last block of data
  *  has been collected from the net.
@@ -920,16 +921,17 @@ nsHtml5Parser::SetupDecodingAndWriteSnif
   NS_ENSURE_SUCCESS(rv, rv);
   rv = convManager->GetUnicodeDecoder(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
   if (rv == NS_ERROR_UCONV_NOCONV) {
     mCharset.Assign("windows-1252"); // lower case the raw form
     mCharsetSource = kCharsetFromWeakDocTypeDefault;
     rv = convManager->GetUnicodeDecoderRaw(mCharset.get(), getter_AddRefs(mUnicodeDecoder));  
   }
   NS_ENSURE_SUCCESS(rv, rv);
+  mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Recover);
   return WriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
 }
 
 nsresult
 nsHtml5Parser::WriteSniffingBufferAndCurrentSegment(const PRUint8* aFromSegment,
                                                     PRUint32 aCount,
                                                     PRUint32* aWriteCount)
 {
@@ -1051,16 +1053,17 @@ nsHtml5Parser::SniffStreamBytes(const PR
     mMetaScanner = new nsHtml5MetaScanner();
   }
   if (mSniffingLength + aCount >= NS_HTML5_PARSER_SNIFFING_BUFFER_SIZE) {
     // this is the last buffer
     PRUint32 countToSniffingLimit = NS_HTML5_PARSER_SNIFFING_BUFFER_SIZE - mSniffingLength;
     nsHtml5ByteReadable readable(aFromSegment, aFromSegment + countToSniffingLimit);
     mMetaScanner->sniff(&readable, getter_AddRefs(mUnicodeDecoder), mCharset);
     if (mUnicodeDecoder) {
+      mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Recover);
       // meta scan successful
       mCharsetSource = kCharsetFromMetaPrescan;
       delete mMetaScanner;
       mMetaScanner = nsnull;
       return WriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
     }
     // meta scan failed.
     if (mCharsetSource >= kCharsetFromHintPrevDoc) {
@@ -1132,28 +1135,32 @@ nsHtml5Parser::WriteStreamBytes(const PR
     PRInt32 end = mLastBuffer->getEnd();
     PRInt32 byteCount = aCount - totalByteCount;
     PRInt32 utf16Count = NS_HTML5_PARSER_READ_BUFFER_SIZE - end;
 
     NS_ASSERTION(utf16Count, "Trying to convert into a buffer with no free space!");
 
     nsresult convResult = mUnicodeDecoder->Convert((const char*)aFromSegment, &byteCount, mLastBuffer->getBuffer() + end, &utf16Count);  
 
-    mLastBuffer->setEnd(end + utf16Count);
+    end += utf16Count;
+    mLastBuffer->setEnd(end);
     totalByteCount += byteCount;
     aFromSegment += byteCount;
 
     NS_ASSERTION((mLastBuffer->getEnd() <= NS_HTML5_PARSER_READ_BUFFER_SIZE), "The Unicode decoder wrote too much data.");
 
     if (NS_FAILED(convResult)) {
-      ++totalByteCount;
-      ++aFromSegment;
+      if (totalByteCount < aCount) { // mimicking nsScanner even though this seems wrong
+        ++totalByteCount;
+        ++aFromSegment;
+      }
       mLastBuffer->getBuffer()[end] = 0xFFFD;
-      mLastBuffer->setEnd(end + 1);
-      if (mLastBuffer->getEnd() == NS_HTML5_PARSER_READ_BUFFER_SIZE) {
+      ++end;
+      mLastBuffer->setEnd(end);
+      if (end == NS_HTML5_PARSER_READ_BUFFER_SIZE) {
           mLastBuffer = (mLastBuffer->next = new nsHtml5UTF16Buffer(NS_HTML5_PARSER_READ_BUFFER_SIZE));
       }
       mUnicodeDecoder->Reset();
       if (totalByteCount == aCount) {
         *aWriteCount = totalByteCount;
         return NS_OK;            
       }
     } else if (convResult == NS_PARTIAL_MORE_OUTPUT) {