author Henri Sivonen <hsivonen@hsivonen.fi>
Fri, 06 Jul 2018 10:44:43 +0300
changeset 489140 4ef0f163fdeb9afeddd87b37bfd987298c038542
parent 352885 360df5276cf6e2d7294ce0e122952cf56bc5372e
child 538050 020c8c871c0d3b3920fe95935cfef06501976c0f
permissions -rw-r--r--
Bug 1402247 - Use encoding_rs for XPCOM string encoding conversions. r=Nika,erahm,froydnj. Correctness improvements: * UTF errors are handled safely per spec instead of dangerously truncating strings. * There are fewer converter implementations. Performance improvements: * The old code did exact buffer length math, which meant doing UTF math twice on each input string (once for length calculation and another time for conversion). Exact length math is more complicated when handling errors properly, which the old code didn't do. The new code does UTF math on the string content only once (when converting) but risks allocating more than once. There are heuristics in place to lower the probability of reallocation in cases where the double math avoidance isn't enough of a saving to absorb an allocation and memcpy. * Previously, in UTF-16 <-> UTF-8 conversions, an ASCII prefix was optimized but a single non-ASCII code point pessimized the rest of the string. The new code tries to get back on the fast ASCII path. * UTF-16 to Latin1 conversion guarantees less about handling of out-of-range input to eliminate an operation from the inner loop on x86/x86_64. * When assigning to a pre-existing string, the new code tries to reuse the old buffer instead of first releasing the old buffer and then allocating a new one. * When reallocating from the new code, the memcpy covers only the data that is part of the logical length of the old string instead of memcpying the whole capacity. (For old callers old excess memcpy behavior is preserved due to bogus callers. See bug 1472113.) * UTF-8 strings in XPConnect that are in the Latin1 range are passed to SpiderMonkey as Latin1. New features: * Conversion between UTF-8 and Latin1 is added in order to enable faster future interop between Rust code (or otherwise UTF-8-using code) and text node and SpiderMonkey code that uses Latin1. MozReview-Commit-ID: JaJuExfILM9

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "nsISupports.idl"

interface nsIUnicharInputStream;
interface nsIInputStream;

 * The signature of the writer function passed to ReadSegments. This
 * is the "consumer" of data that gets read from the stream's buffer.
 * @param aInStream stream being read
 * @param aClosure opaque parameter passed to ReadSegments
 * @param aFromSegment pointer to memory owned by the input stream
 * @param aToOffset amount already read (since ReadSegments was called)
 * @param aCount length of fromSegment
 * @param aWriteCount number of bytes read
 * Implementers should return the following:
 * @throws <any-error> if not interested in consuming any data
 * Errors are never passed to the caller of ReadSegments.
 * NOTE: returning NS_OK and (*aWriteCount = 0) has undefined behavior.
typedef nsresult (*nsWriteUnicharSegmentFun)(nsIUnicharInputStream *aInStream,
                                             void *aClosure,
                                             const char16_t *aFromSegment,
                                             uint32_t aToOffset,
                                             uint32_t aCount,
                                             uint32_t *aWriteCount);
native nsWriteUnicharSegmentFun(nsWriteUnicharSegmentFun);

 * Abstract unicode character input stream
 * @see nsIInputStream
[scriptable, uuid(d5e3bd80-6723-4b92-b0c9-22f6162fd94f)]
interface nsIUnicharInputStream : nsISupports {
   * Reads into a caller-provided character array.
   * @return The number of characters that were successfully read. May be less
   *         than aCount, even if there is more data in the input stream.
   *         A return value of 0 means EOF.
   * @note To read more than 2^32 characters, call this method multiple times.
  [noscript] unsigned long read([array, size_is(aCount)] in char16_t aBuf,
                                in unsigned long aCount);

   * Low-level read method that has access to the stream's underlying buffer.
   * The writer function may be called multiple times for segmented buffers.
   * ReadSegments is expected to keep calling the writer until either there is
   * nothing left to read or the writer returns an error.  ReadSegments should
   * not call the writer with zero characters to consume.
   * @param aWriter the "consumer" of the data to be read
   * @param aClosure opaque parameter passed to writer 
   * @param aCount the maximum number of characters to be read
   * @return number of characters read (may be less than aCount)
   * @return 0 if reached end of file (or if aWriter refused to consume data)
   * @throws NS_BASE_STREAM_WOULD_BLOCK if reading from the input stream would
   *   block the calling thread (non-blocking mode only)
   * @throws <other-error> on failure
   * NOTE: this function may be unimplemented if a stream has no underlying
   * buffer
  [noscript] unsigned long readSegments(in nsWriteUnicharSegmentFun aWriter,
                                        in voidPtr aClosure,
                                        in unsigned long aCount);

   * Read into a string object.
   * @param aCount The number of characters that should be read
   * @return The number of characters that were read.
  unsigned long readString(in unsigned long aCount, out AString aString);

  * Close the stream and free associated resources. This also closes the
  * underlying stream, if any.
  void close();