Bug 586698 - Add SSE versions of LossyConvertEncoding; r=tterribe,jst
author Justin Lebar <justin.lebar@gmail.com>
Fri, 13 Aug 2010 09:15:44 -0700
changeset 64047 d0b98f8c4734e5b45502b7778b0648baf71e04c0
parent 64046 901f41d1142ec8b0cc31f19c782c50d6a9d95f4e
child 64048 bbacb90f4c63401400b5403689cce9d851d29677
push id unknown
push user unknown
push date unknown
reviewers tterribe, jst
bugs 586698
milestone 2.2a1pre
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 586698 - Add SSE versions of LossyConvertEncoding; r=tterribe,jst
content/base/src/nsTextFragment.cpp
xpcom/string/public/nsUTF8Utils.h
xpcom/string/src/Makefile.in
xpcom/string/src/nsReadableUtils.cpp
xpcom/string/src/nsUTF8UtilsSSE2.cpp
--- a/content/base/src/nsTextFragment.cpp
+++ b/content/base/src/nsTextFragment.cpp
@@ -222,19 +222,17 @@ nsTextFragment::SetTo(const PRUnichar* a
   } else {
     // Use 1 byte storage because we can
     char* buff = (char *)nsMemory::Alloc(aLength * sizeof(char));
     if (!buff) {
       return;
     }
 
     // Copy data
-    // Use the same copying code we use elsewhere; it's likely to be
-    // carefully tuned.
-    LossyConvertEncoding<PRUnichar, char> converter(buff);
+    LossyConvertEncoding16to8 converter(buff);
     copy_string(aBuffer, aBuffer+aLength, converter);
     m1b = buff;
   }
 
   // Setup our fields
   mState.mInHeap = PR_TRUE;
   mState.mIs2b = need2;
   mState.mLength = aLength;
@@ -255,19 +253,18 @@ nsTextFragment::CopyTo(PRUnichar *aDest,
   }
 
   if (aCount != 0) {
     if (mState.mIs2b) {
       memcpy(aDest, m2b + aOffset, sizeof(PRUnichar) * aCount);
     } else {
       const char *cp = m1b + aOffset;
       const char *end = cp + aCount;
-      while (cp < end) {
-        *aDest++ = (unsigned char)(*cp++);
-      }
+      LossyConvertEncoding8to16 converter(aDest);
+      copy_string(cp, end, converter);
     }
   }
 }
 
 void
 nsTextFragment::Append(const PRUnichar* aBuffer, PRUint32 aLength)
 {
   // This is a common case because some callsites create a textnode
@@ -311,21 +308,20 @@ nsTextFragment::Append(const PRUnichar* 
     // The old data was 1-byte, but the new is not so we have to expand it
     // all to 2-byte
     PRUnichar* buff = (PRUnichar*)nsMemory::Alloc((mState.mLength + aLength) *
                                                   sizeof(PRUnichar));
     if (!buff) {
       return;
     }
 
-    // Copy data
-    for (PRUint32 i = 0; i < mState.mLength; ++i) {
-      buff[i] = (unsigned char)m1b[i];
-    }
-    
+    // Copy data into buff
+    LossyConvertEncoding8to16 converter(buff);
+    copy_string(m1b, m1b+mState.mLength, converter);
+
     memcpy(buff + mState.mLength, aBuffer, aLength * sizeof(PRUnichar));
 
     mState.mLength += aLength;
     mState.mIs2b = PR_TRUE;
 
     if (mState.mInHeap) {
       nsMemory::Free(m2b);
     }
@@ -349,20 +345,20 @@ nsTextFragment::Append(const PRUnichar* 
     buff = (char*)nsMemory::Alloc((mState.mLength + aLength) * sizeof(char));
     if (!buff) {
       return;
     }
 
     memcpy(buff, m1b, mState.mLength);
     mState.mInHeap = PR_TRUE;
   }
-    
-  for (PRUint32 i = 0; i < aLength; ++i) {
-    buff[mState.mLength + i] = (char)aBuffer[i];
-  }
+
+  // Copy aBuffer into buff.
+  LossyConvertEncoding16to8 converter(buff + mState.mLength);
+  copy_string(aBuffer, aBuffer + aLength, converter);
 
   m1b = buff;
   mState.mLength += aLength;
 
 }
 
 // To save time we only do this when we really want to know, not during
 // every allocation
--- a/xpcom/string/public/nsUTF8Utils.h
+++ b/xpcom/string/public/nsUTF8Utils.h
@@ -38,16 +38,17 @@
 #ifndef nsUTF8Utils_h_
 #define nsUTF8Utils_h_
 
 // This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
 // file will provide signatures for the Mozilla abstract string types. It will
 // use XPCOM assertion/debugging macros, etc.
 
 #include "nscore.h"
+#include "mozilla/SSE.h"
 
 #include "nsCharTraits.h"
 
 class UTF8traits
   {
     public:
       static PRBool isASCII(char c) { return (c & 0x80) == 0x00; }
       static PRBool isInSeq(char c) { return (c & 0xC0) == 0x80; }
@@ -657,45 +658,113 @@ class CalculateUTF8Size
       }
 
     private:
       size_t mSize;
   };
 
 #ifdef MOZILLA_INTERNAL_API
 /**
- * A character sink that performs a |reinterpret_cast| style conversion
- * between character types.
+ * A character sink that performs a |reinterpret_cast|-style conversion
+ * from char to PRUnichar: each input byte is treated as unsigned and
+ * zero-extended; no encoding-aware translation takes place.
  */
-template <class FromCharT, class ToCharT>
-class LossyConvertEncoding
+class LossyConvertEncoding8to16
   {
     public:
-      typedef FromCharT value_type;
- 
-      typedef FromCharT input_type;
-      typedef ToCharT   output_type;
-
-      typedef typename nsCharTraits<FromCharT>::unsigned_char_type unsigned_input_type;
+      typedef char      value_type;
+      typedef char      input_type;
+      typedef PRUnichar output_type;
 
     public:
-      LossyConvertEncoding( output_type* aDestination ) : mDestination(aDestination) { }
+      LossyConvertEncoding8to16( PRUnichar* aDestination ) :
+        mDestination(aDestination) { }
 
+      /**
+       * Converts |aSourceLength| chars starting at |aSource|, storing the
+       * results at the destination pointer and advancing it.  Hands off to
+       * write_sse2() when mozilla::supports_sse2() returns true.
+       */
       void
-      write( const input_type* aSource, PRUint32 aSourceLength )
+      write( const char* aSource, PRUint32 aSourceLength )
         {
-          const input_type* done_writing = aSource + aSourceLength;
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+          if (mozilla::supports_sse2())
+            {
+              write_sse2(aSource, aSourceLength);
+              return;
+            }
+#endif
+          const char* done_writing = aSource + aSourceLength;
           while ( aSource < done_writing )
-            *mDestination++ = (output_type)(unsigned_input_type)(*aSource++);  // use old-style cast to mimic old |ns[C]String| behavior
+            *mDestination++ = (PRUnichar)(unsigned char)(*aSource++);
         }
 
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+      // SSE2 implementation of write(); defined in nsUTF8UtilsSSE2.cpp.
       void
+      write_sse2( const char* aSource, PRUint32 aSourceLength );
+#endif
+
+      /** Stores a null PRUnichar at the destination without advancing it. */
+      void
       write_terminator()
         {
-          *mDestination = output_type(0);
+          *mDestination = (PRUnichar)(0);
         }
 
     private:
-      output_type* mDestination;
+      PRUnichar* mDestination;
+  };
+
+/**
+ * A character sink that performs a |reinterpret_cast|-style conversion
+ * from PRUnichar to char: each input code unit is narrowed to a char,
+ * keeping only its low-order byte.
+ */
+class LossyConvertEncoding16to8
+  {
+    public:
+      typedef PRUnichar value_type;
+      typedef PRUnichar input_type;
+      typedef char      output_type;
+
+      LossyConvertEncoding16to8( char* aDestination ) : mDestination(aDestination) { }
+
+      /**
+       * Converts |aSourceLength| PRUnichars starting at |aSource|, storing
+       * the results at the destination pointer and advancing it.  Hands off
+       * to write_sse2() when mozilla::supports_sse2() returns true.
+       */
+      void
+      write( const PRUnichar* aSource, PRUint32 aSourceLength)
+        {
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+          if (mozilla::supports_sse2())
+            {
+              write_sse2(aSource, aSourceLength);
+              return;
+            }
+#endif
+          const PRUnichar* done_writing = aSource + aSourceLength;
+          while ( aSource < done_writing )
+            *mDestination++ = (char)(*aSource++);
+        }
+
+#ifdef MOZILLA_MAY_SUPPORT_SSE2
+      // SSE2 implementation of write(); defined in nsUTF8UtilsSSE2.cpp.
+      void
+      write_sse2( const PRUnichar* aSource, PRUint32 aSourceLength );
+#endif
+
+      /** Stores a null char at the destination without advancing it. */
+      void
+      write_terminator()
+        {
+          *mDestination = '\0';
+        }
+
+    private:
+      char *mDestination;
   };
 #endif // MOZILLA_INTERNAL_API
 
 #endif /* !defined(nsUTF8Utils_h_) */
--- a/xpcom/string/src/Makefile.in
+++ b/xpcom/string/src/Makefile.in
@@ -65,11 +65,23 @@ CPPSRCS		=				\
 
 # we don't want the shared lib, but we want to force the creation of a
 # static lib.
 FORCE_STATIC_LIB = 1
 
 # Force use of PIC
 FORCE_USE_PIC	= 1
 
+# Are we targeting x86 or x86-64?  If so, compile the SSE2 functions declared in
+# nsUTF8Utils.h.
+ifneq (,$(INTEL_ARCHITECTURE))
+CPPSRCS += nsUTF8UtilsSSE2.cpp
+
+# gcc requires -msse2 on nsUTF8UtilsSSE2.cpp since it uses SSE2 intrinsics.
+# (See bug 585538 comment 12.)
+ifdef GNU_CC
+nsUTF8UtilsSSE2.$(OBJ_SUFFIX): CXXFLAGS+=-msse2
+endif
+endif
+
 include $(topsrcdir)/config/rules.mk
 
 DEFINES		+= -D_IMPL_NS_COM
--- a/xpcom/string/src/nsReadableUtils.cpp
+++ b/xpcom/string/src/nsReadableUtils.cpp
@@ -140,19 +140,19 @@ LossyAppendUTF16toASCII( const nsAString
 
     nsAString::const_iterator fromBegin, fromEnd;
 
     nsACString::iterator dest;
     aDest.BeginWriting(dest);
 
     dest.advance(old_dest_length);
 
-      // right now, this won't work on multi-fragment destinations
-    LossyConvertEncoding<PRUnichar, char> converter(dest.get());
-    
+    // right now, this won't work on multi-fragment destinations
+    LossyConvertEncoding16to8 converter(dest.get());
+
     copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd), converter);
   }
 
 NS_COM
 void
 AppendASCIItoUTF16( const nsACString& aSource, nsAString& aDest )
   {
     PRUint32 old_dest_length = aDest.Length();
@@ -162,17 +162,17 @@ AppendASCIItoUTF16( const nsACString& aS
     nsACString::const_iterator fromBegin, fromEnd;
 
     nsAString::iterator dest;
     aDest.BeginWriting(dest);
 
     dest.advance(old_dest_length);
 
       // right now, this won't work on multi-fragment destinations
-    LossyConvertEncoding<char, PRUnichar> converter(dest.get());
+    LossyConvertEncoding8to16 converter(dest.get());
 
     copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd), converter);
   }
 
 NS_COM
 void
 LossyAppendUTF16toASCII( const PRUnichar* aSource, nsACString& aDest )
   {
@@ -298,17 +298,17 @@ NS_COM
 char*
 ToNewCString( const nsAString& aSource )
   {
     char* result = AllocateStringCopy(aSource, (char*)0);
     if (!result)
       return nsnull;
 
     nsAString::const_iterator fromBegin, fromEnd;
-    LossyConvertEncoding<PRUnichar, char> converter(result);
+    LossyConvertEncoding16to8 converter(result);
     copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd), converter).write_terminator();
     return result;
   }
 
 NS_COM
 char*
 ToNewUTF8String( const nsAString& aSource, PRUint32 *aUTF8Count )
   {
@@ -369,17 +369,17 @@ NS_COM
 PRUnichar*
 ToNewUnicode( const nsACString& aSource )
   {
     PRUnichar* result = AllocateStringCopy(aSource, (PRUnichar*)0);
     if (!result)
       return nsnull;
 
     nsACString::const_iterator fromBegin, fromEnd;
-    LossyConvertEncoding<char, PRUnichar> converter(result);
+    LossyConvertEncoding8to16 converter(result);
     copy_string(aSource.BeginReading(fromBegin), aSource.EndReading(fromEnd), converter).write_terminator();
     return result;
   }
 
 NS_COM
 PRUnichar*
 UTF8ToNewUnicode( const nsACString& aSource, PRUint32 *aUTF16Count )
   {
new file mode 100644
--- /dev/null
+++ b/xpcom/string/src/nsUTF8UtilsSSE2.cpp
@@ -0,0 +1,113 @@
+#include "nscore.h"
+#include <emmintrin.h>
+#include "nsUTF8Utils.h"
+
+/**
+ * SSE2 implementation of LossyConvertEncoding16to8::write.
+ *
+ * Narrows |aSourceLength| PRUnichars starting at |aSource| to chars by keeping
+ * only the low-order byte of each one, stores them at |mDestination|, and
+ * advances |mDestination| past the characters written.
+ */
+void
+LossyConvertEncoding16to8::write_sse2(const PRUnichar* aSource,
+                                      PRUint32 aSourceLength)
+{
+  char* dest = mDestination;
+
+  // Align source to a 16-byte boundary.  (-ptr & 0xf) is the number of bytes
+  // up to the next boundary; dividing by sizeof(PRUnichar) turns that into a
+  // character count.  This relies on aSource having the natural alignment of
+  // a PRUnichar; otherwise the aligned loads below would still be misaligned.
+  PRUint32 i = 0;
+  PRUint32 alignLen =
+    PR_MIN(aSourceLength, (-NS_PTR_TO_UINT32(aSource) & 0xf) / sizeof(PRUnichar));
+  for (; i < alignLen; i++) {
+    dest[i] = static_cast<unsigned char>(aSource[i]);
+  }
+
+  // Walk 64 bytes (four XMM registers) at a time.
+  __m128i vectmask = _mm_set1_epi16(0x00ff);
+  for (; aSourceLength - i > 31; i += 32) {
+    __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
+    source1 = _mm_and_si128(source1, vectmask);
+
+    __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 8));
+    source2 = _mm_and_si128(source2, vectmask);
+
+    __m128i source3 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
+    source3 = _mm_and_si128(source3, vectmask);
+
+    __m128i source4 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 24));
+    source4 = _mm_and_si128(source4, vectmask);
+
+    // Pack the source data.  SSE2 views this as a saturating uint16 to
+    // uint8 conversion, but since we masked off the high-order byte of every
+    // uint16, we're really just grabbing the low-order byte of each uint16 in
+    // the four source registers.
+    __m128i packed1 = _mm_packus_epi16(source1, source2);
+    __m128i packed2 = _mm_packus_epi16(source3, source4);
+
+    // This store needs to be unaligned since there's no guarantee that the
+    // alignment we did above for the source will align the destination.
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i),      packed1);
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), packed2);
+  }
+
+  // Finish up the rest.
+  for (; i < aSourceLength; i++) {
+    dest[i] = static_cast<unsigned char>(aSource[i]);
+  }
+
+  mDestination += i;
+}
+
+/**
+ * SSE2 implementation of LossyConvertEncoding8to16::write.
+ *
+ * Widens |aSourceLength| chars starting at |aSource| to PRUnichars by
+ * zero-extending each byte, stores them at |mDestination|, and advances
+ * |mDestination| past the characters written.
+ */
+void
+LossyConvertEncoding8to16::write_sse2(const char* aSource,
+                                      PRUint32 aSourceLength)
+{
+  PRUnichar *dest = mDestination;
+
+  // Align source to a 16-byte boundary.  We choose to align source rather than
+  // dest because we'd rather have our loads than our stores be fast. You have
+  // to wait for a load to complete, but you can keep on moving after issuing a
+  // store.
+  PRUint32 i = 0;
+  PRUint32 alignLen = PR_MIN(aSourceLength, (-NS_PTR_TO_UINT32(aSource) & 0xf));
+  for (; i < alignLen; i++) {
+    dest[i] = static_cast<unsigned char>(aSource[i]);
+  }
+
+  // Walk 32 bytes (two XMM registers) at a time.
+  for (; aSourceLength - i > 31; i += 32) {
+    __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
+    __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
+
+    // Interleave 0s in with the bytes of source to create lo and hi.
+    __m128i lo1 = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
+    __m128i hi1 = _mm_unpackhi_epi8(source1, _mm_setzero_si128());
+    __m128i lo2 = _mm_unpacklo_epi8(source2, _mm_setzero_si128());
+    __m128i hi2 = _mm_unpackhi_epi8(source2, _mm_setzero_si128());
+
+    // Store lo and hi into dest.  Only the source was aligned above, so the
+    // stores have to be unaligned.
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i),      lo1);
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 8),  hi1);
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), lo2);
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 24), hi2);
+  }
+
+  // Finish up whatever's left.
+  for (; i < aSourceLength; i++) {
+    dest[i] = static_cast<unsigned char>(aSource[i]);
+  }
+
+  mDestination += i;
+}