Bug 1364215 - Use pmovmskb for allTrue and anyTrue. r=bbouvier
author: Jakob Stoklund Olesen <jolesen@mozilla.com>
Thu, 11 May 2017 14:27:59 -0700
changeset 406359 d9353a6d3d1ae01150832ffc7d550f3c164fdf4f
parent 406358 54e6f320ab5d506be774e47c1c14ca15d64880c8
child 406360 a69cbcf4cf42786787169a22fe7069c7e1dacb02
push id: 7391
push user: mtabara@mozilla.com
push date: Mon, 12 Jun 2017 13:08:53 +0000
treeherder: mozilla-beta@2191d7f87e2e [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: bbouvier
bugs: 1364215
milestone: 55.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1364215 - Use pmovmskb for allTrue and anyTrue. r=bbouvier The movmskps SSE instruction only transfers 4 bits from the xmm register. This works for Bool32x4 and Bool64x2 vectors, but it misses lanes of the Bool16x8 and Bool8x16 types. Use a pmovmskb SSE2 instruction instead which transfers 16 byte sign bits from the xmm register. This lets us resolve even Bool8x16 lanes correctly. We know that the input vector is a boolean type, so each lane is known to be either 0 or -1. There is no harm in checking too many bits of the types with lanes wider than 8 bits. It won't affect the result.
js/src/jit-test/tests/SIMD/anyall.js
js/src/jit/x86-shared/Assembler-x86-shared.h
js/src/jit/x86-shared/BaseAssembler-x86-shared.h
js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
js/src/jit/x86-shared/Encoding-x86-shared.h
new file mode 100644
--- /dev/null
+++ b/js/src/jit-test/tests/SIMD/anyall.js
@@ -0,0 +1,38 @@
+load(libdir + 'simd.js');
+
+setJitCompilerOption("ion.warmup.trigger", 50);
+
+function all(B, n) {
+    var a = B.splat(true);
+    for (var i = 0; i < n; i++) {
+        var b = B.replaceLane(a, i, false);
+        assertEq(B.allTrue(b), false);
+        var c = B.replaceLane(b, i, true);
+        assertEq(B.allTrue(c), true);
+    }
+}
+
+function any(B, n) {
+    var a = B.splat(false);
+    for (var i = 0; i < n; i++) {
+        var b = B.replaceLane(a, i, true);
+        assertEq(B.anyTrue(b), true);
+        var c = B.replaceLane(b, i, false);
+        assertEq(B.anyTrue(c), false);
+    }
+}
+
+function f() {
+    for (var j = 0; j < 200; j++) {
+        all(SIMD.Bool64x2, 2)
+        any(SIMD.Bool64x2, 2)
+        all(SIMD.Bool32x4, 4)
+        any(SIMD.Bool32x4, 4)
+        all(SIMD.Bool16x8, 8)
+        any(SIMD.Bool16x8, 8)
+        all(SIMD.Bool8x16, 16)
+        any(SIMD.Bool8x16, 16)
+    }
+}
+
+f()
--- a/js/src/jit/x86-shared/Assembler-x86-shared.h
+++ b/js/src/jit/x86-shared/Assembler-x86-shared.h
@@ -2291,16 +2291,20 @@ class AssemblerX86Shared : public Assemb
     void vmovmskpd(FloatRegister src, Register dest) {
         MOZ_ASSERT(HasSSE2());
         masm.vmovmskpd_rr(src.encoding(), dest.encoding());
     }
     void vmovmskps(FloatRegister src, Register dest) {
         MOZ_ASSERT(HasSSE2());
         masm.vmovmskps_rr(src.encoding(), dest.encoding());
     }
+    void vpmovmskb(FloatRegister src, Register dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.vpmovmskb_rr(src.encoding(), dest.encoding());
+    }
     void vptest(FloatRegister rhs, FloatRegister lhs) {
         MOZ_ASSERT(HasSSE41());
         masm.vptest_rr(rhs.encoding(), lhs.encoding());
     }
     void vucomisd(FloatRegister rhs, FloatRegister lhs) {
         MOZ_ASSERT(HasSSE2());
         masm.vucomisd_rr(rhs.encoding(), lhs.encoding());
     }
--- a/js/src/jit/x86-shared/BaseAssembler-x86-shared.h
+++ b/js/src/jit/x86-shared/BaseAssembler-x86-shared.h
@@ -3085,16 +3085,21 @@ public:
         twoByteOpSimdInt32("vmovmskpd", VEX_PD, OP2_MOVMSKPD_EdVd, src, dst);
     }
 
     void vmovmskps_rr(XMMRegisterID src, RegisterID dst)
     {
         twoByteOpSimdInt32("vmovmskps", VEX_PS, OP2_MOVMSKPD_EdVd, src, dst);
     }
 
+    void vpmovmskb_rr(XMMRegisterID src, RegisterID dst)
+    {
+        twoByteOpSimdInt32("vpmovmskb", VEX_PD, OP2_PMOVMSKB_EdVd, src, dst);
+    }
+
     void vptest_rr(XMMRegisterID rhs, XMMRegisterID lhs) {
         threeByteOpSimd("vptest", VEX_PD, OP3_PTEST_VdVd, ESCAPE_38, rhs, invalid_xmm, lhs);
     }
 
     void vmovd_rr(XMMRegisterID src, RegisterID dst)
     {
         twoByteOpSimdInt32("vmovd", VEX_PD, OP2_MOVD_EdVd, (XMMRegisterID)dst, (RegisterID)src);
     }
--- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
@@ -2958,28 +2958,30 @@ CodeGeneratorX86Shared::visitSimdInsertE
 }
 
 void
 CodeGeneratorX86Shared::visitSimdAllTrue(LSimdAllTrue* ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     Register output = ToRegister(ins->output());
 
-    masm.vmovmskps(input, output);
-    masm.cmp32(output, Imm32(0xf));
+    // We know that the input lanes are boolean, so they are either 0 or -1.
+    // The all-true vector has all 128 bits set, no matter the lane geometry.
+    masm.vpmovmskb(input, output);
+    masm.cmp32(output, Imm32(0xffff));
     masm.emitSet(Assembler::Zero, output);
 }
 
 void
 CodeGeneratorX86Shared::visitSimdAnyTrue(LSimdAnyTrue* ins)
 {
     FloatRegister input = ToFloatRegister(ins->input());
     Register output = ToRegister(ins->output());
 
-    masm.vmovmskps(input, output);
+    masm.vpmovmskb(input, output);
     masm.cmp32(output, Imm32(0x0));
     masm.emitSet(Assembler::NonZero, output);
 }
 
 template <class T, class Reg> void
 CodeGeneratorX86Shared::visitSimdGeneralShuffle(LSimdGeneralShuffleBase* ins, Reg tempRegister)
 {
     MSimdGeneralShuffle* mir = ins->mir();
--- a/js/src/jit/x86-shared/Encoding-x86-shared.h
+++ b/js/src/jit/x86-shared/Encoding-x86-shared.h
@@ -261,16 +261,17 @@ enum TwoByteOpcodeID {
     OP2_CMPPS_VpsWps    = 0xC2,
     OP2_PINSRW          = 0xC4,
     OP2_PEXTRW_GdUdIb   = 0xC5,
     OP2_SHUFPS_VpsWpsIb = 0xC6,
     OP2_PSRLW_VdqWdq    = 0xD1,
     OP2_PSRLD_VdqWdq    = 0xD2,
     OP2_PMULLW_VdqWdq   = 0xD5,
     OP2_MOVQ_WdVd       = 0xD6,
+    OP2_PMOVMSKB_EdVd   = 0xD7,
     OP2_PSUBUSB_VdqWdq  = 0xD8,
     OP2_PSUBUSW_VdqWdq  = 0xD9,
     OP2_PANDDQ_VdqWdq   = 0xDB,
     OP2_PADDUSB_VdqWdq  = 0xDC,
     OP2_PADDUSW_VdqWdq  = 0xDD,
     OP2_PANDNDQ_VdqWdq  = 0xDF,
     OP2_PSRAW_VdqWdq    = 0xE1,
     OP2_PSRAD_VdqWdq    = 0xE2,