js/src/jit/shared/CodeGenerator-x86-shared.cpp

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * vim: set ts=8 sts=4 et sw=4 tw=99:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "jit/shared/CodeGenerator-x86-shared.h"

#include "mozilla/DebugOnly.h"
#include "mozilla/MathAlgorithms.h"

#include "jsmath.h"

#include "jit/JitCompartment.h"
#include "jit/JitFrames.h"
#include "jit/Linker.h"
#include "jit/RangeAnalysis.h"
#include "vm/TraceLogging.h"

#include "jit/shared/CodeGenerator-shared-inl.h"

using namespace js;
using namespace js::jit;

using mozilla::Abs;
using mozilla::FloatingPoint;
using mozilla::FloorLog2;
using mozilla::NegativeInfinity;
using mozilla::SpecificNaN;

using JS::GenericNaN;

namespace js {
namespace jit {

CodeGeneratorX86Shared::CodeGeneratorX86Shared(MIRGenerator* gen, LIRGraph* graph, MacroAssembler* masm)
  : CodeGeneratorShared(gen, graph, masm)
{
}

bool
CodeGeneratorX86Shared::generatePrologue()
{
    MOZ_ASSERT(masm.framePushed() == 0);
    MOZ_ASSERT(!gen->compilingAsmJS());

    // If profiling, save the current frame pointer to a per-thread global field.
    if (isProfilerInstrumentationEnabled())
        masm.profilerEnterFrame(StackPointer, CallTempReg0);

    // Ensure that the Ion frame is properly aligned.
    masm.assertStackAlignment(JitStackAlignment, 0);

    // Note that this automatically sets MacroAssembler::framePushed().
    masm.reserveStack(frameSize());

    emitTracelogIonStart();

    return true;
}

bool
CodeGeneratorX86Shared::generateEpilogue()
{
    MOZ_ASSERT(!gen->compilingAsmJS());

    masm.bind(&returnLabel_);

    emitTracelogIonStop();

    // Pop the stack we allocated at the start of the function.
    masm.freeStack(frameSize());
    MOZ_ASSERT(masm.framePushed() == 0);

    // If profiling, reset the per-thread global lastJitFrame to point to
    // the previous frame.
    if (isProfilerInstrumentationEnabled())
        masm.profilerExitFrame();

    masm.ret();
    return true;
}

void
OutOfLineBailout::accept(CodeGeneratorX86Shared* codegen)
{
    codegen->visitOutOfLineBailout(this);
}

void
CodeGeneratorX86Shared::emitBranch(Assembler::Condition cond, MBasicBlock* mirTrue,
                                   MBasicBlock* mirFalse, Assembler::NaNCond ifNaN)
{
    if (ifNaN == Assembler::NaN_IsFalse)
        jumpToBlock(mirFalse, Assembler::Parity);
    else if (ifNaN == Assembler::NaN_IsTrue)
        jumpToBlock(mirTrue, Assembler::Parity);

    if (isNextBlock(mirFalse->lir())) {
        jumpToBlock(mirTrue, cond);
    } else {
        jumpToBlock(mirFalse, Assembler::InvertCondition(cond));
        jumpToBlock(mirTrue);
    }
}

void
CodeGeneratorX86Shared::visitDouble(LDouble* ins)
{
    const LDefinition* out = ins->getDef(0);
    masm.loadConstantDouble(ins->getDouble(), ToFloatRegister(out));
}

void
CodeGeneratorX86Shared::visitFloat32(LFloat32* ins)
{
    const LDefinition* out = ins->getDef(0);
    masm.loadConstantFloat32(ins->getFloat(), ToFloatRegister(out));
}

void
CodeGeneratorX86Shared::visitTestIAndBranch(LTestIAndBranch* test)
{
    const LAllocation* opd = test->input();

    // Test the operand
    masm.test32(ToRegister(opd), ToRegister(opd));
    emitBranch(Assembler::NonZero, test->ifTrue(), test->ifFalse());
}

void
CodeGeneratorX86Shared::visitTestDAndBranch(LTestDAndBranch* test)
{
    const LAllocation* opd = test->input();

    // vucomisd flags:
    //             Z  P  C
    //            ---------
    //      NaN    1  1  1
    //        >    0  0  0
    //        <    0  0  1
    //        =    1  0  0
    //
    // NaN is falsey, so comparing against 0 and then using the Z flag is
    // enough to determine which branch to take.
    masm.zeroDouble(ScratchDoubleReg);
    masm.vucomisd(ScratchDoubleReg, ToFloatRegister(opd));
    emitBranch(Assembler::NotEqual, test->ifTrue(), test->ifFalse());
}

void
CodeGeneratorX86Shared::visitTestFAndBranch(LTestFAndBranch* test)
{
    const LAllocation* opd = test->input();
    // vucomiss flags are the same as doubles; see comment above
    masm.zeroFloat32(ScratchFloat32Reg);
    masm.vucomiss(ScratchFloat32Reg, ToFloatRegister(opd));
    emitBranch(Assembler::NotEqual, test->ifTrue(), test->ifFalse());
}

void
CodeGeneratorX86Shared::visitBitAndAndBranch(LBitAndAndBranch* baab)
{
    if (baab->right()->isConstant())
        masm.test32(ToRegister(baab->left()), Imm32(ToInt32(baab->right())));
    else
        masm.test32(ToRegister(baab->left()), ToRegister(baab->right()));
    emitBranch(Assembler::NonZero, baab->ifTrue(), baab->ifFalse());
}

void
CodeGeneratorX86Shared::emitCompare(MCompare::CompareType type, const LAllocation* left, const LAllocation* right)
{
#ifdef JS_CODEGEN_X64
    if (type == MCompare::Compare_Object) {
        masm.cmpPtr(ToRegister(left), ToOperand(right));
        return;
    }
#endif

    if (right->isConstant())
        masm.cmp32(ToRegister(left), Imm32(ToInt32(right)));
    else
        masm.cmp32(ToRegister(left), ToOperand(right));
}

void
CodeGeneratorX86Shared::visitCompare(LCompare* comp)
{
    MCompare* mir = comp->mir();
    emitCompare(mir->compareType(), comp->left(), comp->right());
    masm.emitSet(JSOpToCondition(mir->compareType(), comp->jsop()), ToRegister(comp->output()));
}

void
CodeGeneratorX86Shared::visitCompareAndBranch(LCompareAndBranch* comp)
{
    MCompare* mir = comp->cmpMir();
    emitCompare(mir->compareType(), comp->left(), comp->right());
    Assembler::Condition cond = JSOpToCondition(mir->compareType(), comp->jsop());
    emitBranch(cond, comp->ifTrue(), comp->ifFalse());
}

void
CodeGeneratorX86Shared::visitCompareD(LCompareD* comp)
{
    FloatRegister lhs = ToFloatRegister(comp->left());
    FloatRegister rhs = ToFloatRegister(comp->right());

    Assembler::DoubleCondition cond = JSOpToDoubleCondition(comp->mir()->jsop());

    Assembler::NaNCond nanCond = Assembler::NaNCondFromDoubleCondition(cond);
    if (comp->mir()->operandsAreNeverNaN())
        nanCond = Assembler::NaN_HandledByCond;

    masm.compareDouble(cond, lhs, rhs);
    masm.emitSet(Assembler::ConditionFromDoubleCondition(cond), ToRegister(comp->output()), nanCond);
}

void
CodeGeneratorX86Shared::visitCompareF(LCompareF* comp)
{
    FloatRegister lhs = ToFloatRegister(comp->left());
    FloatRegister rhs = ToFloatRegister(comp->right());

    Assembler::DoubleCondition cond = JSOpToDoubleCondition(comp->mir()->jsop());

    Assembler::NaNCond nanCond = Assembler::NaNCondFromDoubleCondition(cond);
    if (comp->mir()->operandsAreNeverNaN())
        nanCond = Assembler::NaN_HandledByCond;

    masm.compareFloat(cond, lhs, rhs);
    masm.emitSet(Assembler::ConditionFromDoubleCondition(cond), ToRegister(comp->output()), nanCond);
}

void
CodeGeneratorX86Shared::visitNotI(LNotI* ins)
{
    masm.cmp32(ToRegister(ins->input()), Imm32(0));
    masm.emitSet(Assembler::Equal, ToRegister(ins->output()));
}

void
CodeGeneratorX86Shared::visitNotD(LNotD* ins)
{
    FloatRegister opd = ToFloatRegister(ins->input());

    // Not returns true if the input is a NaN. We don't have to worry about
    // it if we know the input is never NaN though.
    Assembler::NaNCond nanCond = Assembler::NaN_IsTrue;
    if (ins->mir()->operandIsNeverNaN())
        nanCond = Assembler::NaN_HandledByCond;

    masm.zeroDouble(ScratchDoubleReg);
    masm.compareDouble(Assembler::DoubleEqualOrUnordered, opd, ScratchDoubleReg);
    masm.emitSet(Assembler::Equal, ToRegister(ins->output()), nanCond);
}

void
CodeGeneratorX86Shared::visitNotF(LNotF* ins)
{
    FloatRegister opd = ToFloatRegister(ins->input());

    // Not returns true if the input is a NaN. We don't have to worry about
    // it if we know the input is never NaN though.
    Assembler::NaNCond nanCond = Assembler::NaN_IsTrue;
    if (ins->mir()->operandIsNeverNaN())
        nanCond = Assembler::NaN_HandledByCond;

    masm.zeroFloat32(ScratchFloat32Reg);
    masm.compareFloat(Assembler::DoubleEqualOrUnordered, opd, ScratchFloat32Reg);
    masm.emitSet(Assembler::Equal, ToRegister(ins->output()), nanCond);
}

void
CodeGeneratorX86Shared::visitCompareDAndBranch(LCompareDAndBranch* comp)
{
    FloatRegister lhs = ToFloatRegister(comp->left());
    FloatRegister rhs = ToFloatRegister(comp->right());

    Assembler::DoubleCondition cond = JSOpToDoubleCondition(comp->cmpMir()->jsop());

    Assembler::NaNCond nanCond = Assembler::NaNCondFromDoubleCondition(cond);
    if (comp->cmpMir()->operandsAreNeverNaN())
        nanCond = Assembler::NaN_HandledByCond;

    masm.compareDouble(cond, lhs, rhs);
    emitBranch(Assembler::ConditionFromDoubleCondition(cond), comp->ifTrue(), comp->ifFalse(), nanCond);
}

void
CodeGeneratorX86Shared::visitCompareFAndBranch(LCompareFAndBranch* comp)
{
    FloatRegister lhs = ToFloatRegister(comp->left());
    FloatRegister rhs = ToFloatRegister(comp->right());

    Assembler::DoubleCondition cond = JSOpToDoubleCondition(comp->cmpMir()->jsop());

    Assembler::NaNCond nanCond = Assembler::NaNCondFromDoubleCondition(cond);
    if (comp->cmpMir()->operandsAreNeverNaN())
        nanCond = Assembler::NaN_HandledByCond;

    masm.compareFloat(cond, lhs, rhs);
    emitBranch(Assembler::ConditionFromDoubleCondition(cond), comp->ifTrue(), comp->ifFalse(), nanCond);
}

void
CodeGeneratorX86Shared::visitAsmJSPassStackArg(LAsmJSPassStackArg* ins)
{
    const MAsmJSPassStackArg* mir = ins->mir();
    Address dst(StackPointer, mir->spOffset());
    if (ins->arg()->isConstant()) {
        masm.storePtr(ImmWord(ToInt32(ins->arg())), dst);
    } else {
        if (ins->arg()->isGeneralReg()) {
            masm.storePtr(ToRegister(ins->arg()), dst);
        } else {
            switch (mir->input()->type()) {
              case MIRType_Double:
              case MIRType_Float32:
                masm.storeDouble(ToFloatRegister(ins->arg()), dst);
                return;
              // StackPointer is SIMD-aligned and ABIArgGenerator guarantees
              // stack offsets are SIMD-aligned.
              case MIRType_Int32x4:
                masm.storeAlignedInt32x4(ToFloatRegister(ins->arg()), dst);
                return;
              case MIRType_Float32x4:
                masm.storeAlignedFloat32x4(ToFloatRegister(ins->arg()), dst);
                return;
              default: break;
            }
            MOZ_MAKE_COMPILER_ASSUME_IS_UNREACHABLE("unexpected mir type in AsmJSPassStackArg");
        }
    }
}

void
CodeGeneratorX86Shared::visitOutOfLineLoadTypedArrayOutOfBounds(OutOfLineLoadTypedArrayOutOfBounds* ool)
{
    switch (ool->viewType()) {
      case Scalar::Float32x4:
      case Scalar::Int32x4:
      case Scalar::MaxTypedArrayViewType:
        MOZ_CRASH("unexpected array type");
      case Scalar::Float32:
        masm.loadConstantFloat32(float(GenericNaN()), ool->dest().fpu());
        break;
      case Scalar::Float64:
        masm.loadConstantDouble(GenericNaN(), ool->dest().fpu());
        break;
      case Scalar::Int8:
      case Scalar::Uint8:
      case Scalar::Int16:
      case Scalar::Uint16:
      case Scalar::Int32:
      case Scalar::Uint32:
      case Scalar::Uint8Clamped:
        Register destReg = ool->dest().gpr();
        masm.mov(ImmWord(0), destReg);
        break;
    }
    masm.jmp(ool->rejoin());
}

void
CodeGeneratorX86Shared::visitOffsetBoundsCheck(OffsetBoundsCheck* oolCheck)
{
    // The access is heap[ptr + offset]. The inline code checks that
    // ptr < heap.length - offset. We get here when that fails. We need to check
    // for the case where ptr + offset >= 0, in which case the access is still
    // in bounds.
    MOZ_ASSERT(oolCheck->offset() != 0,
               "An access without a constant offset doesn't need a separate OffsetBoundsCheck");
    masm.cmp32(oolCheck->ptrReg(), Imm32(-uint32_t(oolCheck->offset())));
    masm.j(Assembler::Below, oolCheck->outOfBounds());

#ifdef JS_CODEGEN_X64
    // In order to get the offset to wrap properly, we must sign-extend the
    // pointer to 32-bits. We'll zero out the sign extension immediately
    // after the access to restore asm.js invariants.
    masm.movslq(oolCheck->ptrReg(), oolCheck->ptrReg());
#endif

    masm.jmp(oolCheck->rejoin());
}

uint32_t
CodeGeneratorX86Shared::emitAsmJSBoundsCheckBranch(const MAsmJSHeapAccess* access,
                                                   const MInstruction* mir,
                                                   Register ptr, Label* fail)
{
    // Emit a bounds-checking branch for |access|.

    MOZ_ASSERT(gen->needsAsmJSBoundsCheckBranch(access));

    Label* pass = nullptr;

    // If we have a non-zero offset, it's possible that |ptr| itself is out of
    // bounds, while adding the offset computes an in-bounds address. To catch
    // this case, we need a second branch, which we emit out of line since it's
    // unlikely to be needed in normal programs.
    if (access->offset() != 0) {
        OffsetBoundsCheck* oolCheck = new(alloc()) OffsetBoundsCheck(fail, ptr, access->offset());
        fail = oolCheck->entry();
        pass = oolCheck->rejoin();
        addOutOfLineCode(oolCheck, mir);
    }

    // The bounds check is a comparison with an immediate value. The asm.js
    // module linking process will add the length of the heap to the immediate
    // field, so -access->endOffset() will turn into
    // (heapLength - access->endOffset()), allowing us to test whether the end
    // of the access is beyond the end of the heap.
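    //
    // For example (hypothetical numbers): with endOffset == 4 and a linked
    // heap length of 65536, the patched immediate becomes 65536 - 4 == 65532,
    // so any ptr above 65532 (an access whose last byte would land at index
    // 65536 or beyond) takes the failure path.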
    uint32_t maybeCmpOffset = masm.cmp32WithPatch(ptr, Imm32(-access->endOffset())).offset();
    masm.j(Assembler::Above, fail);

    if (pass)
        masm.bind(pass);

    return maybeCmpOffset;
}

void
CodeGeneratorX86Shared::cleanupAfterAsmJSBoundsCheckBranch(const MAsmJSHeapAccess* access,
                                                           Register ptr)
{
    // Clean up after performing a heap access checked by a branch.

    MOZ_ASSERT(gen->needsAsmJSBoundsCheckBranch(access));

#ifdef JS_CODEGEN_X64
    // If the offset is 0, we don't use an OffsetBoundsCheck.
    if (access->offset() != 0) {
        // Zero out the high 32 bits, in case the OffsetBoundsCheck code had to
        // sign-extend (movslq) the pointer value to get wraparound to work.
        masm.movl(ptr, ptr);
    }
#endif
}

bool
CodeGeneratorX86Shared::generateOutOfLineCode()
{
    if (!CodeGeneratorShared::generateOutOfLineCode())
        return false;

    if (deoptLabel_.used()) {
        // All non-table-based bailouts will go here.
        masm.bind(&deoptLabel_);

        // Push the frame size, so the handler can recover the IonScript.
        masm.push(Imm32(frameSize()));

        JitCode* handler = gen->jitRuntime()->getGenericBailoutHandler();
        masm.jmp(ImmPtr(handler->raw()), Relocation::JITCODE);
    }

    return true;
}

class BailoutJump {
    Assembler::Condition cond_;

  public:
    explicit BailoutJump(Assembler::Condition cond) : cond_(cond)
    { }
#ifdef JS_CODEGEN_X86
    void operator()(MacroAssembler& masm, uint8_t* code) const {
        masm.j(cond_, ImmPtr(code), Relocation::HARDCODED);
    }
#endif
    void operator()(MacroAssembler& masm, Label* label) const {
        masm.j(cond_, label);
    }
};

class BailoutLabel {
    Label* label_;

  public:
    explicit BailoutLabel(Label* label) : label_(label)
    { }
#ifdef JS_CODEGEN_X86
    void operator()(MacroAssembler& masm, uint8_t* code) const {
        masm.retarget(label_, ImmPtr(code), Relocation::HARDCODED);
    }
#endif
    void operator()(MacroAssembler& masm, Label* label) const {
        masm.retarget(label_, label);
    }
};

template <typename T> void
CodeGeneratorX86Shared::bailout(const T& binder, LSnapshot* snapshot)
{
    encode(snapshot);

    // Though the assembler doesn't track all frame pushes, at least make sure
    // the known value makes sense. We can't use bailout tables if the stack
    // isn't properly aligned to the static frame size.
    MOZ_ASSERT_IF(frameClass_ != FrameSizeClass::None() && deoptTable_,
                  frameClass_.frameSize() == masm.framePushed());

#ifdef JS_CODEGEN_X86
    // On x64, bailout tables are pointless, because 16 extra bytes are
    // reserved per external jump, whereas it takes only 10 bytes to encode a
    // non-table-based bailout.
    if (assignBailoutId(snapshot)) {
        binder(masm, deoptTable_->raw() + snapshot->bailoutId() * BAILOUT_TABLE_ENTRY_SIZE);
        return;
    }
#endif

    // We could not use a jump table, either because all bailout IDs were
    // reserved, or because a jump table is not optimal for this frame size or
    // platform. In either case, we generate a lazy bailout.
    //
    // All bailout code is associated with the bytecodeSite of the block we are
    // bailing out from.
    InlineScriptTree* tree = snapshot->mir()->block()->trackedTree();
    OutOfLineBailout* ool = new(alloc()) OutOfLineBailout(snapshot);
    addOutOfLineCode(ool, new(alloc()) BytecodeSite(tree, tree->script()->code()));

    binder(masm, ool->entry());
}

void
CodeGeneratorX86Shared::bailoutIf(Assembler::Condition condition, LSnapshot* snapshot)
{
    bailout(BailoutJump(condition), snapshot);
}

void
CodeGeneratorX86Shared::bailoutIf(Assembler::DoubleCondition condition, LSnapshot* snapshot)
{
    MOZ_ASSERT(Assembler::NaNCondFromDoubleCondition(condition) == Assembler::NaN_HandledByCond);
    bailoutIf(Assembler::ConditionFromDoubleCondition(condition), snapshot);
}

void
CodeGeneratorX86Shared::bailoutFrom(Label* label, LSnapshot* snapshot)
{
    MOZ_ASSERT(label->used() && !label->bound());
    bailout(BailoutLabel(label), snapshot);
}

void
CodeGeneratorX86Shared::bailout(LSnapshot* snapshot)
{
    Label label;
    masm.jump(&label);
    bailoutFrom(&label, snapshot);
}

void
CodeGeneratorX86Shared::visitOutOfLineBailout(OutOfLineBailout* ool)
{
    masm.push(Imm32(ool->snapshot()->snapshotOffset()));
    masm.jmp(&deoptLabel_);
}

void
CodeGeneratorX86Shared::visitMinMaxD(LMinMaxD* ins)
{
    FloatRegister first = ToFloatRegister(ins->first());
    FloatRegister second = ToFloatRegister(ins->second());
#ifdef DEBUG
    FloatRegister output = ToFloatRegister(ins->output());
    MOZ_ASSERT(first == output);
#endif

    Label done, nan, minMaxInst;

    // Do a vucomisd to catch equality and NaNs, which both require special
    // handling. If the operands are ordered and unequal, we branch straight to
    // the min/max instruction. If we wanted, we could also branch for less-than
    // or greater-than here instead of using min/max, but those conditions will
    // sometimes be hard on the branch predictor.
    masm.vucomisd(second, first);
    masm.j(Assembler::NotEqual, &minMaxInst);
    if (!ins->mir()->range() || ins->mir()->range()->canBeNaN())
        masm.j(Assembler::Parity, &nan);

    // Ordered and equal. The operands are bit-identical unless they are zero
    // and negative zero. These instructions merge the sign bits in that
    // case, and are no-ops otherwise.
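    // For instance, +0.0 is 0x0000000000000000 and -0.0 is 0x8000000000000000,
    // so vandpd yields +0 unless both operands are -0 (max(+0, -0) == +0),
    // while vorpd yields -0 if either operand is -0 (min(+0, -0) == -0).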
    if (ins->mir()->isMax())
        masm.vandpd(second, first, first);
    else
        masm.vorpd(second, first, first);
    masm.jump(&done);

    // x86's min/max are not symmetric; if either operand is a NaN, they return
    // the read-only operand. We need to return a NaN if either operand is a
    // NaN, so we explicitly check for a NaN in the read-write operand.
    if (!ins->mir()->range() || ins->mir()->range()->canBeNaN()) {
        masm.bind(&nan);
        masm.vucomisd(first, first);
        masm.j(Assembler::Parity, &done);
    }

    // When the values are unequal, or second is NaN, x86's min and max will
    // return the value we need.
    masm.bind(&minMaxInst);
    if (ins->mir()->isMax())
        masm.vmaxsd(second, first, first);
    else
        masm.vminsd(second, first, first);

    masm.bind(&done);
}

void
CodeGeneratorX86Shared::visitMinMaxF(LMinMaxF* ins)
{
    FloatRegister first = ToFloatRegister(ins->first());
    FloatRegister second = ToFloatRegister(ins->second());
#ifdef DEBUG
    FloatRegister output = ToFloatRegister(ins->output());
    MOZ_ASSERT(first == output);
#endif

    Label done, nan, minMaxInst;

    // Do a vucomiss to catch equality and NaNs, which both require special
    // handling. If the operands are ordered and unequal, we branch straight to
    // the min/max instruction. If we wanted, we could also branch for less-than
    // or greater-than here instead of using min/max, but those conditions will
    // sometimes be hard on the branch predictor.
    masm.vucomiss(second, first);
    masm.j(Assembler::NotEqual, &minMaxInst);
    if (!ins->mir()->range() || ins->mir()->range()->canBeNaN())
        masm.j(Assembler::Parity, &nan);

    // Ordered and equal. The operands are bit-identical unless they are zero
    // and negative zero. These instructions merge the sign bits in that
    // case, and are no-ops otherwise.
    if (ins->mir()->isMax())
        masm.vandps(second, first, first);
    else
        masm.vorps(second, first, first);
    masm.jump(&done);

    // x86's min/max are not symmetric; if either operand is a NaN, they return
    // the read-only operand. We need to return a NaN if either operand is a
    // NaN, so we explicitly check for a NaN in the read-write operand.
    if (!ins->mir()->range() || ins->mir()->range()->canBeNaN()) {
        masm.bind(&nan);
        masm.vucomiss(first, first);
        masm.j(Assembler::Parity, &done);
    }

    // When the values are unequal, or second is NaN, x86's min and max will
    // return the value we need.
    masm.bind(&minMaxInst);
    if (ins->mir()->isMax())
        masm.vmaxss(second, first, first);
    else
        masm.vminss(second, first, first);

    masm.bind(&done);
}

void
CodeGeneratorX86Shared::visitAbsD(LAbsD* ins)
{
    FloatRegister input = ToFloatRegister(ins->input());
    MOZ_ASSERT(input == ToFloatRegister(ins->output()));
    // Load a value which is all ones except for the sign bit.
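    // That value is 0x7FFFFFFFFFFFFFFF, so the vandpd below clears just the
    // sign bit: e.g. -3.5 (0xC00C000000000000) becomes 3.5 (0x400C000000000000).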
    masm.loadConstantDouble(SpecificNaN<double>(0, FloatingPoint<double>::kSignificandBits),
                            ScratchDoubleReg);
    masm.vandpd(ScratchDoubleReg, input, input);
}

void
CodeGeneratorX86Shared::visitAbsF(LAbsF* ins)
{
    FloatRegister input = ToFloatRegister(ins->input());
    MOZ_ASSERT(input == ToFloatRegister(ins->output()));
    // Same trick as visitAbsD above.
    masm.loadConstantFloat32(SpecificNaN<float>(0, FloatingPoint<float>::kSignificandBits),
                             ScratchFloat32Reg);
    masm.vandps(ScratchFloat32Reg, input, input);
}

void
CodeGeneratorX86Shared::visitClzI(LClzI* ins)
{
    Register input = ToRegister(ins->input());
    Register output = ToRegister(ins->output());

    // bsr is undefined on 0
    Label done, nonzero;
    if (!ins->mir()->operandIsNeverZero()) {
        masm.test32(input, input);
        masm.j(Assembler::NonZero, &nonzero);
        masm.move32(Imm32(32), output);
        masm.jump(&done);
    }

    masm.bind(&nonzero);
    masm.bsr(input, output);
    masm.xor32(Imm32(0x1F), output);
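    // bsr yields the index of the highest set bit, so XORing with 31 converts
    // it to a leading-zero count: e.g. input 0x00000001 gives bsr == 0 and
    // output 31, while 0x80000000 gives bsr == 31 and output 0.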
    masm.bind(&done);
}

void
CodeGeneratorX86Shared::visitSqrtD(LSqrtD* ins)
{
    FloatRegister input = ToFloatRegister(ins->input());
    FloatRegister output = ToFloatRegister(ins->output());
    masm.vsqrtsd(input, output, output);
}

void
CodeGeneratorX86Shared::visitSqrtF(LSqrtF* ins)
{
    FloatRegister input = ToFloatRegister(ins->input());
    FloatRegister output = ToFloatRegister(ins->output());
    masm.vsqrtss(input, output, output);
}

void
CodeGeneratorX86Shared::visitPowHalfD(LPowHalfD* ins)
{
    FloatRegister input = ToFloatRegister(ins->input());
    FloatRegister output = ToFloatRegister(ins->output());

    Label done, sqrt;

    if (!ins->mir()->operandIsNeverNegativeInfinity()) {
        // Branch if not -Infinity.
        masm.loadConstantDouble(NegativeInfinity<double>(), ScratchDoubleReg);

        Assembler::DoubleCondition cond = Assembler::DoubleNotEqualOrUnordered;
        if (ins->mir()->operandIsNeverNaN())
            cond = Assembler::DoubleNotEqual;
        masm.branchDouble(cond, input, ScratchDoubleReg, &sqrt);

        // Math.pow(-Infinity, 0.5) == Infinity.
        masm.zeroDouble(input);
        masm.subDouble(ScratchDoubleReg, input);
        masm.jump(&done);

        masm.bind(&sqrt);
    }

    if (!ins->mir()->operandIsNeverNegativeZero()) {
        // Math.pow(-0, 0.5) == 0 == Math.pow(0, 0.5). Adding 0 converts any -0 to 0.
        masm.zeroDouble(ScratchDoubleReg);
        masm.addDouble(ScratchDoubleReg, input);
    }

    masm.vsqrtsd(input, output, output);

    masm.bind(&done);
}

class OutOfLineUndoALUOperation : public OutOfLineCodeBase<CodeGeneratorX86Shared>
{
    LInstruction* ins_;

  public:
    explicit OutOfLineUndoALUOperation(LInstruction* ins)
        : ins_(ins)
    { }

    virtual void accept(CodeGeneratorX86Shared* codegen) {
        codegen->visitOutOfLineUndoALUOperation(this);
    }
    LInstruction* ins() const {
        return ins_;
    }
};

void
CodeGeneratorX86Shared::visitAddI(LAddI* ins)
{
    if (ins->rhs()->isConstant())
        masm.addl(Imm32(ToInt32(ins->rhs())), ToOperand(ins->lhs()));
    else
        masm.addl(ToOperand(ins->rhs()), ToRegister(ins->lhs()));

    if (ins->snapshot()) {
        if (ins->recoversInput()) {
            OutOfLineUndoALUOperation* ool = new(alloc()) OutOfLineUndoALUOperation(ins);
            addOutOfLineCode(ool, ins->mir());
            masm.j(Assembler::Overflow, ool->entry());
        } else {
            bailoutIf(Assembler::Overflow, ins->snapshot());
        }
    }
}

void
CodeGeneratorX86Shared::visitSubI(LSubI* ins)
{
    if (ins->rhs()->isConstant())
        masm.subl(Imm32(ToInt32(ins->rhs())), ToOperand(ins->lhs()));
    else
        masm.subl(ToOperand(ins->rhs()), ToRegister(ins->lhs()));

    if (ins->snapshot()) {
        if (ins->recoversInput()) {
            OutOfLineUndoALUOperation* ool = new(alloc()) OutOfLineUndoALUOperation(ins);
            addOutOfLineCode(ool, ins->mir());
            masm.j(Assembler::Overflow, ool->entry());
        } else {
            bailoutIf(Assembler::Overflow, ins->snapshot());
        }
    }
}

void
CodeGeneratorX86Shared::visitOutOfLineUndoALUOperation(OutOfLineUndoALUOperation* ool)
{
    LInstruction* ins = ool->ins();
    Register reg = ToRegister(ins->getDef(0));

    mozilla::DebugOnly<LAllocation*> lhs = ins->getOperand(0);
    LAllocation* rhs = ins->getOperand(1);

    MOZ_ASSERT(reg == ToRegister(lhs));
    MOZ_ASSERT_IF(rhs->isGeneralReg(), reg != ToRegister(rhs));

    // Undo the effect of the ALU operation, which was performed on the output
    // register and overflowed. Writing to the output register clobbered an
    // input reg, and the original value of the input needs to be recovered
    // to satisfy the constraint imposed by any RECOVERED_INPUT operands to
    // the bailout snapshot.

    if (rhs->isConstant()) {
        Imm32 constant(ToInt32(rhs));
        if (ins->isAddI())
            masm.subl(constant, reg);
        else
            masm.addl(constant, reg);
    } else {
        if (ins->isAddI())
            masm.subl(ToOperand(rhs), reg);
        else
            masm.addl(ToOperand(rhs), reg);
    }

    bailout(ool->ins()->snapshot());
}

class MulNegativeZeroCheck : public OutOfLineCodeBase<CodeGeneratorX86Shared>
{
    LMulI* ins_;

  public:
    explicit MulNegativeZeroCheck(LMulI* ins)
      : ins_(ins)
    { }

    virtual void accept(CodeGeneratorX86Shared* codegen) {
        codegen->visitMulNegativeZeroCheck(this);
    }
    LMulI* ins() const {
        return ins_;
    }
};

void
CodeGeneratorX86Shared::visitMulI(LMulI* ins)
{
    const LAllocation* lhs = ins->lhs();
    const LAllocation* rhs = ins->rhs();
    MMul* mul = ins->mir();
    MOZ_ASSERT_IF(mul->mode() == MMul::Integer, !mul->canBeNegativeZero() && !mul->canOverflow());

    if (rhs->isConstant()) {
        // Bailout on -0.0
        int32_t constant = ToInt32(rhs);
        if (mul->canBeNegativeZero() && constant <= 0) {
            Assembler::Condition bailoutCond = (constant == 0) ? Assembler::Signed : Assembler::Equal;
            masm.test32(ToRegister(lhs), ToRegister(lhs));
            bailoutIf(bailoutCond, ins->snapshot());
        }

        switch (constant) {
          case -1:
            masm.negl(ToOperand(lhs));
            break;
          case 0:
            masm.xorl(ToOperand(lhs), ToRegister(lhs));
            return; // escape overflow check;
          case 1:
            // nop
            return; // escape overflow check;
          case 2:
            masm.addl(ToOperand(lhs), ToRegister(lhs));
            break;
          default:
            if (!mul->canOverflow() && constant > 0) {
                // Use a shift if the multiplication cannot overflow and the
                // constant is a power of 2.
                int32_t shift = FloorLog2(constant);
                if ((1 << shift) == constant) {
                    masm.shll(Imm32(shift), ToRegister(lhs));
                    return;
                }
            }
            masm.imull(Imm32(ToInt32(rhs)), ToRegister(lhs));
        }

        // Bailout on overflow
        if (mul->canOverflow())
            bailoutIf(Assembler::Overflow, ins->snapshot());
    } else {
        masm.imull(ToOperand(rhs), ToRegister(lhs));

        // Bailout on overflow
        if (mul->canOverflow())
            bailoutIf(Assembler::Overflow, ins->snapshot());

        if (mul->canBeNegativeZero()) {
            // Jump to an OOL path if the result is 0.
            MulNegativeZeroCheck* ool = new(alloc()) MulNegativeZeroCheck(ins);
            addOutOfLineCode(ool, mul);

            masm.test32(ToRegister(lhs), ToRegister(lhs));
            masm.j(Assembler::Zero, ool->entry());
            masm.bind(ool->rejoin());
        }
    }
}

class ReturnZero : public OutOfLineCodeBase<CodeGeneratorX86Shared>
{
    Register reg_;

  public:
    explicit ReturnZero(Register reg)
      : reg_(reg)
    { }

    virtual void accept(CodeGeneratorX86Shared* codegen) {
        codegen->visitReturnZero(this);
    }
    Register reg() const {
        return reg_;
    }
};

void
CodeGeneratorX86Shared::visitReturnZero(ReturnZero* ool)
{
    masm.mov(ImmWord(0), ool->reg());
    masm.jmp(ool->rejoin());
}

void
CodeGeneratorX86Shared::visitUDivOrMod(LUDivOrMod* ins)
{
    Register lhs = ToRegister(ins->lhs());
    Register rhs = ToRegister(ins->rhs());
    Register output = ToRegister(ins->output());

    MOZ_ASSERT_IF(lhs != rhs, rhs != eax);
    MOZ_ASSERT(rhs != edx);
    MOZ_ASSERT_IF(output == eax, ToRegister(ins->remainder()) == edx);

    ReturnZero* ool = nullptr;

    // Put the lhs in eax.
    if (lhs != eax)
        masm.mov(lhs, eax);

    // Prevent divide by zero.
    if (ins->canBeDivideByZero()) {
        masm.test32(rhs, rhs);
        if (ins->mir()->isTruncated()) {
            if (!ool)
                ool = new(alloc()) ReturnZero(output);
            masm.j(Assembler::Zero, ool->entry());
        } else {
            bailoutIf(Assembler::Zero, ins->snapshot());
        }
    }

    // Zero extend the lhs into edx to make (edx:eax), since udiv is 64-bit.
    masm.mov(ImmWord(0), edx);
    masm.udiv(rhs);

    // If the remainder is > 0, bailout since this must be a double.
    if (ins->mir()->isDiv() && !ins->mir()->toDiv()->canTruncateRemainder()) {
        Register remainder = ToRegister(ins->remainder());
        masm.test32(remainder, remainder);
        bailoutIf(Assembler::NonZero, ins->snapshot());
    }

    // Unsigned div or mod can return a value that's not a signed int32.
    // If our users aren't expecting that, bail.
    if (!ins->mir()->isTruncated()) {
        masm.test32(output, output);
        bailoutIf(Assembler::Signed, ins->snapshot());
    }

    if (ool) {
        addOutOfLineCode(ool, ins->mir());
        masm.bind(ool->rejoin());
    }
}

void
CodeGeneratorX86Shared::visitMulNegativeZeroCheck(MulNegativeZeroCheck* ool)
{
    LMulI* ins = ool->ins();
    Register result = ToRegister(ins->output());
    Operand lhsCopy = ToOperand(ins->lhsCopy());
    Operand rhs = ToOperand(ins->rhs());
    MOZ_ASSERT_IF(lhsCopy.kind() == Operand::REG, lhsCopy.reg() != result.code());

    // Result is -0 if lhs or rhs is negative.
    masm.movl(lhsCopy, result);
    masm.orl(rhs, result);
    bailoutIf(Assembler::Signed, ins->snapshot());

    masm.mov(ImmWord(0), result);
    masm.jmp(ool->rejoin());
}

void
CodeGeneratorX86Shared::visitDivPowTwoI(LDivPowTwoI* ins)
{
    Register lhs = ToRegister(ins->numerator());
    mozilla::DebugOnly<Register> output = ToRegister(ins->output());

    int32_t shift = ins->shift();
    bool negativeDivisor = ins->negativeDivisor();
    MDiv* mir = ins->mir();

    // We use defineReuseInput so these should always be the same, which is
    // convenient since all of our instructions here are two-address.
    MOZ_ASSERT(lhs == output);

    if (!mir->isTruncated() && negativeDivisor) {
        // Zero divided by a negative number gives -0, which must be
        // represented as a double.
        masm.test32(lhs, lhs);
        bailoutIf(Assembler::Zero, ins->snapshot());
    }

    if (shift != 0) {
        if (!mir->isTruncated()) {
            // If the remainder is != 0, bailout since this must be a double.
            masm.test32(lhs, Imm32(UINT32_MAX >> (32 - shift)));
            bailoutIf(Assembler::NonZero, ins->snapshot());
        }

        // Adjust the value so that shifting produces a correctly rounded result
        // when the numerator is negative. See 10-1 "Signed Division by a Known
        // Power of 2" in Henry S. Warren, Jr.'s Hacker's Delight.
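        // For example, with shift == 2 and a numerator of -7: the bias
        // computed below is (1 << 2) - 1 == 3, so we arithmetically shift
        // (-7 + 3) == -4 right by 2 and get -1, the correctly truncated
        // quotient, whereas -7 >> 2 alone would give the floored value -2.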
        if (mir->canBeNegativeDividend()) {
            Register lhsCopy = ToRegister(ins->numeratorCopy());
            MOZ_ASSERT(lhsCopy != lhs);
            if (shift > 1)
                masm.sarl(Imm32(31), lhs);
            masm.shrl(Imm32(32 - shift), lhs);
            masm.addl(lhsCopy, lhs);
        }

        masm.sarl(Imm32(shift), lhs);
        if (negativeDivisor)
            masm.negl(lhs);
    } else if (shift == 0 && negativeDivisor) {
        // INT32_MIN / -1 overflows.
        masm.negl(lhs);
        if (!mir->isTruncated())
            bailoutIf(Assembler::Overflow, ins->snapshot());
    }
}

void
CodeGeneratorX86Shared::visitDivOrModConstantI(LDivOrModConstantI* ins) {
    Register lhs = ToRegister(ins->numerator());
    Register output = ToRegister(ins->output());
    int32_t d = ins->denominator();

    // This emits the division answer into edx or the modulus answer into eax.
    MOZ_ASSERT(output == eax || output == edx);
    MOZ_ASSERT(lhs != eax && lhs != edx);
    bool isDiv = (output == edx);

    // The absolute value of the denominator isn't a power of 2 (see LDivPowTwoI
    // and LModPowTwoI).
    MOZ_ASSERT((Abs(d) & (Abs(d) - 1)) != 0);

    // We will first divide by Abs(d), and negate the answer if d is negative.
    // If desired, this can be avoided by generalizing computeDivisionConstants.
    ReciprocalMulConstants rmc = computeDivisionConstants(Abs(d));

    // As explained in the comments of computeDivisionConstants, we first compute
    // X >> (32 + shift), where X is either (rmc.multiplier * n) if the multiplier
    // is non-negative or (rmc.multiplier * n) + (2^32 * n) otherwise. This is the
    // desired division result if n is non-negative, and is one less than the result
    // otherwise.
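    // As a concrete illustration (assuming d == 3, for which one valid pair is
    // multiplier 0x55555556 with shiftAmount 0; computeDivisionConstants may
    // pick a different but equivalent pair): for n == 7, (0x55555556 * 7) >> 32
    // is 2, and for n == -7 it is -3, one less than the truncated quotient -2,
    // which the sign-extension fixup below corrects.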
    masm.movl(Imm32(rmc.multiplier), eax);
    masm.imull(lhs);
    if (rmc.multiplier < 0)
        masm.addl(lhs, edx);
    masm.sarl(Imm32(rmc.shiftAmount), edx);

    // We'll subtract -1 instead of adding 1, because (n < 0 ? -1 : 0) can be
    // computed with just a sign-extending shift of 31 bits.
    if (ins->canBeNegativeDividend()) {
        masm.movl(lhs, eax);
        masm.sarl(Imm32(31), eax);
        masm.subl(eax, edx);
    }

    // After this, edx contains the correct truncated division result.
    if (d < 0)
        masm.negl(edx);

    if (!isDiv) {
        masm.imull(Imm32(-d), edx, eax);
        masm.addl(lhs, eax);
    }

    if (!ins->mir()->isTruncated()) {
        if (isDiv) {
            // This is a division op. Multiply the obtained value by d to check if
            // the correct answer is an integer. This cannot overflow, since |d| > 1.
            masm.imull(Imm32(d), edx, eax);
            masm.cmp32(lhs, eax);
            bailoutIf(Assembler::NotEqual, ins->snapshot());

            // If lhs is zero and the divisor is negative, the answer should have
            // been -0.
            if (d < 0) {
                masm.test32(lhs, lhs);
                bailoutIf(Assembler::Zero, ins->snapshot());
            }
        } else if (ins->canBeNegativeDividend()) {
            // This is a mod op. If the computed value is zero and lhs
            // is negative, the answer should have been -0.
            Label done;

            masm.cmp32(lhs, Imm32(0));
            masm.j(Assembler::GreaterThanOrEqual, &done);

            masm.test32(eax, eax);
            bailoutIf(Assembler::Zero, ins->snapshot());

            masm.bind(&done);
        }
    }
}

void
CodeGeneratorX86Shared::visitDivI(LDivI* ins)
{
    Register remainder = ToRegister(ins->remainder());
    Register lhs = ToRegister(ins->lhs());
    Register rhs = ToRegister(ins->rhs());
    Register output = ToRegister(ins->output());

    MDiv* mir = ins->mir();

    MOZ_ASSERT_IF(lhs != rhs, rhs != eax);
    MOZ_ASSERT(rhs != edx);
    MOZ_ASSERT(remainder == edx);
    MOZ_ASSERT(output == eax);

    Label done;
    ReturnZero* ool = nullptr;

    // Put the lhs in eax, for either the negative overflow case or the regular
    // divide case.
    if (lhs != eax)
        masm.mov(lhs, eax);

    // Handle divide by zero.
    if (mir->canBeDivideByZero()) {
        masm.test32(rhs, rhs);
        if (mir->canTruncateInfinities()) {
            // Truncated division by zero is zero (Infinity|0 == 0)
            if (!ool)
                ool = new(alloc()) ReturnZero(output);
            masm.j(Assembler::Zero, ool->entry());
        } else {
            MOZ_ASSERT(mir->fallible());
            bailoutIf(Assembler::Zero, ins->snapshot());
        }
    }

    // Handle an integer overflow exception from -2147483648 / -1.
    if (mir->canBeNegativeOverflow()) {
        Label notmin;
        masm.cmp32(lhs, Imm32(INT32_MIN));
        masm.j(Assembler::NotEqual, &notmin);
        masm.cmp32(rhs, Imm32(-1));
        if (mir->canTruncateOverflow()) {
            // (-INT32_MIN)|0 == INT32_MIN and INT32_MIN is already in the
            // output register (lhs == eax).
            masm.j(Assembler::Equal, &done);
        } else {
            MOZ_ASSERT(mir->fallible());
            bailoutIf(Assembler::Equal, ins->snapshot());
        }
        masm.bind(&notmin);
    }

    // Handle negative 0.
    if (!mir->canTruncateNegativeZero() && mir->canBeNegativeZero()) {
        Label nonzero;
        masm.test32(lhs, lhs);
        masm.j(Assembler::NonZero, &nonzero);
        masm.cmp32(rhs, Imm32(0));
        bailoutIf(Assembler::LessThan, ins->snapshot());
        masm.bind(&nonzero);
    }

    // Sign extend the lhs into edx to make (edx:eax), since idiv is 64-bit.
    if (lhs != eax)
        masm.mov(lhs, eax);
    masm.cdq();
    masm.idiv(rhs);

    if (!mir->canTruncateRemainder()) {
        // If the remainder is > 0, bailout since this must be a double.
        masm.test32(remainder, remainder);
        bailoutIf(Assembler::NonZero, ins->snapshot());
    }

    masm.bind(&done);

    if (ool) {
        addOutOfLineCode(ool, mir);
        masm.bind(ool->rejoin());
    }
}

void
CodeGeneratorX86Shared::visitModPowTwoI(LModPowTwoI* ins)
{
    Register lhs = ToRegister(ins->getOperand(0));
    int32_t shift = ins->shift();

    Label negative;

    if (ins->mir()->canBeNegativeDividend()) {
        // Switch based on the sign of the lhs.
        // Non-negative numbers only need a bitmask.
        masm.branchTest32(Assembler::Signed, lhs, lhs, &negative);
    }

    masm.andl(Imm32((uint32_t(1) << shift) - 1), lhs);

    if (ins->mir()->canBeNegativeDividend()) {
        Label done;
        masm.jump(&done);

        // Negative numbers need a negate, bitmask, negate
        masm.bind(&negative);

        // Unlike in the visitModI case, we are not computing the mod by means of a
        // division. Therefore, the divisor = -1 case isn't problematic (the andl
        // always returns 0, which is what we expect).
        //
        // The negl instruction overflows if lhs == INT32_MIN, but this is also not
        // a problem: shift is at most 31, and so the andl also always returns 0.
        masm.negl(lhs);
        masm.andl(Imm32((uint32_t(1) << shift) - 1), lhs);
        masm.negl(lhs);
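        // E.g. lhs == -7 with shift == 2: negate to 7, mask to 3, negate back
        // to -3, matching JS's (-7 % 4) == -3.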

        // Since a % b has the same sign as a, and a is negative in this branch,
        // an answer of 0 means the correct result is actually -0. Bail out.
        if (!ins->mir()->isTruncated())
            bailoutIf(Assembler::Zero, ins->snapshot());
        masm.bind(&done);
    }
}

class ModOverflowCheck : public OutOfLineCodeBase<CodeGeneratorX86Shared>
{
    Label done_;
    LModI* ins_;
    Register rhs_;

  public:
    explicit ModOverflowCheck(LModI* ins, Register rhs)
      : ins_(ins), rhs_(rhs)
    { }

    virtual void accept(CodeGeneratorX86Shared* codegen) {
        codegen->visitModOverflowCheck(this);
    }
    Label* done() {
        return &done_;
    }
    LModI* ins() const {
        return ins_;
    }
    Register rhs() const {
        return rhs_;
    }
};

void
CodeGeneratorX86Shared::visitModOverflowCheck(ModOverflowCheck* ool)
{
    masm.cmp32(ool->rhs(), Imm32(-1));
    if (ool->ins()->mir()->isTruncated()) {
        masm.j(Assembler::NotEqual, ool->rejoin());
        masm.mov(ImmWord(0), edx);
        masm.jmp(ool->done());
    } else {
        bailoutIf(Assembler::Equal, ool->ins()->snapshot());
        masm.jmp(ool->rejoin());
    }
}

void
CodeGeneratorX86Shared::visitModI(LModI* ins)
{
    Register remainder = ToRegister(ins->remainder());
    Register lhs = ToRegister(ins->lhs());
    Register rhs = ToRegister(ins->rhs());

    // Required to use idiv.
    MOZ_ASSERT_IF(lhs != rhs, rhs != eax);
    MOZ_ASSERT(rhs != edx);
    MOZ_ASSERT(remainder == edx);
    MOZ_ASSERT(ToRegister(ins->getTemp(0)) == eax);

    Label done;
    ReturnZero* ool = nullptr;
    ModOverflowCheck* overflow = nullptr;

    // Set up eax in preparation for doing a div.
    if (lhs != eax)
        masm.mov(lhs, eax);

    // Prevent divide by zero.
    if (ins->mir()->canBeDivideByZero()) {
        masm.test32(rhs, rhs);
        if (ins->mir()->isTruncated()) {
            if (!ool)
                ool = new(alloc()) ReturnZero(edx);
            masm.j(Assembler::Zero, ool->entry());
        } else {
            bailoutIf(Assembler::Zero, ins->snapshot());
        }
    }

    Label negative;

    // Switch based on sign of the lhs.
    if (ins->mir()->canBeNegativeDividend())
        masm.branchTest32(Assembler::Signed, lhs, lhs, &negative);

    // If lhs >= 0 then remainder = lhs % rhs. The remainder is non-negative.
    {
        // Check if rhs is a power-of-two.
        if (ins->mir()->canBePowerOfTwoDivisor()) {
            MOZ_ASSERT(rhs != remainder);

            // Rhs y is a power-of-two if (y & (y-1)) == 0. Note that if
            // y is any negative number other than INT32_MIN, both y and
            // y-1 will have the sign bit set so these are never optimized
            // as powers-of-two. If y is INT32_MIN, y-1 will be INT32_MAX
            // and because lhs >= 0 at this point, lhs & INT32_MAX returns
            // the correct value.
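            // E.g. rhs == 8: 8 & 7 == 0, so we take the andl fast path below,
            // while rhs == 12: 12 & 11 == 8 != 0, so we fall through to idiv.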
            Label notPowerOfTwo;
            masm.mov(rhs, remainder);
            masm.subl(Imm32(1), remainder);
            masm.branchTest32(Assembler::NonZero, remainder, rhs, &notPowerOfTwo);
            {
                masm.andl(lhs, remainder);
                masm.jmp(&done);
            }
            masm.bind(&notPowerOfTwo);
        }

        // Since lhs >= 0, the sign-extension will be 0
        masm.mov(ImmWord(0), edx);
        masm.idiv(rhs);
    }

    // Otherwise, we have to beware of two special cases:
    if (ins->mir()->canBeNegativeDividend()) {
        masm.jump(&done);

        masm.bind(&negative);

        // Prevent an integer overflow exception from -2147483648 % -1
        Label notmin;
        masm.cmp32(lhs, Imm32(INT32_MIN));
        overflow = new(alloc()) ModOverflowCheck(ins, rhs);
        masm.j(Assembler::Equal, overflow->entry());
        masm.bind(overflow->rejoin());
        masm.cdq();
        masm.idiv(rhs);

        if (!ins->mir()->isTruncated()) {
            // A remainder of 0 means that the rval must be -0, which is a double.
            masm.test32(remainder, remainder);
            bailoutIf(Assembler::Zero, ins->snapshot());
        }
    }

    masm.bind(&done);

    if (overflow) {
        addOutOfLineCode(overflow, ins->mir());
        masm.bind(overflow->done());
    }

    if (ool) {
        addOutOfLineCode(ool, ins->mir());
        masm.bind(ool->rejoin());
    }
}

void
CodeGeneratorX86Shared::visitBitNotI(LBitNotI* ins)
{
    const LAllocation* input = ins->getOperand(0);
    MOZ_ASSERT(!input->isConstant());

    masm.notl(ToOperand(input));
}

void
CodeGeneratorX86Shared::visitBitOpI(LBitOpI* ins)
{
    const LAllocation* lhs = ins->getOperand(0);
    const LAllocation* rhs = ins->getOperand(1);

    switch (ins->bitop()) {
        case JSOP_BITOR:
            if (rhs->isConstant())
                masm.orl(Imm32(ToInt32(rhs)), ToOperand(lhs));
            else
                masm.orl(ToOperand(rhs), ToRegister(lhs));
            break;
        case JSOP_BITXOR:
            if (rhs->isConstant())
                masm.xorl(Imm32(ToInt32(rhs)), ToOperand(lhs));
            else
                masm.xorl(ToOperand(rhs), ToRegister(lhs));
            break;
        case JSOP_BITAND:
            if (rhs->isConstant())
                masm.andl(Imm32(ToInt32(rhs)), ToOperand(lhs));
            else
                masm.andl(ToOperand(rhs), ToRegister(lhs));
            break;
        default:
            MOZ_CRASH("unexpected binary opcode");
    }
}

void
CodeGeneratorX86Shared::visitShiftI(LShiftI* ins)
{
    Register lhs = ToRegister(ins->lhs());
    const LAllocation* rhs = ins->rhs();

    if (rhs->isConstant()) {
        int32_t shift = ToInt32(rhs) & 0x1F;
        switch (ins->bitop()) {
          case JSOP_LSH:
            if (shift)
                masm.shll(Imm32(shift), lhs);
            break;
          case JSOP_RSH:
            if (shift)
                masm.sarl(Imm32(shift), lhs);
            break;
          case JSOP_URSH:
            if (shift) {
                masm.shrl(Imm32(shift), lhs);
            } else if (ins->mir()->toUrsh()->fallible()) {
                // x >>> 0 can yield a result that doesn't fit in a signed int32.
                masm.test32(lhs, lhs);
                bailoutIf(Assembler::Signed, ins->snapshot());
            }
            break;
          default:
            MOZ_CRASH("Unexpected shift op");
        }
    } else {
        MOZ_ASSERT(ToRegister(rhs) == ecx);
        switch (ins->bitop()) {
          case JSOP_LSH:
            masm.shll_cl(lhs);
            break;
          case JSOP_RSH:
            masm.sarl_cl(lhs);
            break;
          case JSOP_URSH:
            masm.shrl_cl(lhs);
            if (ins->mir()->toUrsh()->fallible()) {
                // x >>> 0 can yield a result that doesn't fit in a signed int32.
                masm.test32(lhs, lhs);
                bailoutIf(Assembler::Signed, ins->snapshot());
            }
            break;
          default:
            MOZ_CRASH("Unexpected shift op");
        }
    }
}

void
CodeGeneratorX86Shared::visitUrshD(LUrshD* ins)
{
    Register lhs = ToRegister(ins->lhs());
    MOZ_ASSERT(ToRegister(ins->temp()) == lhs);

    const LAllocation* rhs = ins->rhs();
    FloatRegister out = ToFloatRegister(ins->output());

    if (rhs->isConstant()) {
        int32_t shift = ToInt32(rhs) & 0x1F;
        if (shift)
            masm.shrl(Imm32(shift), lhs);
    } else {
        MOZ_ASSERT(ToRegister(rhs) == ecx);
        masm.shrl_cl(lhs);
    }

    masm.convertUInt32ToDouble(lhs, out);
}

MoveOperand
CodeGeneratorX86Shared::toMoveOperand(const LAllocation* a) const
{
    if (a->isGeneralReg())
        return MoveOperand(ToRegister(a));
    if (a->isFloatReg())
        return MoveOperand(ToFloatRegister(a));
    return MoveOperand(StackPointer, ToStackOffset(a));
}

class OutOfLineTableSwitch : public OutOfLineCodeBase<CodeGeneratorX86Shared>
{
    MTableSwitch* mir_;
    CodeLabel jumpLabel_;

    void accept(CodeGeneratorX86Shared* codegen) {
        codegen->visitOutOfLineTableSwitch(this);
    }

  public:
    explicit OutOfLineTableSwitch(MTableSwitch* mir)
      : mir_(mir)
    {}

    MTableSwitch* mir() const {
        return mir_;
    }

    CodeLabel* jumpLabel() {
        return &jumpLabel_;
    }
};

void
CodeGeneratorX86Shared::visitOutOfLineTableSwitch(OutOfLineTableSwitch* ool)
{
    MTableSwitch* mir = ool->mir();

    masm.haltingAlign(sizeof(void*));
    masm.bind(ool->jumpLabel()->src());
    masm.addCodeLabel(*ool->jumpLabel());

    for (size_t i = 0; i < mir->numCases(); i++) {
        LBlock* caseblock = skipTrivialBlocks(mir->getCase(i))->lir();
        Label* caseheader = caseblock->label();
        uint32_t caseoffset = caseheader->offset();

        // The entries of the jump table need to be absolute addresses and thus
        // must be patched after codegen is finished.
        CodeLabel cl;
        masm.writeCodePointer(cl.dest());
        cl.src()->bind(caseoffset);
        masm.addCodeLabel(cl);
    }
}

void
CodeGeneratorX86Shared::emitTableSwitchDispatch(MTableSwitch* mir, Register index, Register base)
{
    Label* defaultcase = skipTrivialBlocks(mir->getDefault())->lir()->label();

    // Normalize the index by subtracting the low bound of the case range.
    if (mir->low() != 0)
        masm.subl(Imm32(mir->low()), index);

    // Jump to default case if input is out of range
    int32_t cases = mir->numCases();
    masm.cmp32(index, Imm32(cases));
    masm.j(AssemblerX86Shared::AboveOrEqual, defaultcase);

    // To fill in the CodeLabels for the case entries, we need to first
    // generate the case entries (we don't yet know their offsets in the
    // instruction stream).
    OutOfLineTableSwitch* ool = new(alloc()) OutOfLineTableSwitch(mir);
    addOutOfLineCode(ool, mir);

    // Compute the address of the jump-table entry for this case:
    // base + index * sizeof(void*).
    masm.mov(ool->jumpLabel()->dest(), base);
    Operand pointer = Operand(base, index, ScalePointer);

    // Jump to the right case
    masm.jmp(pointer);
}

void
CodeGeneratorX86Shared::visitMathD(LMathD* math)
{
    FloatRegister lhs = ToFloatRegister(math->lhs());
    Operand rhs = ToOperand(math->rhs());
    FloatRegister output = ToFloatRegister(math->output());

    switch (math->jsop()) {
      case JSOP_ADD:
        masm.vaddsd(rhs, lhs, output);
        break;
      case JSOP_SUB:
        masm.vsubsd(rhs, lhs, output);
        break;
      case JSOP_MUL:
        masm.vmulsd(rhs, lhs, output);
        break;
      case JSOP_DIV:
        masm.vdivsd(rhs, lhs, output);
        break;
      default:
        MOZ_CRASH("unexpected opcode");
    }
}

void
CodeGeneratorX86Shared::visitMathF(LMathF* math)
{
    FloatRegister lhs = ToFloatRegister(math->lhs());
    Operand rhs = ToOperand(math->rhs());
    FloatRegister output = ToFloatRegister(math->output());

    switch (math->jsop()) {
      case JSOP_ADD:
        masm.vaddss(rhs, lhs, output);
        break;
      case JSOP_SUB:
        masm.vsubss(rhs, lhs, output);
        break;
      case JSOP_MUL:
        masm.vmulss(rhs, lhs, output);
        break;
      case JSOP_DIV:
        masm.vdivss(rhs, lhs, output);
        break;
      default:
        MOZ_CRASH("unexpected opcode");
    }
}

void
CodeGeneratorX86Shared::visitFloor(LFloor* lir)
{
    FloatRegister input = ToFloatRegister(lir->input());
    FloatRegister scratch = ScratchDoubleReg;
    Register output = ToRegister(lir->output());

    Label bailout;

    if (AssemblerX86Shared::HasSSE41()) {
        // Bail on negative-zero.
        masm.branchNegativeZero(input, output, &bailout);
        bailoutFrom(&bailout, lir->snapshot());

        // Round toward -Infinity.
        masm.vroundsd(X86Encoding::RoundDown, input, scratch, scratch);

        bailoutCvttsd2si(scratch, output, lir->snapshot());
    } else {
        Label negative, end;

        // Branch to a slow path for negative inputs. Doesn't catch NaN or -0.
        masm.zeroDouble(scratch);
        masm.branchDouble(Assembler::DoubleLessThan, input, scratch, &negative);

        // Bail on negative-zero.
        masm.branchNegativeZero(input, output, &bailout);
        bailoutFrom(&bailout, lir->snapshot());

        // Input is non-negative, so truncation correctly rounds.
        bailoutCvttsd2si(input, output, lir->snapshot());

        masm.jump(&end);

        // Input is negative, but isn't -0.
        // Negative values go on a comparatively expensive path, since no
        // native rounding mode matches JS semantics. Still better than callVM.
        masm.bind(&negative);
        {
            // Truncate and round toward zero.
            // This is off-by-one for everything but integer-valued inputs.
            bailoutCvttsd2si(input, output, lir->snapshot());

            // Test whether the input double was integer-valued.
            masm.convertInt32ToDouble(output, scratch);
            masm.branchDouble(Assembler::DoubleEqualOrUnordered, input, scratch, &end);

            // Input is not integer-valued, so we rounded off-by-one in the
            // wrong direction. Correct by subtraction.
            masm.subl(Imm32(1), output);
            // Cannot overflow: output was already checked against INT_MIN.
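            // E.g. input == -2.5: truncation gives -2, which converts back to
            // -2.0 != -2.5, so we subtract 1 and produce -3 == floor(-2.5).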
        }

        masm.bind(&end);
    }
}

void
CodeGeneratorX86Shared::visitFloorF(LFloorF* lir)
{
    FloatRegister input = ToFloatRegister(lir->input());
    FloatRegister scratch = ScratchFloat32Reg;
    Register output = ToRegister(lir->output());

    Label bailout;

    if (AssemblerX86Shared::HasSSE41()) {
        // Bail on negative-zero.
        masm.branchNegativeZeroFloat32(input, output, &bailout);
        bailoutFrom(&bailout, lir->snapshot());

        // Round toward -Infinity.
        masm.vroundss(X86Encoding::RoundDown, input, scratch, scratch);

        bailoutCvttss2si(scratch, output, lir->snapshot());
    } else {
        Label negative, end;

        // Branch to a slow path for negative inputs. Doesn't catch NaN or -0.
        masm.zeroFloat32(scratch);
        masm.branchFloat(Assembler::DoubleLessThan, input, scratch, &negative);

        // Bail on negative-zero.
        masm.branchNegativeZeroFloat32(input, output, &bailout);
        bailoutFrom(&bailout, lir->snapshot());

        // Input is non-negative, so truncation correctly rounds.
        bailoutCvttss2si(input, output, lir->snapshot());

        masm.jump(&end);

        // Input is negative, but isn't -0.
        // Negative values go on a comparatively expensive path, since no
        // native rounding mode matches JS semantics. Still better than callVM.
        masm.bind(&negative);
        {
            // Truncate and round toward zero.
            // This is off-by-one for everything but integer-valued inputs.
            bailoutCvttss2si(input, output, lir->snapshot());

            // Test whether the input float was integer-valued.
            masm.convertInt32ToFloat32(output, scratch);
            masm.branchFloat(Assembler::DoubleEqualOrUnordered, input, scratch, &end);

            // Input is not integer-valued, so we rounded off-by-one in the
            // wrong direction. Correct by subtraction.
            masm.subl(Imm32(1), output);
            // Cannot overflow: output was already checked against INT_MIN.
        }

        masm.bind(&end);
    }
}

void
CodeGeneratorX86Shared::visitCeil(LCeil* lir)
{
    FloatRegister input = ToFloatRegister(lir->input());
    FloatRegister scratch = ScratchDoubleReg;
    Register output = ToRegister(lir->output());

    Label bailout, lessThanMinusOne;

    // Bail on ]-1; -0] range
    masm.loadConstantDouble(-1, scratch);
    masm.branchDouble(Assembler::DoubleLessThanOrEqualOrUnordered, input,
                      scratch, &lessThanMinusOne);

    // Test for remaining values with the sign bit set, i.e. ]-1; -0]
    masm.vmovmskpd(input, output);
    masm.branchTest32(Assembler::NonZero, output, Imm32(1), &bailout);
    bailoutFrom(&bailout, lir->snapshot());

    if (AssemblerX86Shared::HasSSE41()) {
        // x <= -1 or x > -0
        masm.bind(&lessThanMinusOne);
        // Round toward +Infinity.
        masm.vroundsd(X86Encoding::RoundUp, input, scratch, scratch);
        bailoutCvttsd2si(scratch, output, lir->snapshot());
        return;
    }

    // No SSE4.1
    Label end;

    // If x >= 0 and x is not -0.0, truncation gives the ceiling for integer
    // values, and truncating then adding 1 gives it for non-integer values.
    // This also works for values >= INT_MAX + 1, as the truncation returns
    // INT_MIN and triggers a bailout.
    bailoutCvttsd2si(input, output, lir->snapshot());
    masm.convertInt32ToDouble(output, scratch);
    masm.branchDouble(Assembler::DoubleEqualOrUnordered, input, scratch, &end);

    // Input is not integer-valued, add 1 to obtain the ceiling value
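    // For example, for an input of 3.2 the truncation yields 3, which does not
    // equal 3.2, so adding 1 gives the expected ceiling of 4.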
    masm.addl(Imm32(1), output);
    // if input > INT_MAX, output == INT_MAX so adding 1 will overflow.
    bailoutIf(Assembler::Overflow, lir->snapshot());
    masm.jump(&end);

    // x <= -1, truncation is the way to go.
    masm.bind(&lessThanMinusOne);
    bailoutCvttsd2si(input, output, lir->snapshot());

    masm.bind(&end);
}

void
CodeGeneratorX86Shared::visitCeilF(LCeilF* lir)
{
    FloatRegister input = ToFloatRegister(lir->input());
    FloatRegister scratch = ScratchFloat32Reg;
    Register output = ToRegister(lir->output());

    Label bailout, lessThanMinusOne;

    // Bail on ]-1; -0] range
    masm.loadConstantFloat32(-1.f, scratch);
    masm.branchFloat(Assembler::DoubleLessThanOrEqualOrUnordered, input,
                     scratch, &lessThanMinusOne);

    // Test for remaining values with the sign bit set, i.e. ]-1; -0]
    masm.vmovmskps(input, output);
    masm.branchTest32(Assembler::NonZero, output, Imm32(1), &bailout);
    bailoutFrom(&bailout, lir->snapshot());

    if (AssemblerX86Shared::HasSSE41()) {
        // x <= -1 or x > -0
        masm.bind(&lessThanMinusOne);
        // Round toward +Infinity.
        masm.vroundss(X86Encoding::RoundUp, input, scratch, scratch);
        bailoutCvttss2si(scratch, output, lir->snapshot());
        return;
    }

    // No SSE4.1
    Label end;

    // If x >= 0 and x is not -0.0, truncation gives the ceiling for integer
    // values, and truncating then adding 1 gives it for non-integer values.
    // This also works for values >= INT_MAX + 1, as the truncation returns
    // INT_MIN and triggers a bailout.
    bailoutCvttss2si(input, output, lir->snapshot());
    masm.convertInt32ToFloat32(output, scratch);
    masm.branchFloat(Assembler::DoubleEqualOrUnordered, input, scratch, &end);

    // Input is not integer-valued, add 1 to obtain the ceiling value
    masm.addl(Imm32(1), output);
    // if input > INT_MAX, output == INT_MAX so adding 1 will overflow.
    bailoutIf(Assembler::Overflow, lir->snapshot());
    masm.jump(&end);

    // x <= -1, truncation is the way to go.
    masm.bind(&lessThanMinusOne);
    bailoutCvttss2si(input, output, lir->snapshot());

    masm.bind(&end);
}

void
CodeGeneratorX86Shared::visitRound(LRound* lir)
{
    FloatRegister input = ToFloatRegister(lir->input());
    FloatRegister temp = ToFloatRegister(lir->temp());
    FloatRegister scratch = ScratchDoubleReg;
    Register output = ToRegister(lir->output());

    Label negativeOrZero, negative, end, bailout;

    // Branch to a slow path for non-positive inputs. Doesn't catch NaN.
    masm.zeroDouble(scratch);
    masm.loadConstantDouble(GetBiggestNumberLessThan(0.5), temp);
    masm.branchDouble(Assembler::DoubleLessThanOrEqual, input, scratch, &negativeOrZero);

    // Input is positive. Add the biggest double less than 0.5 and
    // truncate, rounding down (because if the input is the biggest double less
    // than 0.5, adding 0.5 would undesirably round up to 1). Note that we have
    // to add the input to the temp register because we're not allowed to
    // modify the input register.
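    // For example, the biggest double less than 0.5 is 0.49999999999999994;
    // adding a plain 0.5 to it rounds the sum up to exactly 1.0, which would
    // truncate to 1 even though Math.round of that input must be 0.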
    masm.addDouble(input, temp);
    bailoutCvttsd2si(temp, output, lir->snapshot());

    masm.jump(&end);

    // Input is negative, +0 or -0.
    masm.bind(&negativeOrZero);
    // Branch on negative input.
    masm.j(Assembler::NotEqual, &negative);

    // Bail on negative-zero.
    masm.branchNegativeZero(input, output, &bailout, /* maybeNonZero = */ false);
    bailoutFrom(&bailout, lir->snapshot());

    // Input is +0.
    masm.xor32(output, output);
    masm.jump(&end);

    // Input is negative.
    masm.bind(&negative);

    // Inputs in ]-0.5; 0] need to have 0.5 added to them; other negative
    // inputs need to have the biggest double less than 0.5 added.
    Label loadJoin;
    masm.loadConstantDouble(-0.5, scratch);
    masm.branchDouble(Assembler::DoubleLessThan, input, scratch, &loadJoin);
    masm.loadConstantDouble(0.5, temp);
    masm.bind(&loadJoin);

    if (AssemblerX86Shared::HasSSE41()) {
        // Add 0.5 and round toward -Infinity. The result is stored in the temp
        // register (currently contains 0.5).
        masm.addDouble(input, temp);
        masm.vroundsd(X86Encoding::RoundDown, temp, scratch, scratch);

        // Truncate.
        bailoutCvttsd2si(scratch, output, lir->snapshot());

        // If the result is positive zero, then the actual result is -0. Bail.
        // Otherwise, the truncation will have produced the correct negative integer.
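        // For example, for an input of -0.3 we compute floor(-0.3 + 0.5) = 0,
        // but Math.round(-0.3) is -0, which an int32 result cannot represent.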
        masm.test32(output, output);
        bailoutIf(Assembler::Zero, lir->snapshot());
    } else {
        masm.addDouble(input, temp);

        // Round toward -Infinity without the benefit of ROUNDSD.
        {
            // If input + 0.5 >= 0, input is a negative number >= -0.5 and the result is -0.
            masm.compareDouble(Assembler::DoubleGreaterThanOrEqual, temp, scratch);
            bailoutIf(Assembler::DoubleGreaterThanOrEqual, lir->snapshot());

            // Truncate and round toward zero.
            // This is off-by-one for everything but integer-valued inputs.
            bailoutCvttsd2si(temp, output, lir->snapshot());

            // Test whether the truncated double was integer-valued.
            masm.convertInt32ToDouble(output, scratch);
            masm.branchDouble(Assembler::DoubleEqualOrUnordered, temp, scratch, &end);

            // Input is not integer-valued, so we rounded off-by-one in the
            // wrong direction. Correct by subtraction.
            masm.subl(Imm32(1), output);
            // Cannot overflow: output was already checked against INT_MIN.
        }
    }

    masm.bind(&end);
}

void
CodeGeneratorX86Shared::visitRoundF(LRoundF* lir)
{
    FloatRegister input = ToFloatRegister(lir->input());
    FloatRegister temp = ToFloatRegister(lir->temp());
    FloatRegister scratch = ScratchFloat32Reg;
    Register output = ToRegister(lir->output());

    Label negativeOrZero, negative, end, bailout;

    // Branch to a slow path for non-positive inputs. Doesn't catch NaN.
    masm.zeroFloat32(scratch);
    masm.loadConstantFloat32(GetBiggestNumberLessThan(0.5f), temp);
    masm.branchFloat(Assembler::DoubleLessThanOrEqual, input, scratch, &negativeOrZero);

    // Input is positive. Add the biggest float less than 0.5 and truncate,
    // rounding down (because if the input is the biggest float less than 0.5,
    // adding 0.5 would undesirably round up to 1). Note that we have to add
    // the input to the temp register because we're not allowed to modify the
    // input register.
    masm.addFloat32(input, temp);

    bailoutCvttss2si(temp, output, lir->snapshot());

    masm.jump(&end);

    // Input is negative, +0 or -0.
    masm.bind(&negativeOrZero);
    // Branch on negative input.
    masm.j(Assembler::NotEqual, &negative);

    // Bail on negative-zero.
    masm.branchNegativeZeroFloat32(input, output, &bailout);
    bailoutFrom(&bailout, lir->snapshot());

    // Input is +0.
    masm.xor32(output, output);
    masm.jump(&end);

    // Input is negative.
    masm.bind(&negative);

    // Inputs in ]-0.5; 0] need to have 0.5 added to them; other negative
    // inputs need to have the biggest float less than 0.5 added.
    Label loadJoin;
    masm.loadConstantFloat32(-0.5f, scratch);
    masm.branchFloat(Assembler::DoubleLessThan, input, scratch, &loadJoin);
    masm.loadConstantFloat32(0.5f, temp);
    masm.bind(&loadJoin);

    if (AssemblerX86Shared::HasSSE41()) {
        // Add 0.5 and round toward -Infinity. The result is stored in the temp
        // register (currently contains 0.5).
        masm.addFloat32(input, temp);
        masm.vroundss(X86Encoding::RoundDown, temp, scratch, scratch);

        // Truncate.
        bailoutCvttss2si(scratch, output, lir->snapshot());

        // If the result is positive zero, then the actual result is -0. Bail.
        // Otherwise, the truncation will have produced the correct negative integer.
        masm.test32(output, output);
        bailoutIf(Assembler::Zero, lir->snapshot());
    } else {
        masm.addFloat32(input, temp);
        // Round toward -Infinity without the benefit of ROUNDSS.
        {
            // If input + 0.5 >= 0, input is a negative number >= -0.5 and the result is -0.
            masm.compareFloat(Assembler::DoubleGreaterThanOrEqual, temp, scratch);
            bailoutIf(Assembler::DoubleGreaterThanOrEqual, lir->snapshot());

            // Truncate and round toward zero.
            // This is off-by-one for everything but integer-valued inputs.
            bailoutCvttss2si(temp, output, lir->snapshot());

            // Test whether the truncated float was integer-valued.
            masm.convertInt32ToFloat32(output, scratch);
            masm.branchFloat(Assembler::DoubleEqualOrUnordered, temp, scratch, &end);

            // Input is not integer-valued, so we rounded off-by-one in the
            // wrong direction. Correct by subtraction.
            masm.subl(Imm32(1), output);
            // Cannot overflow: output was already checked against INT_MIN.
        }
    }

    masm.bind(&end);
}

void
CodeGeneratorX86Shared::visitGuardShape(LGuardShape* guard)
{
    Register obj = ToRegister(guard->input());
    masm.cmpPtr(Operand(obj, JSObject::offsetOfShape()), ImmGCPtr(guard->mir()->shape()));

    bailoutIf(Assembler::NotEqual, guard->snapshot());
}

void
CodeGeneratorX86Shared::visitGuardObjectGroup(LGuardObjectGroup* guard)
{
    Register obj = ToRegister(guard->input());

    if (guard->mir()->checkUnboxedExpando()) {
        masm.cmpPtr(Address(obj, UnboxedPlainObject::offsetOfExpando()), ImmWord(0));
        bailoutIf(Assembler::NotEqual, guard->snapshot());
    }

    masm.cmpPtr(Operand(obj, JSObject::offsetOfGroup()), ImmGCPtr(guard->mir()->group()));

    Assembler::Condition cond =
        guard->mir()->bailOnEquality() ? Assembler::Equal : Assembler::NotEqual;
    bailoutIf(cond, guard->snapshot());
}

void
CodeGeneratorX86Shared::visitGuardClass(LGuardClass* guard)
{
    Register obj = ToRegister(guard->input());
    Register tmp = ToRegister(guard->tempInt());

    masm.loadPtr(Address(obj, JSObject::offsetOfGroup()), tmp);
    masm.cmpPtr(Operand(tmp, ObjectGroup::offsetOfClasp()), ImmPtr(guard->mir()->getClass()));
    bailoutIf(Assembler::NotEqual, guard->snapshot());
}

void
CodeGeneratorX86Shared::visitEffectiveAddress(LEffectiveAddress* ins)
{
    const MEffectiveAddress* mir = ins->mir();
    Register base = ToRegister(ins->base());
    Register index = ToRegister(ins->index());
    Register output = ToRegister(ins->output());
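    // A single lea computes base + index * scale + displacement, without
    // modifying the condition flags.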
    masm.leal(Operand(base, index, mir->scale(), mir->displacement()), output);
}

void
CodeGeneratorX86Shared::generateInvalidateEpilogue()
{
    // Ensure that there is enough space in the buffer for the OsiPoint
    // patching to occur. Otherwise, we could overwrite the invalidation
    // epilogue.
    for (size_t i = 0; i < sizeof(void*); i += Assembler::NopSize())
        masm.nop();

    masm.bind(&invalidate_);

    // Push the Ion script onto the stack (the pointer is patched in later, once it is known).
    invalidateEpilogueData_ = masm.pushWithPatch(ImmWord(uintptr_t(-1)));
    JitCode* thunk = gen->jitRuntime()->getInvalidationThunk();

    masm.call(thunk);

    // We should never reach this point in JIT code -- the invalidation thunk should
    // pop the invalidated JS frame and return directly to its caller.
    masm.assumeUnreachable("Should have returned directly to its caller instead of here.");
}

void
CodeGeneratorX86Shared::visitNegI(LNegI* ins)
{
    Register input = ToRegister(ins->input());
    MOZ_ASSERT(input == ToRegister(ins->output()));

    masm.neg32(input);
}

void
CodeGeneratorX86Shared::visitNegD(LNegD* ins)
{
    FloatRegister input = ToFloatRegister(ins->input());
    MOZ_ASSERT(input == ToFloatRegister(ins->output()));

    masm.negateDouble(input);
}

void
CodeGeneratorX86Shared::visitNegF(LNegF* ins)
{
    FloatRegister input = ToFloatRegister(ins->input());
    MOZ_ASSERT(input == ToFloatRegister(ins->output()));

    masm.negateFloat(input);
}

void
CodeGeneratorX86Shared::visitInt32x4(LInt32x4* ins)
{
    const LDefinition* out = ins->getDef(0);
    masm.loadConstantInt32x4(ins->getValue(), ToFloatRegister(out));
}

void
CodeGeneratorX86Shared::visitFloat32x4(LFloat32x4* ins)
{
    const LDefinition* out = ins->getDef(0);
    masm.loadConstantFloat32x4(ins->getValue(), ToFloatRegister(out));
}

void
CodeGeneratorX86Shared::visitInt32x4ToFloat32x4(LInt32x4ToFloat32x4* ins)
{
    FloatRegister in = ToFloatRegister(ins->input());
    FloatRegister out = ToFloatRegister(ins->output());
    masm.convertInt32x4ToFloat32x4(in, out);
}

void
CodeGeneratorX86Shared::visitFloat32x4ToInt32x4(LFloat32x4ToInt32x4* ins)
{
    FloatRegister in = ToFloatRegister(ins->input());
    FloatRegister out = ToFloatRegister(ins->output());
    masm.convertFloat32x4ToInt32x4(in, out);
}

void
CodeGeneratorX86Shared::visitSimdValueInt32x4(LSimdValueInt32x4* ins)
{
    MOZ_ASSERT(ins->mir()->type() == MIRType_Int32x4);

    FloatRegister output = ToFloatRegister(ins->output());
    if (AssemblerX86Shared::HasSSE41()) {
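        // vmovd zero-extends, so lanes 1 to 3 start out as zero; vpinsrd
        // (SSE4.1) then fills them in one at a time.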
        masm.vmovd(ToRegister(ins->getOperand(0)), output);
        for (size_t i = 1; i < 4; ++i) {
            Register r = ToRegister(ins->getOperand(i));
            masm.vpinsrd(i, r, output, output);
        }
        return;
    }

    masm.reserveStack(Simd128DataSize);
    for (size_t i = 0; i < 4; ++i) {
        Register r = ToRegister(ins->getOperand(i));
        masm.store32(r, Address(StackPointer, i * sizeof(int32_t)));
    }
    masm.loadAlignedInt32x4(Address(StackPointer, 0), output);
    masm.freeStack(Simd128DataSize);
}

void
CodeGeneratorX86Shared::visitSimdValueFloat32x4(LSimdValueFloat32x4* ins)
{
    MOZ_ASSERT(ins->mir()->type() == MIRType_Float32x4);

    FloatRegister r0 = ToFloatRegister(ins->getOperand(0));
    FloatRegister r1 = ToFloatRegister(ins->getOperand(1));
    FloatRegister r2 = ToFloatRegister(ins->getOperand(2));
    FloatRegister r3 = ToFloatRegister(ins->getOperand(3));
    FloatRegister tmp = ToFloatRegister(ins->getTemp(0));
    FloatRegister output = ToFloatRegister(ins->output());

    FloatRegister r0Copy = masm.reusedInputFloat32x4(r0, output);
    FloatRegister r1Copy = masm.reusedInputFloat32x4(r1, tmp);

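    // Each input holds its scalar in lane 0. The first two unpacks build
    // tmp = (y, w, _, _) and output = (x, z, _, _); the final unpack then
    // interleaves them into (x, y, z, w).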
    masm.vunpcklps(r3, r1Copy, tmp);
    masm.vunpcklps(r2, r0Copy, output);
    masm.vunpcklps(tmp, output, output);
}

void
CodeGeneratorX86Shared::visitSimdSplatX4(LSimdSplatX4* ins)
{
    FloatRegister output = ToFloatRegister(ins->output());

    MSimdSplatX4* mir = ins->mir();
    MOZ_ASSERT(IsSimdType(mir->type()));
    JS_STATIC_ASSERT(sizeof(float) == sizeof(int32_t));

    switch (mir->type()) {
      case MIRType_Int32x4: {
        Register r = ToRegister(ins->getOperand(0));
        masm.vmovd(r, output);
        masm.vpshufd(0, output, output);
        break;
      }
      case MIRType_Float32x4: {
        FloatRegister r = ToFloatRegister(ins->getOperand(0));
        FloatRegister rCopy = masm.reusedInputFloat32x4(r, output);
        masm.vshufps(0, rCopy, rCopy, output);
        break;
      }
      default:
        MOZ_CRASH("Unknown SIMD kind");
    }
}

void
CodeGeneratorX86Shared::visitSimdReinterpretCast(LSimdReinterpretCast* ins)
{
    FloatRegister input = ToFloatRegister(ins->input());
    FloatRegister output = ToFloatRegister(ins->output());

    if (input.aliases(output))
        return;

    switch (ins->mir()->type()) {
      case MIRType_Int32x4:
        masm.vmovdqa(input, output);
        break;
      case MIRType_Float32x4:
        masm.vmovaps(input, output);
        break;
      default:
        MOZ_CRASH("Unknown SIMD kind");
    }
}

void
CodeGeneratorX86Shared::visitSimdExtractElementI(LSimdExtractElementI* ins)
{
    FloatRegister input = ToFloatRegister(ins->input());
    Register output = ToRegister(ins->output());

    SimdLane lane = ins->lane();
    if (lane == LaneX) {
        // The value we want to extract is in the low double-word
        masm.moveLowInt32(input, output);
    } else if (AssemblerX86Shared::HasSSE41()) {
        masm.vpextrd(lane, input, output);
    } else {
        uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
        masm.shuffleInt32(mask, input, ScratchSimdReg);
        masm.moveLowInt32(ScratchSimdReg, output);
    }
}

void
CodeGeneratorX86Shared::visitSimdExtractElementF(LSimdExtractElementF* ins)
{
    FloatRegister input = ToFloatRegister(ins->input());
    FloatRegister output = ToFloatRegister(ins->output());

    SimdLane lane = ins->lane();
    if (lane == LaneX) {
        // The value we want to extract is in the low double-word
        if (input != output)
            masm.moveFloat32(input, output);
    } else if (lane == LaneZ) {
        masm.moveHighPairToLowPairFloat32(input, output);
    } else {
        uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
        masm.shuffleFloat32(mask, input, output);
    }
    // NaNs contained within SIMD values are not enforced to be canonical, so
    // when we extract an element into a "regular" scalar JS value, we have to
    // canonicalize. In asm.js code, we can skip this, as asm.js only has to
    // canonicalize NaNs at FFI boundaries.
    if (!gen->compilingAsmJS())
        masm.canonicalizeFloat(output);
}

void
CodeGeneratorX86Shared::visitSimdInsertElementI(LSimdInsertElementI* ins)
{
    FloatRegister vector = ToFloatRegister(ins->vector());
    Register value = ToRegister(ins->value());
    FloatRegister output = ToFloatRegister(ins->output());
    MOZ_ASSERT(vector == output); // defineReuseInput(0)

    unsigned component = unsigned(ins->lane());

    // Note that, unlike the float32x4 case, we cannot use vmovd if the inserted
    // value goes into the first component, as vmovd clears out the higher lanes
    // of the output.
    if (AssemblerX86Shared::HasSSE41()) {
        // TODO: Teach Lowering that we don't need defineReuseInput if we have AVX.
        masm.vpinsrd(component, value, vector, output);
        return;
    }

    masm.reserveStack(Simd128DataSize);
    masm.storeAlignedInt32x4(vector, Address(StackPointer, 0));
    masm.store32(value, Address(StackPointer, component * sizeof(int32_t)));
    masm.loadAlignedInt32x4(Address(StackPointer, 0), output);
    masm.freeStack(Simd128DataSize);
}

void
CodeGeneratorX86Shared::visitSimdInsertElementF(LSimdInsertElementF* ins)
{
    FloatRegister vector = ToFloatRegister(ins->vector());
    FloatRegister value = ToFloatRegister(ins->value());
    FloatRegister output = ToFloatRegister(ins->output());
    MOZ_ASSERT(vector == output); // defineReuseInput(0)

    if (ins->lane() == SimdLane::LaneX) {
        // As both operands are registers, vmovss doesn't modify the upper bits
        // of the destination operand.
        if (value != output)
            masm.vmovss(value, vector, output);
        return;
    }

    if (AssemblerX86Shared::HasSSE41()) {
        // The input value is in the low float32 of the 'value' FloatRegister.
        masm.vinsertps(masm.vinsertpsMask(SimdLane::LaneX, ins->lane()), value, output, output);
        return;
    }

    unsigned component = unsigned(ins->lane());
    masm.reserveStack(Simd128DataSize);
    masm.storeAlignedFloat32x4(vector, Address(StackPointer, 0));
    masm.storeFloat32(value, Address(StackPointer, component * sizeof(int32_t)));
    masm.loadAlignedFloat32x4(Address(StackPointer, 0), output);
    masm.freeStack(Simd128DataSize);
}

void
CodeGeneratorX86Shared::visitSimdSignMaskX4(LSimdSignMaskX4* ins)
{
    FloatRegister input = ToFloatRegister(ins->input());
    Register output = ToRegister(ins->output());

    // For Float32x4 and Int32x4.
    masm.vmovmskps(input, output);
}

template <class T, class Reg> void
CodeGeneratorX86Shared::visitSimdGeneralShuffle(LSimdGeneralShuffleBase* ins, Reg tempRegister)
{
    MSimdGeneralShuffle* mir = ins->mir();
    unsigned numVectors = mir->numVectors();

    Register laneTemp = ToRegister(ins->temp());

    // This won't generate fast code, but it's fine because we expect users
    // to have used constant indices (and thus MSimdGeneralShuffle to be folded
    // into MSimdSwizzle/MSimdShuffle, which are fast).
    masm.reserveStack(Simd128DataSize * numVectors);

    for (unsigned i = 0; i < numVectors; i++) {
        masm.storeAlignedVector<T>(ToFloatRegister(ins->vector(i)),
                                   Address(StackPointer, Simd128DataSize * (1 + i)));
    }

    Label bail;

    for (size_t i = 0; i < mir->numLanes(); i++) {
        Operand lane = ToOperand(ins->lane(i));

        masm.cmp32(lane, Imm32(mir->numVectors() * mir->numLanes() - 1));
        masm.j(Assembler::Above, &bail);

        if (lane.kind() == Operand::REG) {
            masm.loadScalar<T>(Operand(StackPointer, ToRegister(ins->lane(i)), TimesFour, Simd128DataSize),
                               tempRegister);
        } else {
            masm.load32(lane, laneTemp);
            masm.loadScalar<T>(Operand(StackPointer, laneTemp, TimesFour, Simd128DataSize), tempRegister);
        }

        masm.storeScalar<T>(tempRegister, Address(StackPointer, i * sizeof(T)));
    }

    FloatRegister output = ToFloatRegister(ins->output());
    masm.loadAlignedVector<T>(Address(StackPointer, 0), output);

    Label join;
    masm.jump(&join);

    {
        masm.bind(&bail);
        masm.freeStack(Simd128DataSize * numVectors);
        bailout(ins->snapshot());
    }

    masm.bind(&join);
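    // The bailout path above already called freeStack, which also updated the
    // framePushed bookkeeping, while the fall-through path still has the space
    // reserved; restore framePushed before freeing the space for real.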
    masm.setFramePushed(masm.framePushed() + Simd128DataSize * numVectors);
    masm.freeStack(Simd128DataSize * numVectors);
}

void
CodeGeneratorX86Shared::visitSimdGeneralShuffleI(LSimdGeneralShuffleI* ins)
{
    visitSimdGeneralShuffle<int32_t, Register>(ins, ToRegister(ins->temp()));
}
void
CodeGeneratorX86Shared::visitSimdGeneralShuffleF(LSimdGeneralShuffleF* ins)
{
    visitSimdGeneralShuffle<float, FloatRegister>(ins, ScratchFloat32Reg);
}

void
CodeGeneratorX86Shared::visitSimdSwizzleI(LSimdSwizzleI* ins)
{
    FloatRegister input = ToFloatRegister(ins->input());
    FloatRegister output = ToFloatRegister(ins->output());

    uint32_t x = ins->laneX();
    uint32_t y = ins->laneY();
    uint32_t z = ins->laneZ();
    uint32_t w = ins->laneW();

    uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
    masm.shuffleInt32(mask, input, output);
}

void
CodeGeneratorX86Shared::visitSimdSwizzleF(LSimdSwizzleF* ins)
{
    FloatRegister input = ToFloatRegister(ins->input());
    FloatRegister output = ToFloatRegister(ins->output());

    uint32_t x = ins->laneX();
    uint32_t y = ins->laneY();
    uint32_t z = ins->laneZ();
    uint32_t w = ins->laneW();

    if (AssemblerX86Shared::HasSSE3()) {
        if (ins->lanesMatch(0, 0, 2, 2)) {
            masm.vmovsldup(input, output);
            return;
        }
        if (ins->lanesMatch(1, 1, 3, 3)) {
            masm.vmovshdup(input, output);
            return;
        }
    }

    // TODO Here and below, arch-specific lowering could identify this pattern
    // and use defineReuseInput to avoid this move (bug 1084404).
    if (ins->lanesMatch(2, 3, 2, 3)) {
        FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output);
        masm.vmovhlps(input, inputCopy, output);
        return;
    }

    if (ins->lanesMatch(0, 1, 0, 1)) {
        if (AssemblerX86Shared::HasSSE3() && !AssemblerX86Shared::HasAVX()) {
            masm.vmovddup(input, output);
            return;
        }
        FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output);
        masm.vmovlhps(input, inputCopy, output);
        return;
    }

    if (ins->lanesMatch(0, 0, 1, 1)) {
        FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output);
        masm.vunpcklps(input, inputCopy, output);
        return;
    }

    if (ins->lanesMatch(2, 2, 3, 3)) {
        FloatRegister inputCopy = masm.reusedInputFloat32x4(input, output);
        masm.vunpckhps(input, inputCopy, output);
        return;
    }

    uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
    masm.shuffleFloat32(mask, input, output);
}

void
CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle* ins)
{
    FloatRegister lhs = ToFloatRegister(ins->lhs());
    Operand rhs = ToOperand(ins->rhs());
    FloatRegister out = ToFloatRegister(ins->output());

    uint32_t x = ins->laneX();
    uint32_t y = ins->laneY();
    uint32_t z = ins->laneZ();
    uint32_t w = ins->laneW();

    // Check that the majority of lanes come from the LHS:
    unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4);
    MOZ_ASSERT(numLanesFromLHS >= 2);

    // When reading this method, remember that vshufps takes its first two
    // output lanes from the destination operand (the right operand) and its
    // last two output lanes from the source operand (the left operand).
    //
    // Legend for explanations:
    // - L: LHS
    // - R: RHS
    // - T: temporary

    uint32_t mask;

    // If all lanes came from a single vector, we should have constructed a
    // MSimdSwizzle instead.
    MOZ_ASSERT(numLanesFromLHS < 4);

    // If all values stay in their lane, this is a blend.
    if (AssemblerX86Shared::HasSSE41()) {
        if (x % 4 == 0 && y % 4 == 1 && z % 4 == 2 && w % 4 == 3) {
            masm.vblendps(masm.blendpsMask(x >= 4, y >= 4, z >= 4, w >= 4), rhs, lhs, out);
            return;
        }
    }

    // One element of the second, all other elements of the first
    if (numLanesFromLHS == 3) {
        unsigned firstMask = -1, secondMask = -1;

        // register-register vmovss preserves the high lanes.
        if (ins->lanesMatch(4, 1, 2, 3) && rhs.kind() == Operand::FPREG) {
            masm.vmovss(FloatRegister::FromCode(rhs.fpu()), lhs, out);
            return;
        }

        // SSE4.1 vinsertps can handle any single element.
        unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3);
        if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) {
            SimdLane srcLane;
            SimdLane dstLane;
            if (x >= 4) {
                srcLane = SimdLane(x - 4);
                dstLane = LaneX;
            } else if (y >= 4) {
                srcLane = SimdLane(y - 4);
                dstLane = LaneY;
            } else if (z >= 4) {
                srcLane = SimdLane(z - 4);
                dstLane = LaneZ;
            } else {
                MOZ_ASSERT(w >= 4);
                srcLane = SimdLane(w - 4);
                dstLane = LaneW;
            }
            masm.vinsertps(masm.vinsertpsMask(srcLane, dstLane), rhs, lhs, out);
            return;
        }

        FloatRegister rhsCopy = ToFloatRegister(ins->temp());

        if (x < 4 && y < 4) {
            if (w >= 4) {
                w %= 4;
                // T = (Rw Rw Lz Lz) = vshufps(firstMask, lhs, rhs, rhsCopy)
                firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z);
                // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = vshufps(secondMask, T, lhs, out)
                secondMask = MacroAssembler::ComputeShuffleMask(x, y, LaneZ, LaneX);
            } else {
                MOZ_ASSERT(z >= 4);
                z %= 4;
                // T = (Rz Rz Lw Lw) = vshufps(firstMask, lhs, rhs, rhsCopy)
                firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w);
                // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = vshufps(secondMask, T, lhs, out)
                secondMask = MacroAssembler::ComputeShuffleMask(x, y, LaneX, LaneZ);
            }

            masm.vshufps(firstMask, lhs, rhsCopy, rhsCopy);
            masm.vshufps(secondMask, rhsCopy, lhs, out);
            return;
        }

        MOZ_ASSERT(z < 4 && w < 4);

        if (y >= 4) {
            y %= 4;
            // T = (Ry Ry Lx Lx) = vshufps(firstMask, lhs, rhs, rhsCopy)
            firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x);
            // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = vshufps(secondMask, lhs, T, out)
            secondMask = MacroAssembler::ComputeShuffleMask(LaneZ, LaneX, z, w);
        } else {
            MOZ_ASSERT(x >= 4);
            x %= 4;
            // T = (Rx Rx Ly Ly) = vshufps(firstMask, lhs, rhs, rhsCopy)
            firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y);
            // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = vshufps(secondMask, lhs, T, out)
            secondMask = MacroAssembler::ComputeShuffleMask(LaneX, LaneZ, z, w);
        }

        masm.vshufps(firstMask, lhs, rhsCopy, rhsCopy);
        if (AssemblerX86Shared::HasAVX()) {
            masm.vshufps(secondMask, lhs, rhsCopy, out);
        } else {
            masm.vshufps(secondMask, lhs, rhsCopy, rhsCopy);
            masm.moveFloat32x4(rhsCopy, out);
        }
        return;
    }

    // Two elements from one vector, two other elements from the other
    MOZ_ASSERT(numLanesFromLHS == 2);

    // TODO Here and below, the symmetric case would avoid a move, but it can't
    // be reached because the operands would get swapped (bug 1084404).
    if (ins->lanesMatch(2, 3, 6, 7)) {
        if (AssemblerX86Shared::HasAVX()) {
            FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, ScratchSimdReg);
            masm.vmovhlps(lhs, rhsCopy, out);
        } else {
            masm.loadAlignedFloat32x4(rhs, ScratchSimdReg);
            masm.vmovhlps(lhs, ScratchSimdReg, ScratchSimdReg);
            masm.moveFloat32x4(ScratchSimdReg, out);
        }
        return;
    }

    if (ins->lanesMatch(0, 1, 4, 5)) {
        FloatRegister rhsCopy;
        if (rhs.kind() == Operand::FPREG) {
            // No need to make an actual copy, since the operand is already
            // in a register, and it won't be clobbered by the vmovlhps.
            rhsCopy = FloatRegister::FromCode(rhs.fpu());
        } else {
            masm.loadAlignedFloat32x4(rhs, ScratchSimdReg);
            rhsCopy = ScratchSimdReg;
        }
        masm.vmovlhps(rhsCopy, lhs, out);
        return;
    }

    if (ins->lanesMatch(0, 4, 1, 5)) {
        masm.vunpcklps(rhs, lhs, out);
        return;
    }

    // TODO swapped case would be better (bug 1084404)
    if (ins->lanesMatch(4, 0, 5, 1)) {
        if (AssemblerX86Shared::HasAVX()) {
            FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, ScratchSimdReg);
            masm.vunpcklps(lhs, rhsCopy, out);
        } else {
            masm.loadAlignedFloat32x4(rhs, ScratchSimdReg);
            masm.vunpcklps(lhs, ScratchSimdReg, ScratchSimdReg);
            masm.moveFloat32x4(ScratchSimdReg, out);
        }
        return;
    }

    if (ins->lanesMatch(2, 6, 3, 7)) {
        masm.vunpckhps(rhs, lhs, out);
        return;
    }

    // TODO swapped case would be better (bug 1084404)
    if (ins->lanesMatch(6, 2, 7, 3)) {
        if (AssemblerX86Shared::HasAVX()) {
            FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, ScratchSimdReg);
            masm.vunpckhps(lhs, rhsCopy, out);
        } else {
            masm.loadAlignedFloat32x4(rhs, ScratchSimdReg);
            masm.vunpckhps(lhs, ScratchSimdReg, ScratchSimdReg);
            masm.moveFloat32x4(ScratchSimdReg, out);
        }
        return;
    }

    // In one vshufps
    if (x < 4 && y < 4) {
        mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4);
        masm.vshufps(mask, rhs, lhs, out);
        return;
    }

    // At creation, we should have explicitly swapped in this case.
    MOZ_ASSERT(!(z >= 4 && w >= 4));

    // In two vshufps, for the most generic case:
    uint32_t firstMask[4], secondMask[4];
    unsigned i = 0, j = 2, k = 0;
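    // The first vshufps gathers the two lanes needed from lhs into positions
    // 0-1 and the two lanes needed from rhs into positions 2-3 of lhs;
    // secondMask records where each requested lane ended up, so the second
    // vshufps can permute them into their final positions.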

#define COMPUTE_MASK(lane)       \
    if (lane >= 4) {             \
        firstMask[j] = lane % 4; \
        secondMask[k++] = j++;   \
    } else {                     \
        firstMask[i] = lane;     \
        secondMask[k++] = i++;   \
    }

    COMPUTE_MASK(x)
    COMPUTE_MASK(y)
    COMPUTE_MASK(z)
    COMPUTE_MASK(w)
#undef COMPUTE_MASK

    MOZ_ASSERT(i == 2 && j == 4 && k == 4);

    mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1],
                                              firstMask[2], firstMask[3]);
    masm.vshufps(mask, rhs, lhs, lhs);

    mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1],
                                              secondMask[2], secondMask[3]);
    masm.vshufps(mask, lhs, lhs, lhs);
}

void
CodeGeneratorX86Shared::visitSimdBinaryCompIx4(LSimdBinaryCompIx4* ins)
{
    static const SimdConstant allOnes = SimdConstant::SplatX4(-1);

    FloatRegister lhs = ToFloatRegister(ins->lhs());
    Operand rhs = ToOperand(ins->rhs());
    MOZ_ASSERT(ToFloatRegister(ins->output()) == lhs);

    MSimdBinaryComp::Operation op = ins->operation();
    switch (op) {
      case MSimdBinaryComp::greaterThan:
        masm.packedGreaterThanInt32x4(rhs, lhs);
        return;
      case MSimdBinaryComp::equal:
        masm.packedEqualInt32x4(rhs, lhs);
        return;
      case MSimdBinaryComp::lessThan:
        // src := rhs
        if (rhs.kind() == Operand::FPREG)
            masm.moveInt32x4(ToFloatRegister(ins->rhs()), ScratchSimdReg);
        else
            masm.loadAlignedInt32x4(rhs, ScratchSimdReg);

        // src := src > lhs (i.e. lhs < rhs)
        // This could be improved by custom lowering that ties rhs to the output register.
        masm.packedGreaterThanInt32x4(ToOperand(ins->lhs()), ScratchSimdReg);
        masm.moveInt32x4(ScratchSimdReg, lhs);
        return;
      case MSimdBinaryComp::notEqual:
        // Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
        // should instead invert the comparison at the use site, e.g. by
        // swapping the arms of a select that consumes the result.
        masm.loadConstantInt32x4(allOnes, ScratchSimdReg);
        masm.packedEqualInt32x4(rhs, lhs);
        masm.bitwiseXorX4(Operand(ScratchSimdReg), lhs);
        return;
      case MSimdBinaryComp::greaterThanOrEqual:
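        // lhs >= rhs is equivalent to !(rhs > lhs): compute rhs > lhs into the
        // scratch register, then invert it by xoring with all ones.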
        // src := rhs
        if (rhs.kind() == Operand::FPREG)
            masm.moveInt32x4(ToFloatRegister(ins->rhs()), ScratchSimdReg);
        else
            masm.loadAlignedInt32x4(rhs, ScratchSimdReg);
        masm.packedGreaterThanInt32x4(ToOperand(ins->lhs()), ScratchSimdReg);
        masm.loadConstantInt32x4(allOnes, lhs);
        masm.bitwiseXorX4(Operand(ScratchSimdReg), lhs);
        return;
      case MSimdBinaryComp::lessThanOrEqual:
        // lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
        masm.loadConstantInt32x4(allOnes, ScratchSimdReg);
        masm.packedGreaterThanInt32x4(rhs, lhs);
        masm.bitwiseXorX4(Operand(ScratchSimdReg), lhs);
        return;
    }
    MOZ_CRASH("unexpected SIMD op");
}

void
CodeGeneratorX86Shared::visitSimdBinaryCompFx4(LSimdBinaryCompFx4* ins)
{
    FloatRegister lhs = ToFloatRegister(ins->lhs());
    Operand rhs = ToOperand(ins->rhs());
    FloatRegister output = ToFloatRegister(ins->output());

    MSimdBinaryComp::Operation op = ins->operation();
    switch (op) {
      case MSimdBinaryComp::equal:
        masm.vcmpeqps(rhs, lhs, output);
        return;
      case MSimdBinaryComp::lessThan:
        masm.vcmpltps(rhs, lhs, output);
        return;
      case MSimdBinaryComp::lessThanOrEqual:
        masm.vcmpleps(rhs, lhs, output);
        return;
      case MSimdBinaryComp::notEqual:
        masm.vcmpneqps(rhs, lhs, output);
        return;
      case MSimdBinaryComp::greaterThanOrEqual:
      case MSimdBinaryComp::greaterThan:
        // We reverse these before register allocation so that we don't have to
        // copy into and out of temporaries after codegen.
        MOZ_CRASH("lowering should have reversed this");
    }
    MOZ_CRASH("unexpected SIMD op");
}

void
CodeGeneratorX86Shared::visitSimdBinaryArithIx4(LSimdBinaryArithIx4* ins)
{
    FloatRegister lhs = ToFloatRegister(ins->lhs());
    Operand rhs = ToOperand(ins->rhs());
    FloatRegister output = ToFloatRegister(ins->output());

    MSimdBinaryArith::Operation op = ins->operation();
    switch (op) {
      case MSimdBinaryArith::Op_add:
        masm.vpaddd(rhs, lhs, output);
        return;
      case MSimdBinaryArith::Op_sub:
        masm.vpsubd(rhs, lhs, output);
        return;
      case MSimdBinaryArith::Op_mul: {
        if (AssemblerX86Shared::HasSSE41()) {
            masm.vpmulld(rhs, lhs, output);
            return;
        }

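        // Without SSE4.1 there is no packed 32x32 multiply keeping the low 32
        // bits, so emulate it: vpmuludq only multiplies lanes 0 and 2, so do
        // it twice -- once for lanes 0/2 and once for lanes 1/3 shuffled down
        // -- and then recombine the low halves of the four 64-bit products.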
        masm.loadAlignedInt32x4(rhs, ScratchSimdReg);
        masm.vpmuludq(lhs, ScratchSimdReg, ScratchSimdReg);
        // ScratchSimdReg contains (Rx, _, Rz, _) where R is the resulting vector.

        FloatRegister temp = ToFloatRegister(ins->temp());
        masm.vpshufd(MacroAssembler::ComputeShuffleMask(LaneY, LaneY, LaneW, LaneW), lhs, lhs);
        masm.vpshufd(MacroAssembler::ComputeShuffleMask(LaneY, LaneY, LaneW, LaneW), rhs, temp);
        masm.vpmuludq(temp, lhs, lhs);
        // lhs contains (Ry, _, Rw, _) where R is the resulting vector.

        masm.vshufps(MacroAssembler::ComputeShuffleMask(LaneX, LaneZ, LaneX, LaneZ), ScratchSimdReg, lhs, lhs);
        // lhs contains (Ry, Rw, Rx, Rz)
        masm.vshufps(MacroAssembler::ComputeShuffleMask(LaneZ, LaneX, LaneW, LaneY), lhs, lhs, lhs);
        return;
      }
      case MSimdBinaryArith::Op_div:
        // x86 doesn't have SIMD i32 div.
        break;
      case MSimdBinaryArith::Op_max:
        // We can do max with a single instruction only if we have SSE4.1,
        // using the PMAXSD instruction.
        break;
      case MSimdBinaryArith::Op_min:
        // We can do min with a single instruction only if we have SSE4.1,
        // using the PMINSD instruction.
        break;
      case MSimdBinaryArith::Op_minNum:
      case MSimdBinaryArith::Op_maxNum:
        break;
    }
    MOZ_CRASH("unexpected SIMD op");
}

void
CodeGeneratorX86Shared::visitSimdBinaryArithFx4(LSimdBinaryArithFx4* ins)
{
    FloatRegister lhs = ToFloatRegister(ins->lhs());
    Operand rhs = ToOperand(ins->rhs());
    FloatRegister output = ToFloatRegister(ins->output());

    MSimdBinaryArith::Operation op = ins->operation();
    switch (op) {
      case MSimdBinaryArith::Op_add:
        masm.vaddps(rhs, lhs, output);
        return;
      case MSimdBinaryArith::Op_sub:
        masm.vsubps(rhs, lhs, output);
        return;
      case MSimdBinaryArith::Op_mul:
        masm.vmulps(rhs, lhs, output);
        return;
      case MSimdBinaryArith::Op_div:
        masm.vdivps(rhs, lhs, output);
        return;
      case MSimdBinaryArith::Op_max: {
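        // vmaxps returns its second operand both when the inputs are +0 and -0
        // and when either input is NaN, so the result depends on operand
        // order. Computing the max in both operand orders and combining them
        // with vandps gives the required +0 for max(+0, -0); the unordered
        // mask then rewrites NaN lanes to an all-ones NaN.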
        FloatRegister lhsCopy = masm.reusedInputFloat32x4(lhs, ScratchSimdReg);
        masm.vcmpunordps(rhs, lhsCopy, ScratchSimdReg);

        FloatRegister tmp = ToFloatRegister(ins->temp());
        FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, tmp);
        masm.vmaxps(Operand(lhs), rhsCopy, tmp);
        masm.vmaxps(rhs, lhs, output);

        masm.vandps(tmp, output, output);
        masm.vorps(ScratchSimdReg, output, output); // or in the all-ones NaNs
        return;
      }
      case MSimdBinaryArith::Op_min: {
        FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, ScratchSimdReg);
        masm.vminps(Operand(lhs), rhsCopy, ScratchSimdReg);
        masm.vminps(rhs, lhs, output);
        masm.vorps(ScratchSimdReg, output, output); // NaN or'd with arbitrary bits is NaN
        return;
      }
      case MSimdBinaryArith::Op_minNum: {
        FloatRegister tmp = ToFloatRegister(ins->temp());
        masm.loadConstantInt32x4(SimdConstant::SplatX4(int32_t(0x80000000)), tmp);

        FloatRegister mask = ScratchSimdReg;
        FloatRegister tmpCopy = masm.reusedInputFloat32x4(tmp, ScratchSimdReg);
        masm.vpcmpeqd(Operand(lhs), tmpCopy, mask);
        masm.vandps(tmp, mask, mask);

        FloatRegister lhsCopy = masm.reusedInputFloat32x4(lhs, tmp);
        masm.vminps(rhs, lhsCopy, tmp);
        masm.vorps(mask, tmp, tmp);

        FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, mask);
        masm.vcmpneqps(rhs, rhsCopy, mask);

        if (AssemblerX86Shared::HasAVX()) {
            masm.vblendvps(mask, lhs, tmp, output);
        } else {
            // Emulate vblendvps.
            // With SSE4.1 we could use blendvps, but it's awkward because it
            // requires the mask to be in xmm0.
            if (lhs != output)
                masm.moveFloat32x4(lhs, output);
            masm.vandps(Operand(mask), output, output);
            masm.vandnps(Operand(tmp), mask, mask);
            masm.vorps(Operand(mask), output, output);
        }
        return;
      }
      case MSimdBinaryArith::Op_maxNum: {
        FloatRegister mask = ScratchSimdReg;
        masm.loadConstantInt32x4(SimdConstant::SplatX4(0), mask);
        masm.vpcmpeqd(Operand(lhs), mask, mask);

        FloatRegister tmp = ToFloatRegister(ins->temp());
        masm.loadConstantInt32x4(SimdConstant::SplatX4(int32_t(0x80000000)), tmp);
        masm.vandps(tmp, mask, mask);

        FloatRegister lhsCopy = masm.reusedInputFloat32x4(lhs, tmp);
        masm.vmaxps(rhs, lhsCopy, tmp);
        masm.vandnps(Operand(tmp), mask, mask);

        // Ensure tmp always contains the temporary result
        mask = tmp;
        tmp = ScratchSimdReg;

        FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, mask);
        masm.vcmpneqps(rhs, rhsCopy, mask);

        if (AssemblerX86Shared::HasAVX()) {
            masm.vblendvps(mask, lhs, tmp, output);
        } else {
            // Emulate vblendvps.
            // With SSE4.1 we could use blendvps, but it's awkward because it
            // requires the mask to be in xmm0.
            if (lhs != output)
                masm.moveFloat32x4(lhs, output);
            masm.vandps(Operand(mask), output, output);
            masm.vandnps(Operand(tmp), mask, mask);
            masm.vorps(Operand(mask), output, output);
        }
        return;
      }
    }
    MOZ_CRASH("unexpected SIMD op");
}

void
CodeGeneratorX86Shared::visitSimdUnaryArithIx4(LSimdUnaryArithIx4* ins)
{
    Operand in = ToOperand(ins->input());
    FloatRegister out = ToFloatRegister(ins->output());

    static const SimdConstant allOnes = SimdConstant::CreateX4(-1, -1, -1, -1);

    switch (ins->operation()) {
      case MSimdUnaryArith::neg:
        masm.zeroInt32x4(out);
        masm.packedSubInt32(in, out);
        return;
      case MSimdUnaryArith::not_:
        masm.loadConstantInt32x4(allOnes, out);
        masm.bitwiseXorX4(in, out);
        return;
      case MSimdUnaryArith::abs:
      case MSimdUnaryArith::reciprocalApproximation:
      case MSimdUnaryArith::reciprocalSqrtApproximation:
      case MSimdUnaryArith::sqrt:
        break;
    }
    MOZ_CRASH("unexpected SIMD op");
}

void
CodeGeneratorX86Shared::visitSimdUnaryArithFx4(LSimdUnaryArithFx4* ins)
{
    Operand in = ToOperand(ins->input());
    FloatRegister out = ToFloatRegister(ins->output());

    // All ones but the sign bit
    float signMask = SpecificNaN<float>(0, FloatingPoint<float>::kSignificandBits);
    static const SimdConstant signMasks = SimdConstant::SplatX4(signMask);

    // All ones including the sign bit
    float ones = SpecificNaN<float>(1, FloatingPoint<float>::kSignificandBits);
    static const SimdConstant allOnes = SimdConstant::SplatX4(ones);

    // All zeros but the sign bit
    static const SimdConstant minusZero = SimdConstant::SplatX4(-0.f);

    switch (ins->operation()) {
      case MSimdUnaryArith::abs:
        masm.loadConstantFloat32x4(signMasks, out);
        masm.bitwiseAndX4(in, out);
        return;
      case MSimdUnaryArith::neg:
        masm.loadConstantFloat32x4(minusZero, out);
        masm.bitwiseXorX4(in, out);
        return;
      case MSimdUnaryArith::not_:
        masm.loadConstantFloat32x4(allOnes, out);
        masm.bitwiseXorX4(in, out);
        return;
      case MSimdUnaryArith::reciprocalApproximation:
        masm.packedRcpApproximationFloat32x4(in, out);
        return;
      case MSimdUnaryArith::reciprocalSqrtApproximation:
        masm.packedRcpSqrtApproximationFloat32x4(in, out);
        return;
      case MSimdUnaryArith::sqrt:
        masm.packedSqrtFloat32x4(in, out);
        return;
    }
    MOZ_CRASH("unexpected SIMD op");
}

void
CodeGeneratorX86Shared::visitSimdBinaryBitwiseX4(LSimdBinaryBitwiseX4* ins)
{
    FloatRegister lhs = ToFloatRegister(ins->lhs());
    Operand rhs = ToOperand(ins->rhs());
    FloatRegister output = ToFloatRegister(ins->output());

    MSimdBinaryBitwise::Operation op = ins->operation();
    switch (op) {
      case MSimdBinaryBitwise::and_:
        if (ins->type() == MIRType_Float32x4)
            masm.vandps(rhs, lhs, output);
        else
            masm.vpand(rhs, lhs, output);
        return;
      case MSimdBinaryBitwise::or_:
        if (ins->type() == MIRType_Float32x4)
            masm.vorps(rhs, lhs, output);
        else
            masm.vpor(rhs, lhs, output);
        return;
      case MSimdBinaryBitwise::xor_:
        if (ins->type() == MIRType_Float32x4)
            masm.vxorps(rhs, lhs, output);
        else
            masm.vpxor(rhs, lhs, output);
        return;
    }
    MOZ_CRASH("unexpected SIMD bitwise op");
}

void
CodeGeneratorX86Shared::visitSimdShift(LSimdShift* ins)
{
    FloatRegister out = ToFloatRegister(ins->output());
    MOZ_ASSERT(ToFloatRegister(ins->vector()) == out); // defineReuseInput(0);

    // TODO: If the shift count is greater than 31, this will just zero all
    // lanes for lsh and ursh, and saturate the count for rsh (which just
    // extends the sign bit to all bits). Plain JS doesn't do this: it only
    // keeps the five low bits of the shift count. The spec isn't clear on
    // this point, so it might need to be fixed. See also bug 1068028.
    const LAllocation* val = ins->value();
    if (val->isConstant()) {
        int32_t c = ToInt32(val);
        if (c > 31) {
            switch (ins->operation()) {
              case MSimdShift::lsh:
              case MSimdShift::ursh:
                masm.zeroInt32x4(out);
                return;
              default:
                c = 31;
                break;
            }
        }
        Imm32 count(c);
        switch (ins->operation()) {
          case MSimdShift::lsh:
            masm.packedLeftShiftByScalar(count, out);
            return;
          case MSimdShift::rsh:
            masm.packedRightShiftByScalar(count, out);
            return;
          case MSimdShift::ursh:
            masm.packedUnsignedRightShiftByScalar(count, out);
            return;
        }
        MOZ_CRASH("unexpected SIMD bitwise op");
    }

    MOZ_ASSERT(val->isRegister());
    FloatRegister tmp = ScratchFloat32Reg;
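    // The packed shift instructions take the shift count from the low 64 bits
    // of the xmm operand, and vmovd zero-extends the count into it.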
    masm.vmovd(ToRegister(val), tmp);

    switch (ins->operation()) {
      case MSimdShift::lsh:
        masm.packedLeftShiftByScalar(tmp, out);
        return;
      case MSimdShift::rsh:
        masm.packedRightShiftByScalar(tmp, out);
        return;
      case MSimdShift::ursh:
        masm.packedUnsignedRightShiftByScalar(tmp, out);
        return;
    }
    MOZ_CRASH("unexpected SIMD bitwise op");
}

void
CodeGeneratorX86Shared::visitSimdSelect(LSimdSelect* ins)
{
    FloatRegister mask = ToFloatRegister(ins->mask());
    FloatRegister onTrue = ToFloatRegister(ins->lhs());
    FloatRegister onFalse = ToFloatRegister(ins->rhs());
    FloatRegister output = ToFloatRegister(ins->output());
    FloatRegister temp = ToFloatRegister(ins->temp());

    if (onTrue != output)
        masm.vmovaps(onTrue, output);
    if (mask != temp)
        masm.vmovaps(mask, temp);

    MSimdSelect* mir = ins->mir();
    if (mir->isElementWise()) {
        if (AssemblerX86Shared::HasAVX()) {
            masm.vblendvps(mask, onTrue, onFalse, output);
            return;
        }

        // SSE4.1 has plain blendvps which can do this, but it is awkward
        // to use because it requires the mask to be in xmm0.

        // Propagate sign to all bits of mask vector, if necessary.
        if (!mir->mask()->isSimdBinaryComp())
            masm.packedRightShiftByScalar(Imm32(31), temp);
    }

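    // Bitwise select: output = (temp & onTrue) | (~temp & onFalse), computed
    // in place with output currently holding onTrue and temp the mask.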
    masm.bitwiseAndX4(Operand(temp), output);
    masm.bitwiseAndNotX4(Operand(onFalse), temp);
    masm.bitwiseOrX4(Operand(temp), output);
}

void
CodeGeneratorX86Shared::visitMemoryBarrier(LMemoryBarrier* ins)
{
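    // On x86, the only reordering the hardware can perform is moving a store
    // past a later load, so only barriers that include StoreLoad need an
    // actual fence instruction; the other orderings hold by default.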
    if (ins->type() & MembarStoreLoad)
        masm.storeLoadFence();
}

} // namespace jit
} // namespace js