Bug 810526 - Add an implementation of ARM EHABI stack unwinding for the profiler. r=BenWa r=huseby
author       Jed Davis <jld@mozilla.com>
date         Wed, 11 Sep 2013 14:53:14 -0400
changeset    146647 6e5cdc0ba2e7753fbb1787bb46a45026bb279dc9
parent       146646 887e6d05d1d55210fc3e033daa2ee066c2dcda60
child        146648 fa066cc5c1f99157656baa3610941348631cf91c
push id      1
push user    root
push date    Mon, 20 Oct 2014 17:29:22 +0000
reviewers    BenWa, huseby
bugs         810526
milestone    26.0a1
tools/profiler/EHABIStackWalk.cpp
tools/profiler/EHABIStackWalk.h
tools/profiler/TableTicker.cpp
tools/profiler/moz.build
tools/profiler/platform-linux.cc
tools/profiler/platform.cpp
new file mode 100644
--- /dev/null
+++ b/tools/profiler/EHABIStackWalk.cpp
@@ -0,0 +1,625 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/*
+ * This is an implementation of stack unwinding according to a subset
+ * of the ARM Exception Handling ABI, as described in:
+ *   http://infocenter.arm.com/help/topic/com.arm.doc.ihi0038a/IHI0038A_ehabi.pdf
+ *
+ * This handles only the ARM-defined "personality routines" (chapter
+ * 9), and doesn't track the values of FP registers, because profiling
+ * needs only the chain of PC/SP values.
+ *
+ * Because the exception handling info may not be accurate for all
+ * possible places where an async signal could occur (e.g., in a
+ * prologue or epilogue), this bounds-checks all stack accesses.
+ *
+ * This file uses "struct" for structures in the exception tables and
+ * "class" otherwise.  The former must remain C++11 standard-layout,
+ * since they are overlaid directly on the in-memory tables.
+ */
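+
+/*
+ * Overview of the classes below:
+ *
+ *   EHEntry     - one entry of an object's exception index (.ARM.exidx).
+ *   EHTable     - per-object table mapping a PC to an EHEntry.
+ *   EHAddrSpace - per-process set of EHTables, looked up by PC.
+ *   EHState     - the register state being unwound.
+ *   EHInterp    - interpreter for the unwind instructions of one entry.
+ */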
+
+#include "EHABIStackWalk.h"
+
+#include "shared-libraries.h"
+#include "platform.h"
+
+#include "mozilla/Atomics.h"
+#include "mozilla/Attributes.h"
+#include "mozilla/DebugOnly.h"
+#include "mozilla/Endian.h"
+
+#include <algorithm>
+#include <elf.h>
+#include <stdint.h>
+#include <vector>
+#include <string>
+
+#ifndef PT_ARM_EXIDX
+#define PT_ARM_EXIDX 0x70000001
+#endif
+
+
+namespace mozilla {
+
+struct EHEntry;
+
+class EHState {
+  // Note that any core register can be used as a "frame pointer" to
+  // influence the unwinding process, so this must track all of them.
+  uint32_t mRegs[16];
+public:
+  bool unwind(const EHEntry *aEntry, const void *stackBase);
+  uint32_t &operator[](int i) { return mRegs[i]; }
+  const uint32_t &operator[](int i) const { return mRegs[i]; }
+  EHState(const mcontext_t &);
+};
+
+enum {
+  R_SP = 13,
+  R_LR = 14,
+  R_PC = 15
+};
+
+class EHEntryHandle {
+  const EHEntry *mValue;
+public:
+  EHEntryHandle(const EHEntry *aEntry) : mValue(aEntry) { }
+  const EHEntry *value() const { return mValue; }
+};
+
+class EHTable {
+  uint32_t mStartPC;
+  uint32_t mEndPC;
+  uint32_t mLoadOffset;
+  // In principle we should be able to binary-search the index section in
+  // place, but the ICS toolchain's linker is noncompliant and produces
+  // indices that aren't entirely sorted (e.g., libc).  So we have this:
+  std::vector<EHEntryHandle> mEntries;
+  std::string mName;
+public:
+  EHTable(const void *aELF, size_t aSize, const std::string &aName);
+  const EHEntry *lookup(uint32_t aPC) const;
+  bool isValid() const { return mEntries.size() > 0; }
+  const std::string &name() const { return mName; }
+  uint32_t startPC() const { return mStartPC; }
+  uint32_t endPC() const { return mEndPC; }
+  uint32_t loadOffset() const { return mLoadOffset; }
+};
+
+class EHAddrSpace {
+  std::vector<uint32_t> mStarts;
+  std::vector<EHTable> mTables;
+  static mozilla::Atomic<const EHAddrSpace*> sCurrent;
+public:
+  explicit EHAddrSpace(const std::vector<EHTable>& aTables);
+  const EHTable *lookup(uint32_t aPC) const;
+  static void Update();
+  static const EHAddrSpace *Get();
+};
+
+
+void EHABIStackWalkInit()
+{
+  EHAddrSpace::Update();
+}
+
+size_t EHABIStackWalk(const mcontext_t &aContext, void *stackBase,
+                      void **aSPs, void **aPCs, const size_t aNumFrames)
+{
+  const EHAddrSpace *space = EHAddrSpace::Get();
+  EHState state(aContext);
+  size_t count = 0;
+
+  while (count < aNumFrames) {
+    uint32_t pc = state[R_PC], sp = state[R_SP];
+    aPCs[count] = reinterpret_cast<void *>(pc);
+    aSPs[count] = reinterpret_cast<void *>(sp);
+    count++;
+
+    if (!space)
+      break;
+    // TODO: cache these lookups.  Binary-searching libxul is
+    // expensive (possibly more expensive than doing the actual
+    // unwind), and even a small cache should help.
+    const EHTable *table = space->lookup(pc);
+    if (!table)
+      break;
+    const EHEntry *entry = table->lookup(pc);
+    if (!entry)
+      break;
+    if (!state.unwind(entry, stackBase))
+      break;
+  }
+
+  return count;
+}
+
+
+struct PRel31 {
+  uint32_t mBits;
+  bool topBit() const { return mBits & 0x80000000; }
+  uint32_t value() const { return mBits & 0x7fffffff; }
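+  // The shift pair sign-extends the 31-bit field; the result is a byte
+  // offset relative to the address of this word (see compute()).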
+  int32_t offset() const { return (static_cast<int32_t>(mBits) << 1) >> 1; }
+  const void *compute() const {
+    return reinterpret_cast<const char *>(this) + offset();
+  }
+private:
+  PRel31(const PRel31 &copied) MOZ_DELETE;
+  PRel31() MOZ_DELETE;
+};
+
+struct EHEntry {
+  PRel31 startPC;
+  PRel31 exidx;
+private:
+  EHEntry(const EHEntry &copied) MOZ_DELETE;
+  EHEntry() MOZ_DELETE;
+};
+
+
+class EHInterp {
+public:
+  EHInterp(EHState &aState, const EHEntry *aEntry,
+           uint32_t aStackLimit, uint32_t aStackBase)
+    : mState(aState),
+      mStackLimit(aStackLimit),
+      mStackBase(aStackBase),
+      mNextWord(0),
+      mWordsLeft(0),
+      mFailed(false)
+  {
+    const PRel31 &exidx = aEntry->exidx;
+    uint32_t firstWord;
+
+    if (exidx.mBits == 1) {  // EXIDX_CANTUNWIND
+      mFailed = true;
+      return;
+    }
+    if (exidx.topBit()) {
+      firstWord = exidx.mBits;
+    } else {
+      mNextWord = reinterpret_cast<const uint32_t *>(exidx.compute());
+      firstWord = *mNextWord++;
+    }
+
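+    // The compact-model personality index is in the top byte: 0x80
+    // selects pr0 (up to three opcode bytes packed into this word);
+    // 0x81/0x82 select pr1/pr2 (bits 16-23 give the count of additional
+    // opcode words that follow).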
+    switch (firstWord >> 24) {
+    case 0x80: // short
+      mWord = firstWord << 8;
+      mBytesLeft = 3;
+      break;
+    case 0x81: case 0x82: // long; catch descriptor size ignored
+      mWord = firstWord << 16;
+      mBytesLeft = 2;
+      mWordsLeft = (firstWord >> 16) & 0xff;
+      break;
+    default:
+      // unknown personality
+      mFailed = true;
+    }
+  }
+
+  bool unwind();
+
+private:
+  // TODO: GCC has been observed not CSEing repeated reads of
+  // mState[R_SP] with writes to mFailed between them, suggesting that
+  // it hasn't determined that they can't alias and is thus missing
+  // optimization opportunities.  So, we may want to flatten EHState
+  // into this class; this may also make the code simpler.
+  EHState &mState;
+  uint32_t mStackLimit;
+  uint32_t mStackBase;
+  const uint32_t *mNextWord;
+  uint32_t mWord;
+  uint8_t mWordsLeft;
+  uint8_t mBytesLeft;
+  bool mFailed;
+
+  enum {
+    I_ADDSP    = 0x00, // 0sxxxxxx (subtract if s)
+    M_ADDSP    = 0x80,
+    I_POPMASK  = 0x80, // 1000iiii iiiiiiii (if any i set)
+    M_POPMASK  = 0xf0,
+    I_MOVSP    = 0x90, // 1001nnnn
+    M_MOVSP    = 0xf0,
+    I_POPN     = 0xa0, // 1010lnnn
+    M_POPN     = 0xf0,
+    I_FINISH   = 0xb0, // 10110000
+    I_POPLO    = 0xb1, // 10110001 0000iiii (if any i set)
+    I_ADDSPBIG = 0xb2, // 10110010 uleb128
+    I_POPFDX   = 0xb3, // 10110011 sssscccc
+    I_POPFDX8  = 0xb8, // 10111nnn
+    M_POPFDX8  = 0xf8,
+    // "Intel Wireless MMX" extensions omitted.
+    I_POPFDD   = 0xc8, // 1100100h sssscccc
+    M_POPFDD   = 0xfe,
+    I_POPFDD8  = 0xd0, // 11010nnn
+    M_POPFDD8  = 0xf8
+  };
+
+  uint8_t next() {
+    if (mBytesLeft == 0) {
+      if (mWordsLeft == 0) {
+        return I_FINISH;
+      }
+      mWordsLeft--;
+      mWord = *mNextWord++;
+      mBytesLeft = 4;
+    }
+    mBytesLeft--;
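+    // Opcodes are packed most significant byte first; rotating left by
+    // 8 brings the next opcode into the low byte returned below.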
+    mWord = (mWord << 8) | (mWord >> 24); // rotate
+    return mWord;
+  }
+
+  uint32_t &vSP() { return mState[R_SP]; }
+  uint32_t *ptrSP() { return reinterpret_cast<uint32_t *>(vSP()); }
+
+  void checkStackBase() { if (vSP() > mStackBase) mFailed = true; }
+  void checkStackLimit() { if (vSP() <= mStackLimit) mFailed = true; }
+  void checkStackAlign() { if ((vSP() & 3) != 0) mFailed = true; }
+  void checkStack() {
+    checkStackBase();
+    checkStackLimit();
+    checkStackAlign();
+  }
+
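+  // Pop one stack word into each register r[first..last] whose mask bit
+  // is set.  If SP itself is popped, its new value takes effect only
+  // after the loop, so the remaining loads still read consecutive words
+  // of the old stack frame.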
+  void popRange(uint8_t first, uint8_t last, uint16_t mask) {
+    bool hasSP = false;
+    uint32_t tmpSP;
+    if (mask == 0)
+      mFailed = true;
+    for (uint8_t r = first; r <= last; ++r) {
+      if (mask & 1) {
+        if (r == R_SP) {
+          hasSP = true;
+          tmpSP = *ptrSP();
+        } else
+          mState[r] = *ptrSP();
+        vSP() += 4;
+        checkStackBase();
+        if (mFailed)
+          return;
+      }
+      mask >>= 1;
+    }
+    if (hasSP) {
+      vSP() = tmpSP;
+      checkStack();
+    }
+  }
+};
+
+
+bool EHState::unwind(const EHEntry *aEntry, const void *stackBase) {
+  // SP must not move below its initial value during unwinding; passing
+  // the current SP minus one word as the limit lets an unchanged SP
+  // still pass the strict checkStackLimit() test.
+  EHInterp interp(*this, aEntry, mRegs[R_SP] - 4,
+                  reinterpret_cast<uint32_t>(stackBase));
+
+  return interp.unwind();
+}
+
+bool EHInterp::unwind() {
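+  // Clear PC so that, at I_FINISH, a PC that was never explicitly
+  // restored can be recognized and replaced with LR.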
+  mState[R_PC] = 0;
+  checkStack();
+  while (!mFailed) {
+    uint8_t insn = next();
+#ifdef DEBUG_EHABI_UNWIND
+    LOGF("unwind insn = %02x", (unsigned)insn);
+#endif
+    // Try to put the common cases first.
+
+    // 00xxxxxx: vsp = vsp + (xxxxxx << 2) + 4
+    // 01xxxxxx: vsp = vsp - (xxxxxx << 2) - 4
+    if ((insn & M_ADDSP) == I_ADDSP) {
+      uint32_t offset = ((insn & 0x3f) << 2) + 4;
+      if (insn & 0x40) {
+        vSP() -= offset;
+        checkStackLimit();
+      } else {
+        vSP() += offset;
+        checkStackBase();
+      }
+      continue;
+    }
+
+    // 10100nnn: Pop r4-r[4+nnn]
+    // 10101nnn: Pop r4-r[4+nnn], r14
+    if ((insn & M_POPN) == I_POPN) {
+      uint8_t n = (insn & 0x07) + 1;
+      bool lr = insn & 0x08;
+      uint32_t *ptr = ptrSP();
+      vSP() += (n + (lr ? 1 : 0)) * 4;
+      checkStackBase();
+      for (uint8_t r = 4; r < 4 + n; ++r)
+        mState[r] = *ptr++;
+      if (lr)
+        mState[R_LR] = *ptr++;
+      continue;
+    }
+
+    // 10110000: Finish
+    if (insn == I_FINISH) {
+      if (mState[R_PC] == 0)
+        mState[R_PC] = mState[R_LR];
+      return true;
+    }
+
+    // 1001nnnn: Set vsp = r[nnnn]
+    if ((insn & M_MOVSP) == I_MOVSP) {
+      vSP() = mState[insn & 0x0f];
+      checkStack();
+      continue;
+    }
+
+    // 11001000 sssscccc: Pop VFP regs D[16+ssss]-D[16+ssss+cccc] (as FLDMFDD)
+    // 11001001 sssscccc: Pop VFP regs D[ssss]-D[ssss+cccc] (as FLDMFDD)
+    if ((insn & M_POPFDD) == I_POPFDD) {
+      uint8_t n = (next() & 0x0f) + 1;
+      // Note: if 16+ssss+cccc > 31, the encoding is reserved.
+      // As the space is currently unused, we don't try to check.
+      vSP() += 8 * n;
+      checkStackBase();
+      continue;
+    }
+
+    // 11010nnn: Pop VFP regs D[8]-D[8+nnn] (as FLDMFDD)
+    if ((insn & M_POPFDD8) == I_POPFDD8) {
+      uint8_t n = (insn & 0x07) + 1;
+      vSP() += 8 * n;
+      checkStackBase();
+      continue;
+    }
+
+    // 10110010 uleb128: vsp = vsp + 0x204 + (uleb128 << 2)
+    if (insn == I_ADDSPBIG) {
+      uint32_t acc = 0;
+      uint8_t shift = 0;
+      uint8_t byte;
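+      // Decode the ULEB128 operand: 7 value bits per byte, least
+      // significant group first; the top bit of each byte marks
+      // continuation (e.g., the bytes 0x81 0x01 decode to 0x81).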
+      do {
+        if (shift >= 32)
+          return false;
+        byte = next();
+        acc |= (byte & 0x7f) << shift;
+        shift += 7;
+      } while (byte & 0x80);
+      uint32_t offset = 0x204 + (acc << 2);
+      // The calculations above could have overflowed.
+      // But the one we care about is this:
+      if (vSP() + offset < vSP())
+        mFailed = true;
+      vSP() += offset;
+      // ...so that this is the only other check needed:
+      checkStackBase();
+      continue;
+    }
+
+    // 1000iiii iiiiiiii (i not all 0): Pop under masks {r15-r12}, {r11-r4}
+    if ((insn & M_POPMASK) == I_POPMASK) {
+      popRange(4, 15, ((insn & 0x0f) << 8) | next());
+      continue;
+    }
+
+    // 10110001 0000iiii (i not all 0): Pop under mask {r3-r0}
+    if (insn == I_POPLO) {
+      popRange(0, 3, next() & 0x0f);
+      continue;
+    }
+
+    // 10110011 sssscccc: Pop VFP regs D[ssss]-D[ssss+cccc] (as FLDMFDX)
+    if (insn == I_POPFDX) {
+      uint8_t n = (next() & 0x0f) + 1;
+      vSP() += 8 * n + 4;
+      checkStackBase();
+      continue;
+    }
+
+    // 10111nnn: Pop VFP regs D[8]-D[8+nnn] (as FLDMFDX)
+    if ((insn & M_POPFDX8) == I_POPFDX8) {
+      uint8_t n = (insn & 0x07) + 1;
+      vSP() += 8 * n + 4;
+      checkStackBase();
+      continue;
+    }
+
+    // unhandled instruction
+#ifdef DEBUG_EHABI_UNWIND
+    LOGF("Unhandled EHABI instruction 0x%02x", insn);
+#endif
+    mFailed = true;
+  }
+  return false;
+}
+
+
+bool operator<(const EHTable &lhs, const EHTable &rhs) {
+  return lhs.startPC() < rhs.startPC();
+}
+
+// Async signal unsafe.
+EHAddrSpace::EHAddrSpace(const std::vector<EHTable>& aTables)
+  : mTables(aTables)
+{
+  std::sort(mTables.begin(), mTables.end());
+  DebugOnly<uint32_t> lastEnd = 0;
+  for (std::vector<EHTable>::iterator i = mTables.begin();
+       i != mTables.end(); ++i) {
+    MOZ_ASSERT(i->startPC() >= lastEnd);
+    mStarts.push_back(i->startPC());
+    lastEnd = i->endPC();
+  }
+}
+
+const EHTable *EHAddrSpace::lookup(uint32_t aPC) const {
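+  // mStarts is sorted; find the last table whose start PC is <= aPC,
+  // then check that aPC falls before that table's end.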
+  ptrdiff_t i = (std::upper_bound(mStarts.begin(), mStarts.end(), aPC)
+                 - mStarts.begin()) - 1;
+
+  if (i < 0 || aPC >= mTables[i].endPC())
+    return 0;
+  return &mTables[i];
+}
+
+
+bool operator<(const EHEntryHandle &lhs, const EHEntryHandle &rhs) {
+  return lhs.value()->startPC.compute() < rhs.value()->startPC.compute();
+}
+
+const EHEntry *EHTable::lookup(uint32_t aPC) const {
+  MOZ_ASSERT(aPC >= mStartPC);
+  if (aPC >= mEndPC)
+    return NULL;
+
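+  // Binary search for the last entry whose start PC is <= aPC; the
+  // entries were sorted by start PC when the table was built.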
+  std::vector<EHEntryHandle>::const_iterator begin = mEntries.begin();
+  std::vector<EHEntryHandle>::const_iterator end = mEntries.end();
+  MOZ_ASSERT(begin < end);
+  if (aPC < reinterpret_cast<uint32_t>(begin->value()->startPC.compute()))
+    return NULL;
+
+  while (end - begin > 1) {
+    std::vector<EHEntryHandle>::const_iterator mid = begin + (end - begin) / 2;
+    if (aPC < reinterpret_cast<uint32_t>(mid->value()->startPC.compute()))
+      end = mid;
+    else
+      begin = mid;
+  }
+  return begin->value();
+}
+
+
+#if MOZ_LITTLE_ENDIAN
+static const unsigned char hostEndian = ELFDATA2LSB;
+#elif MOZ_BIG_ENDIAN
+static const unsigned char hostEndian = ELFDATA2MSB;
+#else
+#error "No endian?"
+#endif
+
+// Async signal unsafe.  (Note use of std::vector::reserve.)
+EHTable::EHTable(const void *aELF, size_t aSize, const std::string &aName)
+  : mStartPC(~0), // largest uint32_t
+    mEndPC(0),
+    mName(aName)
+{
+  const uint32_t base = reinterpret_cast<uint32_t>(aELF);
+
+  if (aSize < sizeof(Elf32_Ehdr))
+    return;
+
+  const Elf32_Ehdr &file = *(reinterpret_cast<Elf32_Ehdr *>(base));
+  if (memcmp(&file.e_ident[EI_MAG0], ELFMAG, SELFMAG) != 0 ||
+      file.e_ident[EI_CLASS] != ELFCLASS32 ||
+      file.e_ident[EI_DATA] != hostEndian ||
+      file.e_ident[EI_VERSION] != EV_CURRENT ||
+      file.e_ident[EI_OSABI] != ELFOSABI_SYSV ||
+      file.e_ident[EI_ABIVERSION] != 0 ||
+      file.e_machine != EM_ARM ||
+      file.e_version != EV_CURRENT)
+    // e_flags?
+    return;
+
+  MOZ_ASSERT(file.e_phoff + file.e_phnum * file.e_phentsize <= aSize);
+  const Elf32_Phdr *exidxHdr = 0, *zeroHdr = 0;
+  for (unsigned i = 0; i < file.e_phnum; ++i) {
+    const Elf32_Phdr &phdr =
+      *(reinterpret_cast<Elf32_Phdr *>(base + file.e_phoff
+                                       + i * file.e_phentsize));
+    if (phdr.p_type == PT_ARM_EXIDX) {
+      exidxHdr = &phdr;
+    } else if (phdr.p_type == PT_LOAD) {
+      if (phdr.p_offset == 0) {
+        zeroHdr = &phdr;
+      }
+      if (phdr.p_flags & PF_X) {
+        mStartPC = std::min(mStartPC, phdr.p_vaddr);
+        mEndPC = std::max(mEndPC, phdr.p_vaddr + phdr.p_memsz);
+      }
+    }
+  }
+  if (!exidxHdr)
+    return;
+  if (!zeroHdr)
+    return;
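+  // The load bias: the difference between where the object is actually
+  // mapped (base) and the link-time address of the segment at file
+  // offset 0.  Adding it converts link-time vaddrs to runtime addresses.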
+  mLoadOffset = base - zeroHdr->p_vaddr;
+  mStartPC += mLoadOffset;
+  mEndPC += mLoadOffset;
+
+  // Create a sorted index of the index to work around linker bugs.
+  const EHEntry *startTable =
+    reinterpret_cast<const EHEntry *>(mLoadOffset + exidxHdr->p_vaddr);
+  const EHEntry *endTable =
+    reinterpret_cast<const EHEntry *>(mLoadOffset + exidxHdr->p_vaddr
+                                    + exidxHdr->p_memsz);
+  mEntries.reserve(endTable - startTable);
+  for (const EHEntry *i = startTable; i < endTable; ++i)
+    mEntries.push_back(i);
+  std::sort(mEntries.begin(), mEntries.end());
+}
+
+
+mozilla::Atomic<const EHAddrSpace*> EHAddrSpace::sCurrent(nullptr);
+
+// Async signal safe; can fail if Update() hasn't returned yet.
+const EHAddrSpace *EHAddrSpace::Get() {
+  return sCurrent;
+}
+
+// Collect unwinding information from loaded objects.  Calls after the
+// first have no effect.  Async signal unsafe.
+void EHAddrSpace::Update() {
+  const EHAddrSpace *space = sCurrent;
+  if (space)
+    return;
+
+  SharedLibraryInfo info = SharedLibraryInfo::GetInfoForSelf();
+  std::vector<EHTable> tables;
+
+  for (size_t i = 0; i < info.GetSize(); ++i) {
+    const SharedLibrary &lib = info.GetEntry(i);
+    if (lib.GetOffset() != 0)
+      // TODO: if it has a name, and we haven't seen a mapping of
+      // offset 0 for that file, try opening it and reading the
+      // headers instead.  The only thing I've seen so far that's
+      // linked so as to need that treatment is the dynamic linker
+      // itself.
+      continue;
+    EHTable tab(reinterpret_cast<const void *>(lib.GetStart()),
+              lib.GetEnd() - lib.GetStart(), lib.GetName());
+    if (tab.isValid())
+      tables.push_back(tab);
+  }
+  space = new EHAddrSpace(tables);
+
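+  // Publish the new address space atomically; if another thread got
+  // there first, discard ours and use the winner's.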
+  if (!sCurrent.compareExchange(nullptr, space)) {
+    delete space;
+    space = sCurrent;
+  }
+}
+
+
+EHState::EHState(const mcontext_t &context) {
+#ifdef linux
+  mRegs[0] = context.arm_r0;
+  mRegs[1] = context.arm_r1;
+  mRegs[2] = context.arm_r2;
+  mRegs[3] = context.arm_r3;
+  mRegs[4] = context.arm_r4;
+  mRegs[5] = context.arm_r5;
+  mRegs[6] = context.arm_r6;
+  mRegs[7] = context.arm_r7;
+  mRegs[8] = context.arm_r8;
+  mRegs[9] = context.arm_r9;
+  mRegs[10] = context.arm_r10;
+  mRegs[11] = context.arm_fp;
+  mRegs[12] = context.arm_ip;
+  mRegs[13] = context.arm_sp;
+  mRegs[14] = context.arm_lr;
+  mRegs[15] = context.arm_pc;
+#else
+# error "Unhandled OS for ARM EHABI unwinding"
+#endif
+}
+
+} // namespace mozilla
+
new file mode 100644
--- /dev/null
+++ b/tools/profiler/EHABIStackWalk.h
@@ -0,0 +1,33 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/*
+ * This is an implementation of stack unwinding according to a subset
+ * of the ARM Exception Handling ABI; see the comment at the top of
+ * the .cpp file for details.
+ */
+
+#ifndef mozilla_EHABIStackWalk_h__
+#define mozilla_EHABIStackWalk_h__
+
+#include <stddef.h>
+
+#ifdef ANDROID
+# include "android-signal-defs.h"
+#else
+# include <ucontext.h>
+#endif
+
+namespace mozilla {
+
+void EHABIStackWalkInit();
+
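+// Unwind the stack whose register state at the point of interruption is
+// aContext and whose highest address is stackBase.  Writes up to
+// aNumFrames SP/PC pairs into aSPs/aPCs and returns the number of
+// frames recovered.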
+size_t EHABIStackWalk(const mcontext_t &aContext, void *stackBase,
+                      void **aSPs, void **aPCs, size_t aNumFrames);
+
+}
+
+#endif
--- a/tools/profiler/TableTicker.cpp
+++ b/tools/profiler/TableTicker.cpp
@@ -44,16 +44,24 @@
 
 #if defined(MOZ_PROFILING) && (defined(XP_MACOSX) || defined(XP_WIN))
  #define USE_NS_STACKWALK
 #endif
 #ifdef USE_NS_STACKWALK
  #include "nsStackWalk.h"
 #endif
 
+#if defined(SPS_ARCH_arm) && defined(MOZ_WIDGET_GONK)
+ // Should also work on other Android and ARM Linux, but not tested there yet.
+ #define USE_EHABI_STACKWALK
+#endif
+#ifdef USE_EHABI_STACKWALK
+ #include "EHABIStackWalk.h"
+#endif
+
 using std::string;
 using namespace mozilla;
 
 #ifndef MAXPATHLEN
  #ifdef PATH_MAX
   #define MAXPATHLEN PATH_MAX
  #elif defined(MAX_PATH)
   #define MAXPATHLEN MAX_PATH
@@ -332,17 +340,17 @@ void addProfileEntry(volatile StackEntry
     aProfile.addTag(ProfileEntry('c', sampleLabel));
     lineno = entry.line();
   }
   if (lineno != -1) {
     aProfile.addTag(ProfileEntry('n', lineno));
   }
 }
 
-#ifdef USE_NS_STACKWALK
+#if defined(USE_NS_STACKWALK) || defined(USE_EHABI_STACKWALK)
 typedef struct {
   void** array;
   void** sp_array;
   size_t size;
   size_t count;
 } PCArray;
 
 static void mergeNativeBacktrace(ThreadProfile &aProfile, const PCArray &array) {
@@ -435,16 +443,36 @@ void TableTicker::doNativeBacktrace(Thre
   nsresult rv = NS_StackWalk(StackWalkCallback, /* skipFrames */ 0, maxFrames,
                              &array, thread, platformData);
 #endif
   if (NS_SUCCEEDED(rv))
     mergeNativeBacktrace(aProfile, array);
 }
 #endif
 
+#ifdef USE_EHABI_STACKWALK
+void TableTicker::doNativeBacktrace(ThreadProfile &aProfile, TickSample* aSample)
+{
+  void *pc_array[1000];
+  void *sp_array[1000];
+  PCArray array = {
+    pc_array,
+    sp_array,
+    mozilla::ArrayLength(pc_array),
+    0
+  };
+
+  ucontext_t *ucontext = reinterpret_cast<ucontext_t *>(aSample->context);
+  array.count = EHABIStackWalk(ucontext->uc_mcontext, aProfile.GetStackTop(),
+                               sp_array, pc_array, array.size);
+  mergeNativeBacktrace(aProfile, array);
+}
+
+#endif
+
 static
 void doSampleStackTrace(PseudoStack *aStack, ThreadProfile &aProfile, TickSample *sample)
 {
   // Sample
   // 's' tag denotes the start of a sample block
   // followed by 0 or more 'c' tags.
   aProfile.addTag(ProfileEntry('s', "(root)"));
   for (uint32_t i = 0; i < aStack->stackSize(); i++) {
@@ -497,17 +525,17 @@ void TableTicker::InplaceTick(TickSample
     if (!sLastTracerEvent.IsNull()) {
       TimeDuration delta = sample->timestamp - sLastTracerEvent;
       if (delta.ToMilliseconds() > 100.0) {
           recordSample = true;
       }
     }
   }
 
-#if defined(USE_NS_STACKWALK)
+#if defined(USE_NS_STACKWALK) || defined(USE_EHABI_STACKWALK)
   if (mUseStackWalk) {
     doNativeBacktrace(currThreadProfile, sample);
   } else {
     doSampleStackTrace(stack, currThreadProfile, mAddLeafAddresses ? sample : nullptr);
   }
 #else
   doSampleStackTrace(stack, currThreadProfile, mAddLeafAddresses ? sample : nullptr);
 #endif
--- a/tools/profiler/moz.build
+++ b/tools/profiler/moz.build
@@ -42,16 +42,20 @@ if CONFIG['MOZ_ENABLE_PROFILER_SPS']:
         'SQLiteInterposer.cpp',
     ]
 
     if CONFIG['OS_TARGET'] in ('Android', 'Linux'):
         CPP_SOURCES += [
             'shared-libraries-linux.cc',
             'platform-linux.cc',
         ]
+        if CONFIG['CPU_ARCH'] == 'arm':
+            CPP_SOURCES += [
+                'EHABIStackWalk.cpp',
+            ]
     elif CONFIG['OS_TARGET'] == 'Darwin':
         CPP_SOURCES += [
             'shared-libraries-macos.cc',
             'platform-macos.cc',
         ]
         CMMSRCS += [
             'shim_mac_dump_syms.mm',
         ]
--- a/tools/profiler/platform-linux.cc
+++ b/tools/profiler/platform-linux.cc
@@ -64,16 +64,21 @@
 #include "platform.h"
 #include "GeckoProfilerImpl.h"
 #include "mozilla/Mutex.h"
 #include "mozilla/Atomics.h"
 #include "ProfileEntry.h"
 #include "nsThreadUtils.h"
 #include "TableTicker.h"
 #include "UnwinderThread2.h"
+#if defined(__ARM_EABI__) && defined(MOZ_WIDGET_GONK)
+ // Should also work on other Android and ARM Linux, but not tested there yet.
+#define USE_EHABI_STACKWALK
+#include "EHABIStackWalk.h"
+#endif
 
 #include <string.h>
 #include <stdio.h>
 #include <list>
 
 #define SIGNAL_SAVE_PROFILE SIGUSR2
 
 #if defined(__GLIBC__)
@@ -304,16 +309,19 @@ Sampler::Sampler(double interval, bool p
 Sampler::~Sampler() {
   ASSERT(!signal_sender_launched_);
 }
 
 
 void Sampler::Start() {
   LOG("Sampler started");
 
+#ifdef USE_EHABI_STACKWALK
+  mozilla::EHABIStackWalkInit();
+#endif
   SamplerRegistry::AddActiveSampler(this);
 
   // Initialize signal handler communication
   sCurrentThreadProfile = NULL;
   if (sem_init(&sSignalHandlingDone, /* pshared: */ 0, /* value: */ 0) != 0) {
     LOG("Error initializing semaphore");
     return;
   }
--- a/tools/profiler/platform.cpp
+++ b/tools/profiler/platform.cpp
@@ -311,17 +311,17 @@ void mozilla_sampler_init(void* stackTop
   // NOTE: Default
   const char *val = PR_GetEnv("MOZ_PROFILER_STARTUP");
   if (!val || !*val) {
     return;
   }
 
   const char* features[] = {"js"
                          , "leaf"
-#if defined(XP_WIN) || defined(XP_MACOSX)
+#if defined(XP_WIN) || defined(XP_MACOSX) || (defined(SPS_ARCH_arm) && defined(linux))
                          , "stackwalk"
 #endif
 #if defined(SPS_OS_android) && !defined(MOZ_WIDGET_GONK)
                          , "java"
 #endif
                          };
   profiler_start(PROFILE_DEFAULT_ENTRY, PROFILE_DEFAULT_INTERVAL,
                          features, sizeof(features)/sizeof(const char*),