tools/jprof/stub/libmalloc.cpp
author Mike Hommey <mh+mozilla@glandium.org>
Fri, 05 Apr 2019 02:30:56 +0000
changeset 468491 60669a841a87644ba9b2f4d9f17f225e2eca0980
parent 454538 5f4630838d46dd81dadb13220a4af0da9e23a619
child 472073 e1993a1f09ac53cd1a04fdf6a87f8cad8e44f73e
permissions -rw-r--r--
Bug 1541792 - Replace linker magic with manual component registration. r=froydnj Before bug 938437, we had a rather large and error-prone nsStaticXULComponents.cpp used to register all modules. That was replaced with clever use of the linker, which allowed to avoid the mess that maintaining that file was. Fast forward to now, where after bug 1524687 and other work that preceded it, we have a much smaller number of remaining static xpcom components, registered via this linker hack, and don't expect to add any new ones. The list should eventually go down to zero. Within that context, it seems to be the right time to get rid of the magic, and with it the problems it causes on its own. Some of those components could probably be trivially be converted to static registration via .conf files, but I didn't want to deal with the possible need to increase the number of dummy modules in XPCOMInit.cpp. They can still be converted as a followup. Differential Revision: https://phabricator.services.mozilla.com/D26076

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
// vim:cindent:sw=4:et:ts=8:
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// The linux glibc hides part of sigaction if _POSIX_SOURCE is defined
#if defined(linux)
#  undef _POSIX_SOURCE
#  undef _SVID_SOURCE
#  ifndef _GNU_SOURCE
#    define _GNU_SOURCE
#  endif
#endif

#include <errno.h>
#if defined(linux)
#  include <linux/rtc.h>
#  include <pthread.h>
#endif
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <ucontext.h>
#include <execinfo.h>

#include "libmalloc.h"
#include "jprof.h"
#include <string.h>
#include <errno.h>
#include <dlfcn.h>

#ifdef NTO
#  include <sys/link.h>
extern r_debug _r_debug;
#else
#  include <link.h>
#endif

#define USE_GLIBC_BACKTRACE 1
// To debug, use #define JPROF_STATIC
#define JPROF_STATIC static

static int gLogFD = -1;
static pthread_t main_thread;

static bool gIsSlave = false;
static int gFilenamePID;

static void startSignalCounter(unsigned long millisec);
static int enableRTCSignals(bool enable);

//----------------------------------------------------------------------
// replace use of atexit()

static void DumpAddressMap();

struct JprofShutdown {
  JprofShutdown() {}
  ~JprofShutdown() { DumpAddressMap(); }
};

static void RegisterJprofShutdown() {
  // This instanciates the dummy class above, and will trigger the class
  // destructor when libxul is unloaded. This is equivalent to atexit(),
  // but gracefully handles dlclose().
  static JprofShutdown t;
}

#if defined(i386) || defined(_i386) || defined(__x86_64__)
JPROF_STATIC void CrawlStack(malloc_log_entry *me, void *stack_top,
                             void *top_instr_ptr) {
#  if USE_GLIBC_BACKTRACE
  // This probably works on more than x86!  But we need a way to get the
  // top instruction pointer, which is kindof arch-specific
  void *array[500];
  int cnt, i;
  u_long numpcs = 0;

  // This is from glibc.  A more generic version might use
  // libunwind and/or CaptureStackBackTrace() on Windows
  cnt = backtrace(&array[0], sizeof(array) / sizeof(array[0]));

  // StackHook->JprofLog->CrawlStack
  // Then we have sigaction, which replaced top_instr_ptr
  array[3] = top_instr_ptr;
  for (i = 3; i < cnt; i++) {
    me->pcs[numpcs++] = (char *)array[i];
  }
  me->numpcs = numpcs;

#  else
  // original code - this breaks on many platforms
  void **bp;
#    if defined(__i386)
  __asm__("movl %%ebp, %0" : "=g"(bp));
#    elif defined(__x86_64__)
  __asm__("movq %%rbp, %0" : "=g"(bp));
#    else
  // It would be nice if this worked uniformly, but at least on i386 and
  // x86_64, it stopped working with gcc 4.1, because it points to the
  // end of the saved registers instead of the start.
  bp = __builtin_frame_address(0);
#    endif
  u_long numpcs = 0;
  bool tracing = false;

  me->pcs[numpcs++] = (char *)top_instr_ptr;

  while (numpcs < MAX_STACK_CRAWL) {
    void **nextbp = (void **)*bp++;
    void *pc = *bp;
    if (nextbp < bp) {
      break;
    }
    if (tracing) {
      // Skip the signal handling.
      me->pcs[numpcs++] = (char *)pc;
    } else if (pc == top_instr_ptr) {
      tracing = true;
    }
    bp = nextbp;
  }
  me->numpcs = numpcs;
#  endif
}
#endif

//----------------------------------------------------------------------

static int rtcHz;
static int rtcFD = -1;
static bool circular = false;

#if defined(linux) || defined(NTO)
static void DumpAddressMap() {
  // Turn off the timer so we don't get interrupts during shutdown
#  if defined(linux)
  if (rtcHz) {
    enableRTCSignals(false);
  } else
#  endif
  {
    startSignalCounter(0);
  }

  char filename[2048];
  if (gIsSlave)
    snprintf(filename, sizeof(filename), "%s-%d", M_MAPFILE, gFilenamePID);
  else
    snprintf(filename, sizeof(filename), "%s", M_MAPFILE);

  int mfd = open(filename, O_CREAT | O_WRONLY | O_TRUNC, 0666);
  if (mfd >= 0) {
    malloc_map_entry mme;
    link_map *map = _r_debug.r_map;
    while (nullptr != map) {
      if (map->l_name && *map->l_name) {
        mme.nameLen = strlen(map->l_name);
        mme.address = map->l_addr;
        write(mfd, &mme, sizeof(mme));
        write(mfd, map->l_name, mme.nameLen);
#  if 0
	write(1, map->l_name, mme.nameLen);
	write(1, "\n", 1);
#  endif
      }
      map = map->l_next;
    }
    close(mfd);
  }
}
#endif

static bool was_paused = true;

JPROF_STATIC void JprofBufferDump();
JPROF_STATIC void JprofBufferClear();

static void ClearProfilingHook(int signum) {
  if (circular) {
    JprofBufferClear();
    puts("Jprof: cleared circular buffer.");
  }
}

static void EndProfilingHook(int signum) {
  if (circular) JprofBufferDump();

  DumpAddressMap();
  was_paused = true;
  puts("Jprof: profiling paused.");
}

//----------------------------------------------------------------------
// proper usage would be a template, including the function to find the
// size of an entry, or include a size header explicitly to each entry.
#if defined(linux)
#  define DUMB_LOCK() pthread_mutex_lock(&mutex);
#  define DUMB_UNLOCK() pthread_mutex_unlock(&mutex);
#else
#  define DUMB_LOCK() FIXME()
#  define DUMB_UNLOCK() FIXME()
#endif

class DumbCircularBuffer {
 public:
  DumbCircularBuffer(size_t init_buffer_size) {
    used = 0;
    buffer_size = init_buffer_size;
    buffer = (unsigned char *)malloc(buffer_size);
    head = tail = buffer;

#if defined(linux)
    pthread_mutexattr_t mAttr;
    pthread_mutexattr_settype(&mAttr, PTHREAD_MUTEX_RECURSIVE_NP);
    pthread_mutex_init(&mutex, &mAttr);
    pthread_mutexattr_destroy(&mAttr);
#endif
  }
  ~DumbCircularBuffer() {
    free(buffer);
#if defined(linux)
    pthread_mutex_destroy(&mutex);
#endif
  }

  void clear() {
    DUMB_LOCK();
    head = tail;
    used = 0;
    DUMB_UNLOCK();
  }

  bool empty() { return head == tail; }

  size_t space_available() {
    size_t result;
    DUMB_LOCK();
    if (tail > head)
      result = buffer_size - (tail - head) - 1;
    else
      result = head - tail - 1;
    DUMB_UNLOCK();
    return result;
  }

  void drop(size_t size) {
    // assumes correctness!
    DUMB_LOCK();
    head += size;
    if (head >= &buffer[buffer_size]) head -= buffer_size;
    used--;
    DUMB_UNLOCK();
  }

  bool insert(void *data, size_t size) {
    // can fail if not enough space in the entire buffer
    DUMB_LOCK();
    if (space_available() < size) return false;

    size_t max_without_wrap = &buffer[buffer_size] - tail;
    size_t initial = size > max_without_wrap ? max_without_wrap : size;
#if DEBUG_CIRCULAR
    fprintf(stderr, "insert(%d): max_without_wrap %d, size %d, initial %d\n",
            used, max_without_wrap, size, initial);
#endif
    memcpy(tail, data, initial);
    tail += initial;
    data = ((char *)data) + initial;
    size -= initial;
    if (size != 0) {
#if DEBUG_CIRCULAR
      fprintf(stderr, "wrapping by %d bytes\n", size);
#endif
      memcpy(buffer, data, size);
      tail = &(((unsigned char *)buffer)[size]);
    }

    used++;
    DUMB_UNLOCK();

    return true;
  }

  // for external access to the buffer (saving)
  void lock() { DUMB_LOCK(); }

  void unlock() { DUMB_UNLOCK(); }

  // XXX These really shouldn't be public...
  unsigned char *head;
  unsigned char *tail;
  unsigned int used;
  unsigned char *buffer;
  size_t buffer_size;

 private:
  pthread_mutex_t mutex;
};

class DumbCircularBuffer *JprofBuffer;

JPROF_STATIC void JprofBufferInit(size_t size) {
  JprofBuffer = new DumbCircularBuffer(size);
}

JPROF_STATIC void JprofBufferClear() {
  fprintf(stderr, "Told to clear JPROF circular buffer\n");
  JprofBuffer->clear();
}

JPROF_STATIC size_t JprofEntrySizeof(malloc_log_entry *me) {
  return offsetof(malloc_log_entry, pcs) + me->numpcs * sizeof(char *);
}

JPROF_STATIC void JprofBufferAppend(malloc_log_entry *me) {
  size_t size = JprofEntrySizeof(me);

  do {
    while (JprofBuffer->space_available() < size && JprofBuffer->used > 0) {
#if DEBUG_CIRCULAR
      fprintf(
          stderr,
          "dropping entry: %d in use, %d free, need %d, size_to_free = %d\n",
          JprofBuffer->used, JprofBuffer->space_available(), size,
          JprofEntrySizeof((malloc_log_entry *)JprofBuffer->head));
#endif
      JprofBuffer->drop(
          JprofEntrySizeof((malloc_log_entry *)JprofBuffer->head));
    }
    if (JprofBuffer->space_available() < size) return;

  } while (!JprofBuffer->insert(me, size));
}

JPROF_STATIC void JprofBufferDump() {
  JprofBuffer->lock();
#if DEBUG_CIRCULAR
  fprintf(
      stderr, "dumping JP_CIRCULAR buffer, %d of %d bytes\n",
      JprofBuffer->tail > JprofBuffer->head
          ? JprofBuffer->tail - JprofBuffer->head
          : JprofBuffer->buffer_size + JprofBuffer->tail - JprofBuffer->head,
      JprofBuffer->buffer_size);
#endif
  if (JprofBuffer->tail >= JprofBuffer->head) {
    write(gLogFD, JprofBuffer->head, JprofBuffer->tail - JprofBuffer->head);
  } else {
    write(gLogFD, JprofBuffer->head,
          &(JprofBuffer->buffer[JprofBuffer->buffer_size]) - JprofBuffer->head);
    write(gLogFD, JprofBuffer->buffer, JprofBuffer->tail - JprofBuffer->buffer);
  }
  JprofBuffer->clear();
  JprofBuffer->unlock();
}

//----------------------------------------------------------------------

JPROF_STATIC void JprofLog(u_long aTime, void *stack_top, void *top_instr_ptr) {
  // Static is simply to make debugging tolerable
  static malloc_log_entry me;

  me.delTime = aTime;
  me.thread = syscall(SYS_gettid);  // gettid();
  if (was_paused) {
    me.flags = JP_FIRST_AFTER_PAUSE;
    was_paused = 0;
  } else {
    me.flags = 0;
  }

  CrawlStack(&me, stack_top, top_instr_ptr);

#ifndef NTO
  if (circular) {
    JprofBufferAppend(&me);
  } else {
    write(gLogFD, &me, JprofEntrySizeof(&me));
  }
#else
  printf("Neutrino is missing the pcs member of malloc_log_entry!! \n");
#endif
}

static int realTime;

/* Lets interrupt at 10 Hz.  This is so my log files don't get too large.
 * This can be changed to a faster value latter.  This timer is not
 * programmed to reset, even though it is capable of doing so.  This is
 * to keep from getting interrupts from inside of the handler.
 */
static void startSignalCounter(unsigned long millisec) {
  struct itimerval tvalue;

  tvalue.it_interval.tv_sec = 0;
  tvalue.it_interval.tv_usec = 0;
  tvalue.it_value.tv_sec = millisec / 1000;
  tvalue.it_value.tv_usec = (millisec % 1000) * 1000;

  if (realTime) {
    setitimer(ITIMER_REAL, &tvalue, nullptr);
  } else {
    setitimer(ITIMER_PROF, &tvalue, nullptr);
  }
}

static long timerMilliSec = 50;

#if defined(linux)
static int setupRTCSignals(int hz, struct sigaction *sap) {
  /* global */ rtcFD = open("/dev/rtc", O_RDONLY);
  if (rtcFD < 0) {
    perror("JPROF_RTC setup: open(\"/dev/rtc\", O_RDONLY)");
    return 0;
  }

  if (sigaction(SIGIO, sap, nullptr) == -1) {
    perror("JPROF_RTC setup: sigaction(SIGIO)");
    return 0;
  }

  if (ioctl(rtcFD, RTC_IRQP_SET, hz) == -1) {
    perror("JPROF_RTC setup: ioctl(/dev/rtc, RTC_IRQP_SET, $JPROF_RTC_HZ)");
    return 0;
  }

  if (ioctl(rtcFD, RTC_PIE_ON, 0) == -1) {
    perror("JPROF_RTC setup: ioctl(/dev/rtc, RTC_PIE_ON)");
    return 0;
  }

  if (fcntl(rtcFD, F_SETSIG, 0) == -1) {
    perror("JPROF_RTC setup: fcntl(/dev/rtc, F_SETSIG, 0)");
    return 0;
  }

  if (fcntl(rtcFD, F_SETOWN, getpid()) == -1) {
    perror("JPROF_RTC setup: fcntl(/dev/rtc, F_SETOWN, getpid())");
    return 0;
  }

  return 1;
}

static int enableRTCSignals(bool enable) {
  static bool enabled = false;
  if (enabled == enable) {
    return 0;
  }
  enabled = enable;

  int flags = fcntl(rtcFD, F_GETFL);
  if (flags < 0) {
    perror("JPROF_RTC setup: fcntl(/dev/rtc, F_GETFL)");
    return 0;
  }

  if (enable) {
    flags |= FASYNC;
  } else {
    flags &= ~FASYNC;
  }

  if (fcntl(rtcFD, F_SETFL, flags) == -1) {
    if (enable) {
      perror("JPROF_RTC setup: fcntl(/dev/rtc, F_SETFL, flags | FASYNC)");
    } else {
      perror("JPROF_RTC setup: fcntl(/dev/rtc, F_SETFL, flags & ~FASYNC)");
    }
    return 0;
  }

  return 1;
}
#endif

JPROF_STATIC void StackHook(int signum, siginfo_t *info, void *ucontext) {
  static struct timeval tFirst;
  static int first = 1;
  size_t millisec = 0;

#if defined(linux)
  if (rtcHz && pthread_self() != main_thread) {
    // Only collect stack data on the main thread, for now.
    return;
  }
#endif

  if (first && !(first = 0)) {
    puts("Jprof: received first signal");
#if defined(linux)
    if (rtcHz) {
      enableRTCSignals(true);
    } else
#endif
    {
      gettimeofday(&tFirst, 0);
      millisec = 0;
    }
  } else {
#if defined(linux)
    if (rtcHz) {
      enableRTCSignals(true);
    } else
#endif
    {
      struct timeval tNow;
      gettimeofday(&tNow, 0);
      double usec = 1e6 * (tNow.tv_sec - tFirst.tv_sec);
      usec += (tNow.tv_usec - tFirst.tv_usec);
      millisec = static_cast<size_t>(usec * 1e-3);
    }
  }

  gregset_t &gregs = ((ucontext_t *)ucontext)->uc_mcontext.gregs;
#ifdef __x86_64__
  JprofLog(millisec, (void *)gregs[REG_RSP], (void *)gregs[REG_RIP]);
#else
  JprofLog(millisec, (void *)gregs[REG_ESP], (void *)gregs[REG_EIP]);
#endif

  if (!rtcHz) startSignalCounter(timerMilliSec);
}

NS_EXPORT_(void) setupProfilingStuff(void) {
  static int gFirstTime = 1;
  char filename[2048];  // XXX fix

  if (gFirstTime && !(gFirstTime = 0)) {
    int startTimer = 1;
    int doNotStart = 1;
    int firstDelay = 0;
    int append = O_TRUNC;
    char *tst = getenv("JPROF_FLAGS");

    /* Options from JPROF_FLAGS environment variable:
     *   JP_DEFER  -> Wait for a SIGPROF (or SIGALRM, if JP_REALTIME
     *               is set) from userland before starting
     *               to generate them internally
     *   JP_START  -> Install the signal handler
     *   JP_PERIOD -> Time between profiler ticks
     *   JP_FIRST  -> Extra delay before starting
     *   JP_REALTIME -> Take stack traces in intervals of real time
     *               rather than time used by the process (and the
     *               system for the process).  This is useful for
     *               finding time spent by the X server.
     *   JP_APPEND -> Append to jprof-log rather than overwriting it.
     *               This is somewhat risky since it depends on the
     *               address map staying constant across multiple runs.
     *   JP_FILENAME -> base filename to use when saving logs.  Note that
     *               this does not affect the mapfile.
     *   JP_CIRCULAR -> use a circular buffer of size N, write/clear on SIGUSR1
     *
     * JPROF_SLAVE is set if this is not the first process.
     */

    circular = false;

    if (tst) {
      if (strstr(tst, "JP_DEFER")) {
        doNotStart = 0;
        startTimer = 0;
      }
      if (strstr(tst, "JP_START")) doNotStart = 0;
      if (strstr(tst, "JP_REALTIME")) realTime = 1;
      if (strstr(tst, "JP_APPEND")) append = O_APPEND;

      char *delay = strstr(tst, "JP_PERIOD=");
      if (delay) {
        double tmp = strtod(delay + strlen("JP_PERIOD="), nullptr);
        if (tmp >= 1e-3) {
          timerMilliSec = static_cast<unsigned long>(1000 * tmp);
        } else {
          fprintf(stderr, "JP_PERIOD of %g less than 0.001 (1ms), using 1ms\n",
                  tmp);
          timerMilliSec = 1;
        }
      }

      char *circular_op = strstr(tst, "JP_CIRCULAR=");
      if (circular_op) {
        size_t size = atol(circular_op + strlen("JP_CIRCULAR="));
        if (size < 1000) {
          fprintf(stderr, "JP_CIRCULAR of %lu less than 1000, using 10000\n",
                  (unsigned long)size);
          size = 10000;
        }
        JprofBufferInit(size);
        fprintf(stderr, "JP_CIRCULAR buffer of %lu bytes\n",
                (unsigned long)size);
        circular = true;
      }

      char *first = strstr(tst, "JP_FIRST=");
      if (first) {
        firstDelay = atol(first + strlen("JP_FIRST="));
      }

      char *rtc = strstr(tst, "JP_RTC_HZ=");
      if (rtc) {
#if defined(linux)
        rtcHz = atol(rtc + strlen("JP_RTC_HZ="));
        timerMilliSec = 0; /* This makes JP_FIRST work right. */
        realTime = 1;      /* It's the _R_TC and all.  ;) */

#  define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0)

        if (!IS_POWER_OF_TWO(rtcHz) || rtcHz < 2) {
          fprintf(stderr,
                  "JP_RTC_HZ must be power of two and >= 2, "
                  "but %d was provided; using default of 2048\n",
                  rtcHz);
          rtcHz = 2048;
        }
#else
        fputs(
            "JP_RTC_HZ found, but RTC profiling only supported on "
            "Linux!\n",
            stderr);

#endif
      }
      const char *f = strstr(tst, "JP_FILENAME=");
      if (f)
        f = f + strlen("JP_FILENAME=");
      else
        f = M_LOGFILE;

      char *is_slave = getenv("JPROF_SLAVE");
      if (!is_slave) setenv("JPROF_SLAVE", "", 0);
      gIsSlave = !!is_slave;

      gFilenamePID = syscall(SYS_gettid);  // gettid();
      if (is_slave)
        snprintf(filename, sizeof(filename), "%s-%d", f, gFilenamePID);
      else
        snprintf(filename, sizeof(filename), "%s", f);

      // XXX FIX! inherit current capture state!
    }

    if (!doNotStart) {
      if (gLogFD < 0) {
        gLogFD = open(filename, O_CREAT | O_WRONLY | append, 0666);
        if (gLogFD < 0) {
          fprintf(stderr, "Unable to create " M_LOGFILE);
          perror(":");
        } else {
          struct sigaction action;
          sigset_t mset;

          // Dump out the address map when we terminate
          RegisterJprofShutdown();

          main_thread = pthread_self();
          // fprintf(stderr,"jprof: main_thread = %u\n",
          //        (unsigned int)main_thread);

          // FIX!  probably should block these against each other
          // Very unlikely.
          sigemptyset(&mset);
          action.sa_handler = nullptr;
          action.sa_sigaction = StackHook;
          action.sa_mask = mset;
          action.sa_flags = SA_RESTART | SA_SIGINFO;
#if defined(linux)
          if (rtcHz) {
            if (!setupRTCSignals(rtcHz, &action)) {
              fputs(
                  "jprof: Error initializing RTC, NOT "
                  "profiling\n",
                  stderr);
              return;
            }
          }

          if (!rtcHz || firstDelay != 0)
#endif
          {
            if (realTime) {
              sigaction(SIGALRM, &action, nullptr);
            }
          }
          // enable PROF in all cases to simplify JP_DEFER/pause/restart
          sigaction(SIGPROF, &action, nullptr);

          // make it so a SIGUSR1 will stop the profiling
          // Note:  It currently does not close the logfile.
          // This could be configurable (so that it could
          // later be reopened).

          struct sigaction stop_action;
          stop_action.sa_handler = EndProfilingHook;
          stop_action.sa_mask = mset;
          stop_action.sa_flags = SA_RESTART;
          sigaction(SIGUSR1, &stop_action, nullptr);

          // make it so a SIGUSR2 will clear the circular buffer

          stop_action.sa_handler = ClearProfilingHook;
          stop_action.sa_mask = mset;
          stop_action.sa_flags = SA_RESTART;
          sigaction(SIGUSR2, &stop_action, nullptr);

          printf(
              "Jprof: Initialized signal handler and set "
              "timer for %lu %s, %d s "
              "initial delay\n",
              rtcHz ? rtcHz : timerMilliSec, rtcHz ? "Hz" : "ms", firstDelay);

          if (startTimer) {
#if defined(linux)
            /* If we have an initial delay we can just use
               startSignalCounter to set up a timer to fire the
               first stackHook after that delay.  When that happens
               we'll go and switch to RTC profiling. */
            if (rtcHz && firstDelay == 0) {
              puts("Jprof: enabled RTC signals");
              enableRTCSignals(true);
            } else
#endif
            {
              puts("Jprof: started timer");
              startSignalCounter(firstDelay * 1000 + timerMilliSec);
            }
          }
        }
      }
    }
  } else {
    printf("setupProfilingStuff() called multiple times\n");
  }
}