systemtap/addrsymfilt.py
author Andrew Sutherland <asutherland@asutherland.org>
Thu, 03 Feb 2011 00:07:53 -0800
changeset 101 0626e0045f76
parent 81 1b2abfb29a0b
permissions -rwxr-xr-x
add the patch required to feed the analyzer logic
#!/usr/bin/python
# MPL/GPL/LGPL licensed
# Andrew Sutherland <asutherland@asutherland.org>
#
# Usage: addrsymfilt.py <PID>
#
# Example: stap mytap.stp | addrsymfilt.py `pgrep thunderbird-bin`
#
# We filter the provided text stream, replacing any addresses we find with the
#  closest preceding symbol found in the process.
#
# This is accomplished by reading /proc/PID/maps to understand the address space
# of the process.  Once we have this information, we are able to get the symbols
# for each mapped binary using 'nm'.  (We could be fancier, but why bother?)  We
# do not do anything with DWARF debug symbols, but do ask nm to perform C++
# demangling for us.
#

import sys, re
import subprocess


def hexparse(x):
    '''
    Interpret the string `x` as a base-16 number and return it as an integer.
    '''
    hex_base = 16
    parsed = int(x, hex_base)
    return parsed

class BinaryInfo(object):
    '''
    Provides address to symbol translation for a binary with on-demand retrieval
    of symbols.  Because we only fetch things on-demand, it's okay even if the
    binaries are data things like fonts.

    We do some pragmatic if likely sketchy things when it comes to address
    mapping.  Sadly, I even used to know how to do these things properly, but
    it turns out this is easier than re-learning.  See _loadOffsetInfo.

    Nothing is read from disk until the first translateAddress call, which
    runs readelf and nm against `path`.
    '''
    def __init__(self, path):
        '''
        @param path  filesystem path of the binary to pull symbols from
        '''
        self.path = path
        #: the adjustment to apply if we have any offset passed
        self.offsetAdjustment = 0
        #: (addr, trimmed name, raw name) tuples in the order nm printed them
        #:  (we ask nm for --numeric-sort), or None until _loadSymbols has run
        self.symbols = None

    def _loadOffsetInfo(self):
        '''
        Figure out how to map the file offset to the virtual address offset.

        We take all the LOAD segments from readelf -l and record the offset and
        VirtAddr.  This will allow us to map using the offset passed in from
        the offset provided by /proc/PID/maps to just directly have the virt
        addr to add on.

        Every LOAD line overwrites self.offsetAdjustment, so the value left
        behind is the one computed from the last LOAD line printed.
        self.offsetInfo is reset to an empty dict here and not filled in.
        '''
        self.offsetInfo = {}
        args = ['/usr/bin/readelf', '-l', self.path]
        # universal_newlines gives us text lines, so the comparisons against
        # str literals below behave the same on Python 2 and Python 3.
        proc = subprocess.Popen(args, stdout=subprocess.PIPE,
                                universal_newlines=True)
        try:
            for line in proc.stdout:
                line = line.strip()
                # we only care about load lines
                if not line.startswith('LOAD'):
                    continue
                bits = line.split(None, 3)
                try:
                    offset = int(bits[1], 16)
                    virtAddr = int(bits[2], 16)
                except (IndexError, ValueError):
                    # a LOAD line we cannot parse should not kill us.
                    continue
                self.offsetAdjustment = virtAddr - offset
        finally:
            # always release the pipe and reap the child, even on error
            proc.stdout.close()
            proc.wait()

    @staticmethod
    def _trimSymbolName(symname):
        '''
        Shorten a demangled symbol name for display.

        - 'non-virtual thunk to X' becomes 'thunk:X'.
        - nsRunnableMethod names (and their vtables) are left alone after that.
        - Otherwise the contents of the first parenthesized group are replaced
          by '...', unless the group is already empty ('()').

        Names whose first '(' has no following ')' are returned without the
        '...' substitution rather than being mangled.

        @returns the shortened name
        '''
        # - trim the symbol name of junk
        thunkPrefix = 'non-virtual thunk to '
        if symname.startswith(thunkPrefix):
            symname = 'thunk:' + symname[len(thunkPrefix):]

        # - Special (non-)transformations...
        if symname.startswith(('nsRunnableMethod',
                               'vtable for nsRunnableMethod')):
            return symname

        # - Otherwise clean up ranty big parts
        idxParen = symname.find('(')
        if idxParen < 0:
            return symname
        # slicing (not indexing) so a trailing '(' cannot raise IndexError
        if symname[idxParen+1:idxParen+2] == ')':
            return symname
        ridxParen = symname.find(')', idxParen+1)
        if ridxParen < 0:
            # unbalanced; there is nothing sensible to elide
            return symname
        return symname[:idxParen+1] + '...' + symname[ridxParen:]

    def _loadSymbols(self):
        '''
        Grab symbols from the binary via nm.

        Populates self.symbols with (addr, trimmed name, raw name) tuples.
        When several symbols share an address only the first one nm prints is
        kept.  Lines that do not look like 'ADDR TYPE NAME' are skipped.
        '''
        self._loadOffsetInfo()
        self.symbols = []
        lastaddr = -1

        args = ['/usr/bin/nm', '--demangle', '--defined-only',
                '--numeric-sort', self.path]
        proc = subprocess.Popen(args, stdout=subprocess.PIPE,
                                universal_newlines=True)
        try:
            for line in proc.stdout:
                try:
                    addrStr, symtype, symname = line.rstrip().split(None, 2)
                    addr = hexparse(addrStr)
                except ValueError:
                    # weird lines should not kill us.
                    continue

                # no dupes!
                if addr == lastaddr:
                    continue

                lastaddr = addr
                self.symbols.append(
                    (addr, self._trimSymbolName(symname), symname))
        finally:
            # always release the pipe and reap the child, even on error
            proc.stdout.close()
            proc.wait()

    def translateAddress(self, addr, offset, raw=False):
        '''
        Find the closest symbol at or before the given address.

        @param addr    address to look up (the caller in this file passes it
                       relative to the start of the mapping it fell in)
        @param offset  file offset of that mapping; when non-zero, it and
                       self.offsetAdjustment are added to addr before lookup
        @param raw     if true, return the untrimmed name as printed by nm
        @returns (symbol string, overshoot in bytes), or (None, None) when
                 there are no symbols or addr precedes the first one
        '''
        if self.symbols is None:
            self._loadSymbols()

        # tuple slot holding the name flavor the caller asked for
        result_index = 2 if raw else 1

        if offset:
            addr += offset + self.offsetAdjustment

        symbols = self.symbols
        if not symbols:
            return None, None

        # binary search over the address-sorted symbol list
        lo = 0
        hi = len(symbols)
        while lo < hi:
            mid = (lo+hi)//2
            midtupe = symbols[mid]
            midaddr = midtupe[0]
            if midaddr < addr:
                lo = mid+1
            elif midaddr > addr:
                hi = mid
            else:
                # exact match! hooray!
                return midtupe[result_index], 0
        # not an exact match, lo-1 is our index if lo>0
        if lo:
            midtupe = symbols[lo-1]
            return midtupe[result_index], addr-midtupe[0]
        return None, None

class ProcInfo(object):
    '''
    Address-to-symbol translation for a whole process.

    Holds the file-backed mappings read from a /proc/PID/maps style file and
    hands each address to the BinaryInfo for the mapping it falls in.
    '''
    def __init__(self, pid, mappath=None):
        '''
        @param pid      process id (anything int() accepts) or None
        @param mappath  optional path of a maps file to read instead of
                        /proc/PID/maps

        When both pid and mappath are None nothing is read and the instance
        starts out with no ranges.
        '''
        #: Tuples of (low addr, high addr, adjust, binary).
        #:  The low address is inclusive, the high address is exclusive.
        self.ranges = []
        self.binaries_by_path = {}

        if pid is None:
            self.pid = None
            if mappath is None:
                return
        else:
            self.pid = int(pid)
        self._read_maps(mappath)

    def _read_maps(self, mappath=None):
        '''
        Read /proc/PID/maps to get info about the address space and store it in
        self.ranges.

        Mappings without a path, with a bracketed pseudo-path, or whose file
        has been deleted are skipped.  One BinaryInfo is shared by all
        mappings of the same path.
        '''
        # example:
        #address           perms offset  dev   inode      pathname
        #08040000-08050000 r-xp 00000000 01:02 12345      /usr/bin/ls
        # perms: read/write/execute/shared/private (copy on write)
        # offset: the offset into the mapped file
        # dev: major/minor device number of the file's origin
        # inode: inode on the origin device
        if mappath is None:
            mappath = '/proc/%d/maps' % (self.pid,)
        # 'with' so the file is closed even if a line fails to parse
        with open(mappath, 'r') as mapfile:
            for line in mapfile:
                bits = line.rstrip().split(None, 5)
                # ignore things without paths, we can't look in them
                if len(bits) < 6:
                    continue
                path = bits[5]
                # ignore stack/vdso/vsyscall
                if path.startswith('['):
                    continue
                # ignore things we can't get at
                if path.endswith(' (deleted)'):
                    continue
                if path not in self.binaries_by_path:
                    self.binaries_by_path[path] = BinaryInfo(path)
                binary = self.binaries_by_path[path]

                addr_low, addr_high = map(hexparse, bits[0].split('-'))
                offset = hexparse(bits[2])

                self.ranges.append((addr_low, addr_high, offset, binary))

    def translateAddress(self, addr, raw=False):
        '''
        Map the provided address into a binary's address space using info from
        self.ranges and then ask the binary (a BinaryInfo instance) to
        map that normalized address into a useful symbol.

        @param addr  absolute address in the process
        @param raw   passed through to the binary's translateAddress
        @returns whatever the binary returns for the address, or (None, None)
                 if the address is not inside any known range
        '''
        ranges = self.ranges
        if not ranges:
            return None, None

        # Binary search; this relies on self.ranges being in ascending address
        # order.  _read_maps appends in file order -- presumably the maps file
        # is already sorted by address; confirm if feeding a custom mappath.
        lo = 0
        hi = len(ranges)
        while lo < hi:
            mid = (lo+hi)//2
            midtupe = ranges[mid]
            range_start = midtupe[0]
            range_end = midtupe[1]
            if range_end <= addr:
                lo = mid+1
            elif range_start > addr:
                hi = mid
            else:
                # in the range, hooray!
                binary = midtupe[3]
                offset = midtupe[2]
                return binary.translateAddress(addr-range_start, offset, raw)
        return None, None

    def normalizeHexAddress(self, hexaddr, command, padding=None):
        '''
        Turn a hex address string into a symbol name if we can.

        @param hexaddr  address as a hex string
        @param command  formatting command or None/empty.  'raw' asks for the
                        untrimmed symbol name, 'vt' strips a leading
                        'vtable for '.  With any command the overshoot is
                        dropped; with none it is appended as '+<hex>'.
        @param padding  optional width (anything int() accepts) to left-justify
                        the name to; only honored when a command is given
        @returns the formatted symbol name, or hexaddr unchanged if no symbol
                 was found
        '''
        addr = int(hexaddr, 16)
        symname, overshoot = self.translateAddress(addr, raw=(command == 'raw'))
        if not symname:
            return hexaddr

        if command:
            if command == 'vt':
                if symname.startswith('vtable for '):
                    symname = symname[11:]
            if padding:
                symname = symname.ljust(int(padding))
            return symname
        if overshoot:
            return '%s+%x' % (symname, overshoot)
        return symname

    #: matches a whole string of the form ':!<2-letter command>[,<pad>]:<hex>'
    full_addr_hex_re = re.compile(r'^:!([a-z]{2,2})(?:,(\d+))?:([0-9a-f]+)$')
    def transformString(self, s):
        '''
        If the entire string is an address directive (see full_addr_hex_re),
        return its translation; otherwise return the string unchanged.
        '''
        match = self.full_addr_hex_re.match(s)
        if match:
            command = match.group(1)
            padding = match.group(2)
            hexaddr = match.group(3)
            return self.normalizeHexAddress(hexaddr, command, padding)
        return s

    def transformStackString(self, s):
        '''
        Given a string with space-delimited pointers, return a list of
        symbol names.

        Runs of whitespace are treated as a single delimiter, so an empty
        string yields an empty list instead of tripping over an empty token.
        '''
        frames = []
        # split() with no argument drops empty tokens; split(' ') would hand
        # '' to int() for empty input or doubled spaces.
        for addr in s.split():
            frames.append(self.normalizeHexAddress(addr, 'raw'))
        return frames

def main(pid):
    '''
    Filter stdin to stdout, replacing addresses with symbol names looked up in
    the address space of the process `pid`.

    Two forms are rewritten on each line, in this order:
    - ':!<2-letter command>[,<pad>]:<hex>' directives, formatted according to
      the command;
    - plain '0x<hex>' addresses, replaced by the bare symbol name.
    Addresses that cannot be resolved are left as they were.

    @param pid  process id whose /proc/PID/maps describes the address space
    '''
    normal_hex_re = re.compile(r"0x[0-9a-f]+")
    addr_hex_re = re.compile(r':!([a-z]{2,2})(?:,(\d+))?:([0-9a-f]+)')

    proc = ProcInfo(pid)

    def normal_replacer(match):
        hexaddr = match.group(0)
        symname, overshoot = proc.translateAddress(int(hexaddr, 16))
        return symname or hexaddr

    def replacer(match):
        command = match.group(1)
        padding = match.group(2)
        hexaddr = match.group(3)
        return proc.normalizeHexAddress(hexaddr, command, padding)

    # readline() returns '' only at end of input, so use that as the stop
    # sentinel.  Testing sys.stdin.closed does not work: the stream is not
    # marked closed at EOF, which left the old loop spinning forever once the
    # producer went away.
    for line in iter(sys.stdin.readline, ''):
        line = addr_hex_re.sub(replacer, line)
        line = normal_hex_re.sub(normal_replacer, line)
        sys.stdout.write(line)

if __name__ == '__main__':
    # exactly one argument, the PID of the process to resolve against
    if len(sys.argv) != 2:
        # terminate the message with a newline so it does not run into the
        # shell prompt, and remind the user of the expected invocation
        sys.stderr.write('We need the PID as an arg!\n')
        sys.stderr.write('Usage: %s <PID>\n' % (sys.argv[0],))
        sys.exit(1)

    main(sys.argv[1])