find-interesting-modules.py
author L. David Baron <dbaron@dbaron.org>
Tue, 22 Sep 2009 23:43:00 -0700
changeset 3 eb077231986ca3e1c0fc58a5e5a732990fdd401b
parent 2 20629239f4f99b34eec19d35e369bf966b6526d7
child 4 9e133145ce8124d296b7117160ca34c2cd48ba41
permissions -rwxr-xr-x
Add modeline and license block.

#!/usr/bin/python
# vim: set shiftwidth=4 tabstop=4 autoindent expandtab:
# ***** BEGIN LICENSE BLOCK *****
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is find-interesting-modules.py.
#
# The Initial Developer of the Original Code is the Mozilla Foundation.
# Portions created by the Initial Developer are Copyright (C) 2009
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   L. David Baron <dbaron@dbaron.org>, Mozilla Corporation (original author)
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ***** END LICENSE BLOCK *****

import json
import gzip
import os

# Root of the directory tree that all_crashes() walks; every file found
# beneath it is opened as a gzip-compressed JSON crash report.
JSONZ_FILES_DIR = "/home/dbaron/crash-stats/mnt"
# Alternative root (a subdirectory of the one above), kept for reference:
#JSONZ_FILES_DIR = "/home/dbaron/crash-stats/mnt/old_bp_jsonz/name/00/"
# Product and version that every crash report is required to carry;
# all_crashes() raises on the first report that does not match.
REQUIRED_PRODUCT = "Firefox"
REQUIRED_VERSION = "3.5.3"

def all_crashes(jsonz_dir=None, product=None, version=None):
    """Generate every crash report found beneath a directory tree.

    Each file under the tree is opened as a gzip-compressed JSON document
    and the decoded object is yielded.

    Arguments (all optional; None selects the module-level setting):
      jsonz_dir -- root directory to walk (default JSONZ_FILES_DIR)
      product   -- required value of crash["product"]
                   (default REQUIRED_PRODUCT)
      version   -- required value of crash["version"]
                   (default REQUIRED_VERSION)

    Raises StandardError as soon as a report names a different product
    or version.  Errors from gzip or json on unreadable files propagate
    unchanged.
    """
    # Resolve defaults at call time so that the module-level settings are
    # only needed when the caller does not override them.
    if jsonz_dir is None:
        jsonz_dir = JSONZ_FILES_DIR
    if product is None:
        product = REQUIRED_PRODUCT
    if version is None:
        version = REQUIRED_VERSION

    for root, _dirs, filenames in os.walk(jsonz_dir):
        for filename in filenames:
            stream = gzip.open(os.path.join(root, filename), "rb")
            # try/finally rather than "with": GzipFile only became a
            # context manager in Python 2.7.  This guarantees the handle
            # is released even when the file is not valid gzip/JSON.
            try:
                crash = json.load(stream)
            finally:
                stream.close()
            if crash["product"] != product:
                raise StandardError("Unexpected product " + crash["product"])
            if crash["version"] != version:
                raise StandardError("Unexpected version " + crash["version"])
            yield crash

# For each operating system, accumulate total and per-stack-signature
# counts of the total number of crashes and the number of crashes in
# which each module was found.
#
# Resulting structure:
#   osyses[os_name] = {
#       "count": <crashes seen on this OS>,
#       "module_counts": { libname: <crashes listing that module> },
#       "signatures": {
#           signature: { "count": <crashes with this signature>,
#                        "module_counts": { libname: <crashes> } } } }
osyses = {}
for crash in all_crashes():
    if "os_name" not in crash:
        # We have some bad crash reports.
        continue

    # dict.setdefault(key, default) returns dict[key], first storing
    # default under key if the key was missing.
    osys = osyses.setdefault(crash["os_name"],
                             { "count": 0,
                               "signatures": {},
                               "module_counts": {} })
    signature = osys["signatures"].setdefault(crash["signature"],
                                              { "count": 0,
                                                "module_counts": {} })
    # Everything below is tallied twice: once for the whole OS and once
    # for this crash's stack signature.
    accumulate_objs = [osys, signature]

    for obj in accumulate_objs:
        obj["count"] += 1

    # Gather the distinct module names listed in this crash's dump.  A
    # set is used so that a library appearing on more than one "Module|"
    # line is still counted once for this crash; the tallies are meant to
    # be "number of crashes containing the module", and counting per line
    # could push a module's ratio above 100%.
    libnames = set()
    for dump_line in crash["dump"].split("\n"):
        if dump_line.startswith("Module|"):
            # Only libname is used.  The full 8-field unpack is kept on
            # purpose: it raises ValueError if a module line does not
            # have the expected number of "|"-separated fields.
            module_str, libname, version, pdb, checksum, addrstart, addrend, \
              unknown = dump_line.split("|")
            libnames.add(libname)

    # Increment the global count on osys and the per-signature count.
    for obj in accumulate_objs:
        counts = obj["module_counts"]
        for libname in libnames:
            counts[libname] = counts.get(libname, 0) + 1

# Report, grouped by operating system, every stack signature seen in at
# least MIN_CRASHES crashes.  Under each signature, list the modules whose
# share of that signature's crashes exceeds their share of all crashes on
# the same OS by at least MIN_BASELINE_DIFF (0.05 == 5 percentage points),
# largest excess first.
MIN_CRASHES = 10
MIN_BASELINE_DIFF = 0.05
for osname, osys in osyses.items():
    print("")
    print(osname)
    os_total = osys["count"]
    os_module_counts = osys["module_counts"]

    # Signatures with enough crashes, most frequent first.
    frequent = []
    for entry in osys["signatures"].items():
        if entry[1]["count"] >= MIN_CRASHES:
            frequent.append(entry)
    frequent = sorted(frequent, key=lambda entry: entry[1]["count"],
                      reverse=True)

    for signame, sig in frequent:
        sig_total = sig["count"]
        print("  %s (%d crashes)" % (signame, sig_total))

        # One row per module that is over-represented in this signature:
        # (libname, count in signature, ratio in signature,
        #  count on OS, ratio on OS)
        rows = []
        for libname, in_sig_count in sig["module_counts"].items():
            in_os_count = os_module_counts[libname]
            in_sig_ratio = float(in_sig_count) / sig_total
            in_os_ratio = float(in_os_count) / os_total
            if in_sig_ratio - in_os_ratio >= MIN_BASELINE_DIFF:
                rows.append((libname, in_sig_count, in_sig_ratio,
                             in_os_count, in_os_ratio))
        rows.sort(key=lambda row: row[2] - row[4], reverse=True)

        for libname, in_sig_count, in_sig_ratio, in_os_count, \
              in_os_ratio in rows:
            sig_percent = int(round(in_sig_ratio * 100))
            os_percent = int(round(in_os_ratio * 100))
            print("    %3d%% (%d/%d) vs. %3d%% (%d/%d) %s" %
                  (sig_percent, in_sig_count, sig_total,
                   os_percent, in_os_count, os_total,
                   libname))
        print("")
    print("")