scrape_input.py
author Christian Legnitto <clegnitto@mozilla.com>
Thu, 19 Jan 2012 21:22:16 -0800
changeset 59 4512337538131d0d7a6d34408d8f336d865d76ba
parent 46 9ed6d1cc4115eae0e586fb93e4a6ecbf1f6e5fa7
permissions -rw-r--r--
Don't include argparse

#!/usr/bin/env python

'''
Example usage:

# Get all negative input
python scrape_input.py --type issues > ~/Desktop/negative.csv;

# Get all mobile negative input
python scrape_input.py --product mobile --type issues > ~/Desktop/mobile_negative.csv;

# Get all positive input
python scrape_input.py --type praise > ~/Desktop/positive.csv;

# Get all idea input
python scrape_input.py --type ideas > ~/Desktop/ideas.csv;

# Get overall input
python scrape_input.py --type all > ~/Desktop/all_input.csv;

# Get overall "crash" input
python scrape_input.py --type all --search crash > ~/Desktop/all_search_crash.csv;

# Get for Firefox 5
python scrape_input.py --version 5.0 > ~/Desktop/5.0_input.csv;

# Get for Firefox 5 mobile crashes
python scrape_input.py --product mobile --version 5.0 --search crash > ~/Desktop/5.0_mobile_crash_input.csv;

# Get for Firefox 7 desktop hangs
python scrape_input.py --dev --version 7.0 --search hang,freeze,responding,pause > ~/Desktop/7.0_hang_input.csv;


'''

import requests
import ast
import datetime
import argparse
import sys
import re
import htmlentitydefs
import csv


# Grabbed from the web
def convertentity(m):
    """Convert a single HTML entity regex match to its literal text.

    *m* is a match of r'&(#?)(.+?);' where group(1) is '#' for numeric
    character references and group(2) is the entity body. Numeric
    references are supported in decimal (&#65;) and hex (&#x41;) form.
    Anything that cannot be decoded is returned unchanged.
    """
    if m.group(1) == '#':
        ref = m.group(2)
        try:
            if ref[:1] in ('x', 'X'):
                # Hex character reference, e.g. &#x27;
                return chr(int(ref[1:], 16))
            return chr(int(ref))
        except ValueError:
            # Not a valid (in-range) number; leave the reference as-is
            return '&#%s;' % ref
    try:
        # Named entity, e.g. &amp; -> '&'
        return htmlentitydefs.entitydefs[m.group(2)]
    except KeyError:
        # Unknown entity name; leave it untouched
        return '&%s;' % m.group(2)

def converthtml(s):
    """Return *s* with every HTML entity expanded to literal text."""
    entity_pattern = r'&(#?)(.+?);'
    return re.sub(entity_pattern, convertentity, s)

def get_input_data(version, search_version, product, search, DATA, use_dev=False):
    """Scrape feedback chart data for one Firefox version and fold it into DATA.

    version        -- canonical major.minor version used as the DATA key
    search_version -- exact version string to query input.mozilla.com with
    product        -- 'firefox' or 'mobile'
    search         -- search string (already pipe-joined), or None
    DATA           -- dict of date -> version -> feedback type -> count,
                      mutated in place
    use_dev        -- query the input development server instead of production

    Also adds every feedback type seen to the module-level `types` set.
    """
    # Choose production or the development server
    if use_dev:
        url = 'http://input.allizom.org/en-US/'
    else:
        url = 'http://input.mozilla.com/en-US/'

    r = requests.get(url,
                     params={
                         'product': product,
                         'version': search_version,
                               'q': search,
                      'date_start': '2011-01-01'})

    # The chart data is embedded in the page as an HTML attribute
    m = re.search(r'data-chart-config="([^"]+)"', r.content)
    if not m:
        # No chart on the page (e.g. no results); nothing to record
        return

    # Remove entities and convert into a dictionary
    webcontent = ast.literal_eval(converthtml(m.group(1)))

    # Loop through each series/type of feedback
    for s in webcontent['series']:

        # Feedback type name ('praise', 'issues', 'ideas', ...)
        feedback_type = s['name'].lower()

        # Record the type in the module-level set for later reporting
        types.add(feedback_type)

        for d in s['data']:

            # d is (unix timestamp, magnitude)
            date = datetime.date.fromtimestamp(d[0]).isoformat()
            value = d[1]

            day = DATA.setdefault(date, {})
            counts = day.setdefault(version, {
                'praise': 0,
                'issues': 0,
                'ideas': 0
            })

            # Aggregate: multiple point releases can map to one version
            counts[feedback_type] = counts.get(feedback_type, 0) + value


# Start main
parser = argparse.ArgumentParser(description='Scrape metrics from input.mozilla.org')

# (flag, keyword arguments) for every command-line option, registered in
# this exact order so the generated --help output is unchanged.
_OPTIONS = [
    ('--version', {'help': 'Version of Firefox to focus on',
                   'default': None}),
    ('--type',    {'help': 'Type of feedback to focus on',
                   'default': None}),
    ('--dev',     {'help': 'Use the input development server',
                   'action': 'store_true',
                   'default': False}),
    ('--product', {'help': 'Product to look for (mobile or firefox)',
                   'default': 'firefox'}),
    ('--search',  {'help': 'String to search input for. Multiple values can be specified with commas',
                   'default': None}),
    ('--verbose', {'help': 'Verbose output',
                   'action': 'store_true',
                   'default': False}),
]
for _flag, _kwargs in _OPTIONS:
    parser.add_argument(_flag, **_kwargs)

args = parser.parse_args()

# Accumulators filled in while scraping
DATA     = {}     # date -> version -> feedback type -> count
versions = set()  # every canonical major.minor version seen
types    = set()  # every feedback type name seen

# Get the main page from input
r = requests.get('http://input.mozilla.com/en-US/')

# Extract the version data embedded as an HTML attribute
m = re.search(r'data-versions="([^"]+)"', r.content)

if not m:
    # Without the version list nothing below can run; previously this
    # fell through and died later with a NameError on avail_vers.
    sys.exit('Could not find version data on the input front page')

# Remove entities and convert into a dictionary
avail_vers = ast.literal_eval(converthtml(m.group(1)))


# Loop through all the Firefox versions
for x in avail_vers['firefox']:

    # Skip anything that doesn't look like a version (no dot)
    if x[0].count('.'):

        # We are going to combine all aurora, beta, and point releases with
        # the main major.minor version (e.g. '5.0.1' and '5.0b2' -> '5.0')
        version = re.sub(r'(\d\.\d).*', '\\1', x[0])

        # If we specified a version and it isn't this one, try
        # the next one
        if args.version and not args.version == version:
            continue

        # Store the version for later
        versions.add(version)

        if args.verbose:
            print >> sys.stderr, 'Processing %s' % x[0]
            print >> sys.stderr, 'Grouping with %s' % version

        if args.search:

            # Support multiple search terms (comma-separated -> OR query)
            if args.search.count(','):
                args.search = args.search.replace(',', '|')

            # Search for the individual terms and aggregate
            if args.verbose:
                print >> sys.stderr, 'Searching for %s' % args.search
            get_input_data(version, x[0], args.product, args.search, DATA, args.dev)

        else:
            # No search terms specified.
            # BUG FIX: args.dev was previously dropped on this path, so
            # --dev without --search silently queried production.
            get_input_data(version, x[0], args.product, None, DATA, args.dev)


# All output is written as comma-separated values on stdout
f = csv.writer(sys.stdout, delimiter=',')

# User has specified a type to look for: pivot on the feedback type, one
# column per Firefox version plus an 'All' total column.
if args.type:

    # Only the known feedback types (or 'all') are valid.
    # Previously this was a bare `raise`, which itself blows up with a
    # confusing TypeError since there is no active exception.
    if args.type not in ['praise', 'issues', 'ideas', 'all']:
        raise ValueError('--type must be one of praise, issues, ideas, all')

    # -- Heading --
    heading = ['date']
    for version in sorted(versions):
        heading.append('%s %s' % (args.product.capitalize(), version))

    # All
    heading.append('All')

    # End the heading line
    f.writerow(heading)

    # -- CSV Data --
    for date in sorted(DATA):

        # Date first
        row = [date]

        # Running total across all versions for this date
        total = 0

        # Each col is a Firefox version as we are pivoting on a type
        for version in sorted(versions):
            try:
                num = 0
                if args.type == 'all':
                    # If we specified "all" types, we need to aggregate
                    for x in ['praise', 'issues', 'ideas']:
                        num = num + int(DATA[date][version][x])
                else:
                    # Specific type specified, just use the value
                    num = DATA[date][version][args.type]

                total = total + num

                # Don't print out zeros
                if not num:
                    num = ''

                # Print the value
                row.append(num)

            except KeyError:
                # No data for this version/date; leave the cell blank
                row.append('')

        # All
        row.append(total)

        # Only write the row if there is data
        if total:
            f.writerow(row)


# User has specified a version to look for: pivot on the version, one
# column per feedback type, plus pairwise ratio columns and an 'All' total.
if args.version:

    # The requested version must have been seen while scraping.
    # Previously this was a bare `raise`, which itself blows up with a
    # confusing TypeError since there is no active exception.
    if args.version not in versions:
        raise ValueError('Version %s not found in scraped data' % args.version)

    # -- Heading --
    heading = ['date']

    # Feedback types
    for feedback_type in sorted(types):
        heading.append(feedback_type)

    # Ratios between every ordered pair of distinct types
    for type1 in sorted(types):
        for type2 in sorted(types):
            if not type1 == type2:
                heading.append('%s/%s' % (type1, type2))

    # Total for types
    heading.append('All')

    # End the heading line
    f.writerow(heading)

    # -- CSV Data --
    for date in sorted(DATA):

        # Date first
        row = [date]

        # Each col is a feedback type as we are pivoting on a version
        total = 0

        # Straight types
        for feedback_type in sorted(types):
            try:
                value = DATA[date][args.version][feedback_type]
            except KeyError:
                # No data for this type/date; leave the cell blank
                row.append('')
            else:
                # Append only after the lookup succeeded so a failure can
                # never leave a half-written cell behind
                row.append(value)
                total = total + int(value)

        # Ratios
        for type1 in sorted(types):
            for type2 in sorted(types):
                if not type1 == type2:
                    try:
                        ratio = (float(DATA[date][args.version][type1]) /
                                 float(DATA[date][args.version][type2]))
                    except (KeyError, ZeroDivisionError):
                        # Missing data or a zero denominator: blank cell
                        row.append('')
                    else:
                        row.append('%.2f' % ratio)

        # All
        row.append(total)

        # Only write the row if there is data
        if total:
            f.writerow(row)