#!/usr/bin/env python
'''
Example usage:
# Get all negative input
python scrape_input.py --type issues > ~/Desktop/negative.csv;
# Get all mobile negative input
python scrape_input.py --product mobile --type issues > ~/Desktop/mobile_negative.csv;
# Get all positive input
python scrape_input.py --type praise > ~/Desktop/positive.csv;
# Get all idea input
python scrape_input.py --type ideas > ~/Desktop/ideas.csv;
# Get overall input
python scrape_input.py --type all > ~/Desktop/all_input.csv;
# Get overall "crash" input
python scrape_input.py --type all --search crash > ~/Desktop/all_search_crash.csv;
# Get for Firefox 5
python scrape_input.py --version 5.0 > ~/Desktop/5.0_input.csv;
# Get for Firefox 5 mobile crashes
python scrape_input.py --product mobile --version 5.0 --search crash > ~/Desktop/5.0_mobile_crash_input.csv;
# Get for Firefox 7 desktop hangs
python scrape_input.py --dev --version 7.0 --search hang,freeze,responding,pause > ~/Desktop/7.0_hang_input.csv;
'''
import requests
import ast
import datetime
import argparse
import sys
import re
import htmlentitydefs
import csv
# Grabbed from the web
def convertentity(m):
    """Convert one HTML entity regex match to its literal character.

    Expects a match of r'&(#?)(.+?);' where group(1) is '#' for numeric
    character references and group(2) is the entity body. Handles decimal
    (&#65;), hexadecimal (&#x41;) and named (&amp;) entities; anything
    unrecognised is returned verbatim (including the & and ;).
    """
    if m.group(1)=='#':
        ref = m.group(2)
        try:
            # Support both decimal (&#65;) and hex (&#x41; / &#X41;) refs;
            # the original only handled decimal.
            if ref[:1] in ('x', 'X'):
                return chr(int(ref[1:], 16))
            return chr(int(ref))
        except ValueError:
            # Not a valid number (or out of chr() range) -- leave as-is
            return '&#%s;' % ref
    try:
        return htmlentitydefs.entitydefs[m.group(2)]
    except KeyError:
        # Unknown named entity -- leave as-is
        return '&%s;' % m.group(2)
def converthtml(s):
    """Return *s* with every HTML entity replaced by its literal character."""
    entity_pattern = re.compile(r'&(#?)(.+?);')
    return entity_pattern.sub(convertentity, s)
def get_input_data(version, search_version, product, search, DATA, use_dev=False):
    """Fetch feedback chart data from input.mozilla.com and fold it into DATA.

    DATA is keyed as DATA[date][version][feedback_type] -> count; counts for
    the same (date, version, type) accumulate across calls so point releases
    and channels can be grouped under one version. Feedback type names seen
    are also added to the module-level `types` set.
    """
    # Choose the production or development input server
    base_url = ('http://input.allizom.org/en-US/' if use_dev
                else 'http://input.mozilla.com/en-US/')
    response = requests.get(base_url,
                            params={
                                'product': product,
                                'version': search_version,
                                'q': search,
                                'date_start': '2011-01-01'})
    # The chart data is embedded in a data-chart-config HTML attribute
    match = re.search(r'data-chart-config="([^"]+)"', response.content)
    if not match:
        return
    # Strip HTML entities, then parse the attribute as a Python literal
    chart = ast.literal_eval(converthtml(match.group(1)))
    # One series per feedback type (praise/issues/ideas)
    for series in chart['series']:
        feedback_type = series['name'].lower()
        # NOTE: relies on the module-level `types` set
        types.add(feedback_type)
        for point in series['data']:
            # point[0] is a unix timestamp, point[1] the magnitude
            date = datetime.date.fromtimestamp(point[0]).isoformat()
            value = point[1]
            day = DATA.setdefault(date, {})
            counts = day.setdefault(version, {
                'praise': 0,
                'issues': 0,
                'ideas': 0
            })
            # Aggregate with anything already recorded for this type
            counts[feedback_type] = counts.get(feedback_type, 0) + value
# Start main: define and parse the command-line interface
parser = argparse.ArgumentParser(
    description='Scrape metrics from input.mozilla.org')
parser.add_argument('--version', default=None,
                    help='Version of Firefox to focus on')
parser.add_argument('--type', default=None,
                    help='Type of feedback to focus on')
parser.add_argument('--dev', action='store_true', default=False,
                    help='Use the input development server')
parser.add_argument('--product', default='firefox',
                    help='Product to look for (mobile or firefox)')
parser.add_argument('--search', default=None,
                    help='String to search input for. Multiple values can be specified with commas')
parser.add_argument('--verbose', action='store_true', default=False,
                    help='Verbose output')
args = parser.parse_args()
# Accumulators: DATA[date][version][type] -> count, plus the version and
# feedback-type names seen while scraping.
DATA = {}
versions = set()
types = set()
# Get the main page from input; the available versions are embedded in a
# data-versions HTML attribute.
r = requests.get('http://input.mozilla.com/en-US/')
m = re.search(r'data-versions="([^"]+)"', r.content)
# If we found version data
if m:
    # Remove entities and convert into a dictionary
    avail_vers = ast.literal_eval(converthtml(m.group(1)))
    # Loop through all the Firefox versions
    for x in avail_vers['firefox']:
        # Skip anything that doesn't look like a version (no dot in it)
        if x[0].count('.'):
            # Combine all aurora, beta, and point releases under the main
            # major.minor version, e.g. "7.0.1" -> "7.0"
            version = re.sub(r'(\d\.\d).*', '\\1', x[0])
            # If we specified a version and it isn't this one, try the next
            if args.version and not args.version == version:
                continue
            # Store the version for later
            versions.add(version)
            if args.verbose:
                # sys.stderr.write works on both Python 2 and 3, unlike
                # the "print >>" statement this replaces
                sys.stderr.write('Processing %s\n' % x[0])
                sys.stderr.write('Grouping with %s\n' % version)
            if args.search:
                # Support multiple comma-separated search terms by turning
                # them into an OR-style query
                if args.search.count(','):
                    args.search = args.search.replace(',', '|')
                if args.verbose:
                    sys.stderr.write('Searching for %s\n' % args.search)
            # BUG FIX: the old no-search branch dropped args.dev, so --dev
            # was silently ignored unless --search was also given. A single
            # call works for both cases because args.search defaults to None.
            get_input_data(version, x[0], args.product, args.search, DATA, args.dev)
# We're going to write a CSV to stdout
f = csv.writer(sys.stdout,
               delimiter=',')
# --type given: pivot on feedback type, one column per Firefox version
if args.type:
    # Validate the requested type up front with a real exception (the old
    # bare `raise` had no active exception and produced a confusing error)
    if args.type not in ('praise', 'issues', 'ideas', 'all'):
        raise ValueError(
            'Unknown --type %r; expected praise, issues, ideas or all' % args.type)
    # -- Heading: date, one column per version, then the overall total --
    heading = ['date']
    for version in sorted(versions):
        heading.append('%s %s' % (args.product.capitalize(), version))
    heading.append('All')
    f.writerow(heading)
    # -- CSV Data --
    for date in sorted(DATA):
        row = [date]
        total = 0  # renamed from `all`, which shadowed the builtin
        # Each col is a Firefox version as we are pivoting on a type
        for version in sorted(versions):
            try:
                if args.type == 'all':
                    # "all" aggregates every feedback type
                    num = sum(int(DATA[date][version][t])
                              for t in ('praise', 'issues', 'ideas'))
                else:
                    # Specific type specified, just use the value
                    num = DATA[date][version][args.type]
                total = total + num
                # Don't print out zeros
                row.append(num if num else '')
            except KeyError:
                # Missing data for this date/version leaves the value blank
                # (was a bare except that could hide real bugs)
                row.append('')
        row.append(total)
        # Only write the row if there is data
        if total:
            f.writerow(row)
# --version given: pivot on version, one column per feedback type plus ratios
if args.version:
    # Validate with a real exception (the old bare `raise` had no active
    # exception and produced a confusing error)
    if args.version not in versions:
        raise ValueError(
            'Version %r was not found on input.mozilla.com' % args.version)
    # -- Heading: date, one column per type, pairwise ratios, then total --
    heading = ['date']
    # Straight feedback-type columns
    for feedback_type in sorted(types):
        heading.append(feedback_type)
    # Ratio columns, e.g. "issues/praise"
    for type1 in sorted(types):
        for type2 in sorted(types):
            if type1 != type2:
                heading.append('%s/%s' % (type1, type2))
    heading.append('All')
    f.writerow(heading)
    # -- CSV Data --
    for date in sorted(DATA):
        row = [date]
        total = 0  # renamed from `all`, which shadowed the builtin
        # Straight types (feedback_type no longer shadows builtin `type`)
        for feedback_type in sorted(types):
            try:
                row.append(DATA[date][args.version][feedback_type])
                total = total + int(DATA[date][args.version][feedback_type])
            except KeyError:
                # Missing data leaves the value blank (was a bare except)
                row.append('')
        # Ratios between every ordered pair of distinct types
        for type1 in sorted(types):
            for type2 in sorted(types):
                if type1 != type2:
                    try:
                        row.append('%.2f' % (
                            float(DATA[date][args.version][type1]) /
                            float(DATA[date][args.version][type2])))
                    except (KeyError, ZeroDivisionError):
                        # Missing data or a zero denominator leaves it blank
                        row.append('')
        row.append(total)
        # Only write the row if there is data
        if total:
            f.writerow(row)