diff --git a/datascraper.py b/datascraper.py index 4221097..155c07d 100755 --- a/datascraper.py +++ b/datascraper.py @@ -50,7 +50,7 @@ from gzip import GzipFile import binascii # local imports -import log_parser +import logparser import mappings try: diff --git a/datascraper.py~ b/datascraper.py~ new file mode 100755 index 0000000..4221097 --- /dev/null +++ b/datascraper.py~ @@ -0,0 +1,532 @@ +#!/usr/bin/env python +#Indentation is 2 spaces ***** DO NOT USE TABS ***** + +# ***** BEGIN LICENSE BLOCK ***** +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is TopFails site code. +# +# The Initial Developer of the Original Code is +# Mozilla foundation +# Portions created by the Initial Developer are Copyright (C) 2010 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Serge Gautherie +# Ted Mielczarek . +# Murali Nandigama +# Jeff Hammel +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ***** END LICENSE BLOCK ***** + +import re, os, sys, urllib, logging +import MySQLdb # Moved from sqlite3 db to MySQL +from time import ctime, sleep, time +from math import ceil +from optparse import OptionParser +from gzip import GzipFile +import binascii + +# local imports +import log_parser +import mappings + +try: + # 'Availability: Unix.' + from time import tzset +except ImportError: + print >>sys.stderr, "WARNING: time.tzset() is not available on non-Unixes!" + + # Define a fake function. (for development use only) + # ToDo: Investigate Windows/etc situation. (Bug 525699) + def tzset(): + pass + +try: + import simplejson as json +except ImportError: + try: + # 'New in version 2.6.' + import json + except ImportError: + print >>sys.stderr, "ERROR: no simplejson nor json package found!" + sys.exit(1) + +from dbschema import CreateDBSchema + +# Number of seconds in a hour: 60 mn * 60 s = 3600 s. +S_IN_H = 3600 +# Download data in 24 hours chunks so as not to overwhelm the tinderbox server. +chunksize = 24 * S_IN_H +# seconds between requests +SLEEP_TIME = 1 + +class BuildStatus(): + # Unavailable builds to skip. + # Values need (only) to be less than the 'Success' one. + NoBuild = -2 + InProgress = -1 + + # Builds to save in the db. + # Do not change these values (without updating db data). + Success = 0 + TestFailed = 1 + Burning = 2 + Exception = 3 + Unknown = 4 + +csetre = re.compile("rev/([0-9A-Za-z]+)") +def FindChangesetInScrape(scrape): + for line in scrape: + m = csetre.search(line) + if m: + return m.group(1) + return None + +buildStatuses = { + # "No build in progress". + "null": BuildStatus.NoBuild, + # "Build in progress". + "building": BuildStatus.InProgress, + # "Successful build". + "success": BuildStatus.Success, + # "Successful build, but tests failed". + "testfailed": BuildStatus.TestFailed, + # "Build failed". + "busted": BuildStatus.Burning, + # "Non-build failure". (i.e. "automation failure") + "exception": BuildStatus.Exception, +} +def BuildStatusFromText(status): + try: + return buildStatuses[status] + except KeyError: + # Log 'Unknown' status failure: this should not happen (unless new statuses are created), but we want to know if it does. + logging.info("WARNING: unknown status = '%s'!" % status) + return BuildStatus.Unknown + +def GetOrInsertTree(conn, tree): + """Get an id for a tree named |tree|. If it's not already in the trees + table, insert a new row and return the id.""" + + conn.execute(""" + SELECT id FROM viewer_tree WHERE name = %s + """, (tree)) + if conn.rowcount > 0: + return conn.fetchone()[0] + + # need to insert it + conn.execute(""" + INSERT INTO viewer_tree (name) VALUES (%s) + """, (tree)) + connection.commit() + return conn.lastrowid + +def GetOrInsertTest(conn, testname): + """Get an id for a test named |testname|. If it's not already in the testnames + table, insert a new row and return the id.""" + + conn.execute(""" + SELECT id FROM viewer_test WHERE name = %s + """, (testname)) + if conn.rowcount > 0: + return conn.fetchone()[0] + + # need to insert it + conn.execute(""" + INSERT INTO viewer_test (name) VALUES (%s) + """, (testname)) + connection.commit() + return conn.lastrowid + +def HaveBuild(conn, treeid, _os, starttime): + """See if we already have this build in our database.""" + conn.execute(""" + SELECT COUNT(*) FROM viewer_build WHERE tree_id = %s AND os = %s AND starttime = %s + """, (treeid, _os, starttime)) + return conn.fetchone()[0] == 1 + +def UpdateLogfile(conn, treeid, _os, starttime, logfile): + """Update empty 'logfile' for a given build (added in db schema v1).""" + conn.execute(""" + UPDATE viewer_build SET logfile = %s WHERE tree_id = %s AND os = %s AND starttime = %s AND logfile IS NULL + """, (logfile, treeid, _os, starttime)) + connection.commit() + +def InsertBuild(conn, treeid, _os, starttime, status, logfile, changeset): + """Insert a build into the builds table and return the id.""" + conn.execute(""" + INSERT INTO viewer_build (tree_id, os, starttime, status, logfile, changeset) VALUES (%s, %s, %s, %s, %s, %s) + """, (treeid, _os, starttime, status, logfile, changeset)) + connection.commit() + return conn.lastrowid + +def HaveFailRecord(conn,buildid, result, testnames_id): + """See if we already have this failRecord in our database.""" + conn.execute(""" + SELECT COUNT(*) FROM viewer_testfailure WHERE build_id = %s AND test_id = %s + """, (buildid, testnames_id)) + return conn.fetchone()[0] == 1 + +def InsertTest(conn, buildid, result, testnames_id, description): + # ToDo: Add column to save result. + conn.execute(""" + INSERT INTO viewer_testfailure (build_id, test_id, failtext) VALUES (%s, %s, %s) + """, (buildid, testnames_id, description)) + connection.commit() +def asciirepl(match): + # replace the hexadecimal characters with ascii characters + s = match.group() + return binascii.unhexlify(s) + +def reformat_content(data): + p = re.compile(r'\\x(\w{2})') + return p.sub(asciirepl, data) + + +def fix_tbox_json(s): + """Fixes up tinderbox json. + + Tinderbox returns strings as single-quoted strings, and occasionally + includes the unquoted substring 'undef' (with quotes) in the output, e.g. + + {'key': 'hello 'undef' world'} + + should return a dictionary + + {'key': 'hello \'undef\' world'} + """ + + json_data = re.sub(r"^tinderbox_data\s*=\s*", "", s) + json_data = re.sub(r";$", "", json_data) + retval = [] + in_str = False + in_esc = False + skip = 0 + for i,c in enumerate(json_data): + # The tinderbox data is a fracked json. and it some times contains + # Control characters. that would totally fail the json.loads step. + # So, eliminate them .. all of them .. here -- Murali + if (c < '\xFD' and c > '\x1F') or c == '\n' or c == '\r' : + if skip > 0: + skip -= 1 + continue + + if in_str: + if in_esc: + if c == "'": + retval.append("'") + else: + retval.append("\\") + retval.append(c) + in_esc = False + elif c == "\\": + in_esc = True + elif c == "\"": + retval.append("\\\"") + elif c == "'": + if json_data[i:i+7] == "'undef'": + retval.append("'undef'") + skip = 7 + else: + retval.append("\"") + in_str = False + else: + retval.append(c) + else: + if c == "'": + retval.append("\"") + in_str = True + else: + retval.append(c) + return "".join(retval) + +parser = OptionParser() +parser.add_option("-s", "--span", action="store", + dest="timespan", default="20d", + help="Period of time to fetch data for (N[y,m,w,d,h], default=%default)") +parser.add_option("-t", "--tree", action="store", + dest="tree", default="Firefox", + help="Tinderbox tree to fetch data from (default=%default)") +parser.add_option("-d", "--database", action="store", + dest="db", default="topfails", + help="Database filename (default=%default)") +parser.add_option("--host", action="store", + dest="dbhost", default="localhost", + help="Database host name (default=%default)") +parser.add_option( "--port", action="store", + dest="dbport",default="3306", + help="Database port (default=%default)") +parser.add_option("-u", "--user", action="store", + dest="dbuser", default="root", + help="Database username (default=%default)") +parser.add_option("-p", "--passwd", action="store", + dest="dbpasswd", + help="Database user password") +parser.add_option("-v", "--verbose", action="store_true", + dest="verbose", default="False", + help="Enable verbose logging") +parser.add_option("--debug", action='store_true', + dest='debug', default=False, + help="enable interactive debugger on exceptions (pdb)") +parser.add_option("--die", action='store_true', + dest='die', default=False, + help="enable application to die on error") +(options, args) = parser.parse_args() + +# check parsed options +if options.tree not in mappings.trees: + parser.error("Unknown tree: '%s'; should be one of [%s]" % (options.tree, ', '.join(mappings.trees))) + +logging.basicConfig(level=options.verbose and logging.DEBUG or logging.WARNING) + +os.environ['TZ'] = "US/Pacific" +tzset() +# Get current time, in seconds. +endtime = int(time()) + +m = re.match("(\d+)([ymwdh])", options.timespan) +if m is None: + print >>sys.stderr, "ERROR: bad timespan = '%s'!" % options.timespan + sys.exit(1) + +timespan = int(m.group(1)) * {'y': 365 * 24 * S_IN_H, + 'm': 30 * 24 * S_IN_H, + 'w': 7 * 24 * S_IN_H, + 'd': 24 * S_IN_H, + 'h': S_IN_H}[m.group(2)] +# Set current time to beginning of requested timespan ending now. +curtime = endtime - timespan + + +createdb=False + + +try: + connection = MySQLdb.connect (host = options.dbhost, + port = int(options.dbport), + db = options.db, + user = options.dbuser, + passwd = options.dbpasswd) + conn=connection.cursor() +except MySQLdb.Error, e: + print "Error %d: %s" % (e.args[0], e.args[1]) + createdb = True + + +if createdb: + connection = MySQLdb.connect (host = options.dbhost, + port = int(options.dbport), + user = options.dbuser, + passwd = options.dbpasswd) + conn = connection.cursor() + try: + createdatabase='create database %s' %(options.db) + conn.execute (createdatabase) + conn.close() + connection.commit() + connection.close() + except MySQLdb.Error, e: + print "Error %d: %s" % (e.args[0], e.args[1]) + sys.exit (1) + try: + connection = MySQLdb.connect (host = options.dbhost, + port = int(options.dbport), + db = options.db, + user = options.dbuser, + passwd = options.dbpasswd) + conn=connection.cursor() + except MySQLdb.Error, e: + print "Error %d: %s" % (e.args[0], e.args[1]) + sys.exit(1) + + CreateDBSchema(conn) + + +treeid = GetOrInsertTree(conn, options.tree) + +logging.info("Reading tinderbox data...") + +chunk = 0 +# add a fudge factor here, since builds can take up to 3 hours to finish, +# and we can't get the changeset unless we ask for time up to the end of the +# build +endtime += 3 * S_IN_H +timespan += 3 * S_IN_H +totalchunks = int(ceil(float(timespan) / chunksize)) + +while curtime < endtime and chunk < totalchunks: + chunk += 1 + logging.info("Chunk %d/%d" % (chunk, totalchunks)) + + if (endtime - curtime) < chunksize: + chunksize = endtime - curtime + + tboxurl = "http://tinderbox.mozilla.org/showbuilds.cgi?tree=%(tree)s&maxdate=%(maxdate)d&noignore=1&hours=%(hours)d&json=1&noignore=1" \ + % {'tree': options.tree, + 'maxdate': curtime + chunksize, # tbox wants the end time + 'hours': int(chunksize / S_IN_H)} + u = urllib.urlopen(tboxurl) + tboxjson = u.read() + #tboxjson = tboxjson.encode('utf-8').decode('string_escape').decode('utf-8') + #tboxjson = ''.join(u.readlines()) + u.close() + + tboxjson = fix_tbox_json(tboxjson) + try: + tboxdata = json.loads(tboxjson) + except Exception, inst: + print >>sys.stderr, "Error parsing JSON: %s" % inst + continue + + # dictionary of parsers + parsers = { + 'check': log_parser.CheckParser, + 'mochitest': log_parser.MochitestParser, + 'reftest': log_parser.ReftestParser, + 'jsreftest': log_parser.ReftestParser, + 'crashtest': log_parser.ReftestParser, + 'xpcshell': log_parser.XPCshellParser, + } + + # regular expression to find the harness + harness_regex = r'.* (%s)(-.*)?' % '|'.join(parsers.keys()) + + # we only care about unit test boxes + unittest_indices = [(logname, index) #tboxdata['build_name_index'][index] + for logname, index in tboxdata['build_name_index'].items() + if re.search("ref|mochi|xpc|check", logname)] + + # 'TestFailed' expected log format is "result | test | optional text". + # testfailedRe = re.compile(r"(TEST-UNEXPECTED-.*) \| (.*) \|(.*)") + # XXX ^ to delete + + # read build table + for timerow in tboxdata['build_table']: + for logname, index in unittest_indices: + if index >= len(timerow) or timerow[index] == -1: + continue + + build = timerow[index] + if 'buildname' not in build or \ + 'logfile' not in build: + continue + + status = BuildStatusFromText(build['buildstatus']) + # Skip unavailable "builds". + if status < BuildStatus.Success: + continue + + name = build['buildname'] + build_name_dict = mappings.parse_build_name(name) + if build_name_dict: + _os = mappings.OS_to_index[build_name_dict['os']] + else: + _os = -1 # UNKNOWN + starttime = int(build['buildtime']) + # skip builds we've already seen + if HaveBuild(conn, treeid, _os, starttime): + logging.info("Skipping already seen build '%s' at %d (%s)" % (name, starttime, ctime(starttime))) + + # Call 'UpdateLogfile()' anyway. + UpdateLogfile(conn, treeid, _os, starttime, build['logfile']) + continue + + # must have scrape data for changeset + if build['logfile'] not in tboxdata['scrape']: + continue + changeset = FindChangesetInScrape(tboxdata['scrape'][build['logfile']]) + if changeset is None: + continue + + buildid = InsertBuild(conn, treeid, _os, starttime, status, build['logfile'], changeset) + + # 'Success' is fine as is. + if status == BuildStatus.Success: + pass + + # Parse log to save 'TestFailed' results. + elif status == BuildStatus.TestFailed : + logging.info("Checking build log for '%s' at %d (%s)" % (name, starttime, ctime(starttime))) + try: + failures = [] + # Grab the build log. + log, headers = urllib.urlretrieve("http://tinderbox.mozilla.org/%s/%s" % (options.tree, build['logfile'])) + gz = GzipFile(log) # I need a list of lines from the build log + + # assured to match because we search for this above + harness_type = re.match(harness_regex, logname).groups()[0] + parser = parsers.get(harness_type, log_parser.LogParser)() + failures = parser.parse(gz) + + # add the failures to the database + for failure in failures: + + # convenience variables; can probably delete + test = failure['test'] + text = failure['text'] + reason = failure['reason'] + + testnames_id=GetOrInsertTest(conn,test) + if HaveFailRecord(conn,buildid, reason, testnames_id): + logging.info("Skipping already recorded failure '%s' in build with id '%s' with failure record '%s' " % (test, buildid, text)) + else: + InsertTest(conn, buildid, reason, testnames_id, text) + + except Exception, e: + errstring = "Unexpected error: %s" % e + if options.debug: + print errstring + import pdb; pdb.set_trace() + elif options.die: + raise + else: + logging.error(errstring) + + # Ignore 'Burning' builds: tests may have run nontheless, but it's safer to discard them :-| + elif status == BuildStatus.Burning: + continue + + # Ignore 'Exception' builds: should only be worse than 'Burning'. + # (Don't know much at time of writing, since this feature is not active yet: see bug 476656 and follow-ups.) + elif status == BuildStatus.Exception: + continue + + # Save 'Unknown' status failure: this should not happen (unless new statuses are created), but we want to know if it does. + elif status == BuildStatus.Unknown: + # Add a fake test failure. + InsertTest(conn, buildid, "TEST-UNEXPECTED-FAIL", "99999999999", "Unknown status = '%s'!" % build['buildstatus']) + continue + + + + if chunk < totalchunks: + sleep(SLEEP_TIME) + curtime += chunksize + +conn.close() +connection.commit() +connection.close() +logging.info("Done") diff --git a/log_parser.py b/logparser.py similarity index 100% rename from log_parser.py rename to logparser.py diff --git a/topfails/__init__.pyc b/topfails/__init__.pyc new file mode 100644 index 0000000..000b5e1 Binary files /dev/null and b/topfails/__init__.pyc differ diff --git a/topfails/settings.pyc b/topfails/settings.pyc new file mode 100644 index 0000000..b3750af Binary files /dev/null and b/topfails/settings.pyc differ