"""Experiment: train a classifier that maps Treeherder error lines to
bug numbers, using structured failure-line fields and message tokens
(plus context from preceding lines) as features."""

import cPickle
import os
import re
from collections import defaultdict

import numpy
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import SGDClassifier

from treeherder.model.models import Job

def feature_dict(error_lines):
    """Yield (line, features) for each error line, where features maps
    feature names to values: structured fields from the failure line (or
    fields recovered from the raw text), token counts from the message,
    and the features of up to five preceding lines as context."""
    prev = []
    for line in error_lines:
        print "line %s" % line.line
        features = {"test": None,
                    "subtest": None,
                    "action": None,
                    "status": None,
                    "expected": None,
                    "level": None}
        if line.failure_line:
            # Structured failure line: copy its fields directly and count
            # tokens from the signature (preferred) or the message.
            failure_line = line.failure_line
            for key in features.iterkeys():
                features[key] = getattr(failure_line, key)
            if failure_line.signature:
                features.update(count(tokenize_message(failure_line.signature)))
            elif failure_line.message:
                features.update(count(tokenize_message(failure_line.message)))
        else:
            # Unstructured line: recover what we can with regexes.
            level, status, test, message = split_unstructured(line.line)
            features["level"] = level
            features["status"] = status
            features["test"] = test
            features.update(count(tokenize_message(message)))
        # Drop unset features rather than passing None to the vectorizer.
        for key in features.keys():
            if features[key] is None:
                del features[key]
        # Snapshot this line's own features before any context keys are
        # merged in, so later lines see a clean context window.
        prev.append(features.copy())
        rev_prev = reversed(prev)
        # Skip the current line's own snapshot.
        try:
            rev_prev.next()
        except StopIteration:
            pass
        # Merge in the features of up to five preceding lines, suffixing
        # each key with its distance from the current line.
        for i in xrange(5):
            try:
                prev_features = rev_prev.next()
            except StopIteration:
                break
            for key, value in prev_features.iteritems():
                if key.startswith("token:"):
                    new_key = "token_%i:%s" % (i, key.split(":", 1)[1])
                else:
                    new_key = "%s_%i" % (key.split("_", 1)[0], i)
                features[new_key] = value
        print features
        yield line, features

re_split = re.compile(r"\s+")
re_non_word = re.compile(r"^\W+$")
re_ip = re.compile(r"\W*\d{1,3}(?:\.\d{1,3}){3}\W*")
re_hex = re.compile(r"\W*0x[0-9a-fA-F]+\W*")
re_digits = re.compile(r"\W*\d+\W*")

def tokenize_message(message):
    """Split the first line of a message on whitespace, dropping pure
    punctuation and replacing IP addresses, hex values, and runs of digits
    with placeholder tokens."""
    lines = message.splitlines()
    if not lines:
        return []
    tokens = re_split.split(lines[0])
    rv = []
    for token in tokens:
        if re_non_word.match(token):
            continue
        for regexp, replacement in [(re_ip, "<ip_address>"),
                                    (re_hex, "<hex>"),
                                    (re_digits, "<digits>")]:
            if regexp.match(token):
                token = regexp.sub(replacement, token)
                break
        rv.append(token)
    return rv

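# For illustration, with a made-up message:
#   tokenize_message("Assertion failure at 0x7fff123 from 192.168.0.1")
#     -> ["Assertion", "failure", "at", "<hex>", "from", "<ip_address>"]
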
def count(tokens):
    """Return a mapping from "token:<tok>" to its number of occurrences."""
    rv = defaultdict(int)
    for item in tokens:
        rv["token:%s" % item] += 1
    return rv

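# e.g. count(["foo", "foo", "<hex>"]) gives counts
#   {"token:foo": 2, "token:<hex>": 1}
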
status_re = re.compile(r".*TEST-UNEXPECTED-(\w+)")
log_re = re.compile(r".*(DEBUG|INFO|WARNING|ERROR|CRITICAL|FATAL)")
# Strip an optional leading timestamp like "12:34:56" and capture the rest.
re_unstructured = re.compile(r"\s*(?:(?:\d\d:?){3})?\s*(.*)")

def split_unstructured(line):
    """Split an unstructured log line into (level, status, test, message),
    with None for any field that can't be recovered."""
    parts = line.split(" | ", 3)
    if len(parts) == 3:
        # "<status or log level> | <test> | <message>"
        level = None
        status = None
        if "TEST-UNEXPECTED" in parts[0]:
            m = status_re.match(parts[0])
            if m:
                status = m.group(1)
        elif "CRASH" in parts[0]:
            status = "CRASH"
        else:
            m = log_re.match(parts[0])
            if m:
                level = m.group(1)
        test = parts[1]
        message = parts[2]
    else:
        level = None
        status = None
        test = None
        message = re_unstructured.match(line).group(1)
    return level, status, test, message

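# For illustration, the two branches behave roughly like this:
#   split_unstructured("TEST-UNEXPECTED-FAIL | test_foo.html | assertion failed")
#     -> (None, "FAIL", "test_foo.html", "assertion failed")
#   split_unstructured("12:34:56 Main process exited")
#     -> (None, None, None, "Main process exited")
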
def get_data(jobs, targets=False):
    """Collect feature dicts for every error line in the given jobs and,
    if targets is True, the matching bug numbers (0 for no classification)."""
    out_features = []
    if targets:
        out_targets = []
    for job in jobs:
        for step in job.steps.all():
            errors = step.errors.all()
            if not len(errors):
                continue
            for line, features in feature_dict(errors):
                # Skip lines whose best classification has no bug number.
                # This probably isn't the right way to do this
                if line.best_classification and line.best_classification.bug_number is None:
                    continue
                out_features.append(features)
                if targets:
                    out_targets.append(line.best_classification.bug_number
                                       if line.best_classification
                                       else 0)
    if targets:
        return out_features, numpy.array(out_targets, dtype=numpy.int64)
    else:
        return out_features

def train(jobs):
    """Vectorize the training features and fit a linear SVM via SGD."""
    vectorizer = DictVectorizer()
    features, targets = get_data(jobs, True)
    print "Num training samples %s" % len(features)
    X_train = vectorizer.fit_transform(features)
    # n_iter was the epoch-count parameter in scikit-learn releases of this
    # era; newer releases renamed it to max_iter.
    clf = SGDClassifier(loss='hinge', penalty='l2',
                        alpha=1e-3, n_iter=5, random_state=42).fit(X_train, targets)
    return vectorizer, clf

def test(vectorizer, clf, jobs):
    """Predict a bug number for each line in the test jobs and record it
    next to the verified classification."""
    rv = []
    features, expected = get_data(jobs, True)
    for test_features, expected_bug in zip(features, expected):
        X = vectorizer.transform([test_features])
        predicted = clf.predict(X)[0]
        rv.append({"features": test_features,
                   "expected": expected_bug,
                   "actual": predicted})
    return rv

def score(data):
    """Summarize prediction accuracy over the results from test()."""
    total = len(data)
    correct = sum(1 for x in data if x["actual"] == x["expected"])
    incorrect = total - correct
    ratio = float(correct) / total
    return {"total": total,
            "correct": correct,
            "incorrect": incorrect,
            "ratio": ratio}

def get_input():
    """Fetch jobs that have at least one verified error classification and
    split them in half (ordered by id) into training and test sets."""
    all_jobs = (Job.objects
                .filter(steps__errors__best_is_verified=True)
                .prefetch_related("steps",
                                  "steps__errors",
                                  "steps__errors__best_classification")
                .order_by('id'))
    train_data = all_jobs[:len(all_jobs) / 2]
    test_data = all_jobs[len(all_jobs) / 2:]
    return train_data, test_data

def main():
    # Cache the database query results on disk so repeated runs are fast.
    if not os.path.exists("_ml_cache"):
        jobs_train, jobs_test = get_input()
        with open("_ml_cache", "wb") as f:
            cPickle.dump((jobs_train, jobs_test), f)
    else:
        with open("_ml_cache", "rb") as f:
            jobs_train, jobs_test = cPickle.load(f)
    vectorizer, clf = train(jobs_train)
    results = test(vectorizer, clf, jobs_test)
    print score(results)


if __name__ == "__main__":
    main()