Last active
April 21, 2016 01:25
-
-
Save tkaemming/303f0761723801f350b92345a53a46e2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.pyc |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools | |
import pprint | |
import json | |
import sys | |
import zlib | |
from collections import ( | |
defaultdict, | |
Mapping, | |
Sequence, | |
Set, | |
) | |
# Module-level sentinel object.
unset = object()

# Flags stored as the first element of every encoded node; they tell the
# decoder how to interpret the remaining elements.
(
    TYPE_REFERENCE,
    TYPE_VALUE,
    TYPE_MAPPING,
    TYPE_SEQUENCE,
) = (0, 1, 2, 3)
class Encoder(object):
    """Deduplicating encoder that turns values into references to a shared table.

    Every distinct ``(flag, type name, encoded value, annotations)`` signature
    passed through :meth:`encode` gets a string identifier from a running
    counter, together with a count of how often that signature was seen.
    """

    def __init__(self):
        sequence = itertools.count()

        def make_value():
            # First sighting of a signature: next identifier, zero uses so far.
            return str(next(sequence)), 0

        # signature -> (identifier, number of times it has been encoded)
        self.__identifiers = defaultdict(make_value)

    def bindings(self):
        """Return ``{identifier: (flag, type name, value, annotations)}``.

        The annotations frozenset of each signature is converted to a tuple.
        """
        result = {}
        for signature, (identifier, _) in self.__identifiers.items():
            flag, type_name, value, annotations = signature
            result[identifier] = (flag, type_name, value, tuple(annotations))
        return result

    def statistics(self):
        """Return ``{identifier: count}`` for every signature seen so far."""
        return dict(self.__identifiers.values())

    def encode(self, value, preprocess=None, postprocess=None):
        """Encode ``value`` and return a ``(TYPE_REFERENCE, identifier)`` pair.

        Mappings and non-string sequences/sets are encoded recursively as
        tuples of references; everything else becomes a scalar node.

        ``preprocess`` and ``postprocess`` are optional callables invoked as
        ``callback(value, key)`` that must return ``(value, annotations)``.
        ``preprocess`` sees the raw value before it is classified,
        ``postprocess`` sees the encoded value.  ``key`` is the raw mapping
        key when the value belongs to a mapping entry and ``None`` otherwise.
        Both annotation sets are merged into the node's signature.
        """
        def passthrough(value, key=None):
            return value, set()

        if preprocess is None:
            preprocess = passthrough
        if postprocess is None:
            postprocess = passthrough

        identifiers = self.__identifiers

        def process(value, key=None):
            # Capture the type name before preprocess can replace the value.
            type_name = type(value).__name__

            value, updates = preprocess(value, key)
            annotations = frozenset() | updates

            if isinstance(value, Mapping):
                flag = TYPE_MAPPING
                value = tuple(
                    (process(k), process(v, k)) for k, v in value.items()
                )
            elif isinstance(value, basestring):
                # Text is stored untouched: calling str() on a non-ASCII
                # unicode string raises UnicodeEncodeError on Python 2.
                flag = TYPE_VALUE
            elif isinstance(value, (Sequence, Set)):
                flag = TYPE_SEQUENCE
                value = tuple(process(item) for item in value)
            else:
                flag = TYPE_VALUE
                value = str(value)

            value, updates = postprocess(value, key)
            annotations = annotations | updates

            signature = (flag, type_name, value, annotations)
            identifier, count = identifiers[signature]
            identifiers[signature] = (identifier, count + 1)
            return (TYPE_REFERENCE, identifier)

        return process(value)
def decode(pair):
    """Expand an encoded node back into plain Python containers.

    ``pair`` is a two-item ``(bindings, value)`` sequence, passed as a single
    positional argument: ``bindings`` maps identifiers to encoded nodes and
    ``value`` is the node to expand.

    References are resolved through ``bindings``, mapping nodes become dicts
    and sequence nodes become lists.  Scalar nodes return their stored value
    unchanged; the recorded type name is not used to convert it back.

    Raises ``AssertionError`` if a node carries an unknown flag.
    """
    bindings, value = pair

    def process(node):
        node = list(node)
        flag, fields = node[0], node[1:]

        if flag == TYPE_REFERENCE:
            identifier, = fields
            return process(bindings[identifier])
        if flag == TYPE_VALUE:
            _, stored, _ = fields
            return stored
        if flag == TYPE_MAPPING:
            _, pairs, _ = fields
            return {process(k): process(v) for (k, v) in pairs}
        if flag == TYPE_SEQUENCE:
            _, items, _ = fields
            return [process(item) for item in items]
        raise AssertionError('unexpected flag')

    return process(value)
def rewrite(encoder, payload):
    """Replace the variable-heavy parts of ``payload`` with encoded references.

    ``payload['extra']`` and the ``vars`` of every frame under
    ``payload['sentry.interfaces.Exception']['values'][*]['stacktrace']['frames']``
    are passed through ``encoder.encode``; a frame without ``vars`` gets the
    encoding of an empty dict.  ``payload`` is modified in place.

    Returns the result of ``inline`` applied to
    ``{'data': payload, 'bindings': encoder.bindings()}``.
    """
    def rewrite_frame(frame):
        frame['vars'] = encoder.encode(frame.get('vars', {}))
        return frame

    def rewrite_stacktrace(stacktrace):
        # Build real lists (not lazy map objects) so the result can be
        # serialised with json.dumps.
        stacktrace['frames'] = [
            rewrite_frame(frame) for frame in stacktrace['frames']
        ]
        return stacktrace

    def rewrite_exception(exception):
        exception['stacktrace'] = rewrite_stacktrace(exception['stacktrace'])
        return exception

    payload['extra'] = encoder.encode(payload['extra'])

    exceptions = payload['sentry.interfaces.Exception']
    exceptions['values'] = [
        rewrite_exception(exception) for exception in exceptions['values']
    ]

    return inline(encoder, {
        'data': payload,
        # Taken from the encoder after all encoding above has happened.
        'bindings': encoder.bindings(),
    })
def inline(encoder, data):
    """Fold bindings that are used exactly once into the binding that uses them.

    ``data['bindings']`` maps identifiers to encoded nodes and
    ``encoder.statistics()`` supplies the use count per identifier.  Inside
    every binding, a reference found as a sequence item or as a mapping
    *value* whose target has a count of 1 is replaced by the target node
    itself, and the target is then removed from the table.  Mapping keys are
    left as references.

    ``data['bindings']`` is updated in place; ``data['data']`` is not touched.
    Returns ``data``.
    """
    statistics = encoder.statistics()
    bindings = data['bindings']
    remove = set()

    def inline_node(node):
        node = tuple(node)
        flag, fields = node[0], node[1:]

        if flag == TYPE_REFERENCE:
            identifier, = fields
            if statistics[identifier] == 1:
                remove.add(identifier)
                # Inline the target recursively.  Returning the raw binding
                # would keep its references to other single-use bindings,
                # which dangle once those bindings are removed below.
                return inline_node(bindings[identifier])
            return node

        if flag == TYPE_SEQUENCE:
            type_name, items, annotations = fields
            inlined = tuple(inline_node(item) for item in items)
            return flag, type_name, inlined, annotations

        if flag == TYPE_MAPPING:
            type_name, pairs, annotations = fields
            inlined = tuple((key, inline_node(item)) for key, item in pairs)
            return flag, type_name, inlined, annotations

        return node

    # Snapshot the keys: entries are reassigned while walking the table.
    for key in list(bindings):
        bindings[key] = inline_node(bindings[key])

    for identifier in remove:
        bindings.pop(identifier)

    return data
def dump(name):
    """Rewrite the JSON document in file ``name`` and print it to stdout.

    The document is passed through ``rewrite`` with a fresh ``Encoder`` and
    written as JSON with an indent of 2, followed by a newline.
    """
    encoder = Encoder()
    # Context manager so the file handle is closed once parsing is done.
    with open(name) as source:
        payload = json.load(source)
    sys.stdout.write(json.dumps(rewrite(encoder, payload), indent=2))
    sys.stdout.write('\n')
def stats(*names):
    """Print one tab-separated line of size statistics per file in ``names``.

    Columns: file name, original JSON length, rewritten JSON length, their
    ratio, zlib-compressed original length, zlib-compressed rewritten length,
    and the ratio of the two compressed lengths.
    """
    def compressed_size(text):
        # json.dumps emits ASCII-only text by default, so encoding neither
        # fails nor changes the length; zlib needs bytes on Python 3.
        return len(zlib.compress(text.encode('utf-8')))

    for name in names:
        with open(name) as source:
            sys.stdout.write('{}\t'.format(name))
            payload = json.load(source)

        # Serialise the untouched document first: rewrite() modifies the
        # payload in place.
        original = json.dumps(payload)
        processed = json.dumps(rewrite(Encoder(), payload))

        original_compressed = compressed_size(original)
        processed_compressed = compressed_size(processed)

        sys.stdout.write('{}\t{}\t{:.2%}\t{}\t{}\t{:.2%}\n'.format(
            len(original),
            len(processed),
            float(len(processed)) / len(original),
            original_compressed,
            processed_compressed,
            float(processed_compressed) / original_compressed,
        ))
if __name__ == '__main__':
    # The first command-line argument names the sub-command; every remaining
    # argument is forwarded to it positionally.
    commands = {'dump': dump, 'stats': stats}
    command = commands[sys.argv[1]]
    command(*sys.argv[2:])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pprint | |
import json | |
import sys | |
import zlib | |
from bindings import Encoder, inline | |
# Sentinel marking "nothing here": it is the pop() default in
# MappingSplitter.split and the left-hand result of ValueExtractor.split.
# It is always compared by identity.
unset = object()
class MappingSplitter(object):
    """Split a dict into a ``(left, right)`` pair using per-key child splitters.

    ``splitters`` maps keys to objects providing ``split(value)`` (returning
    a two-item ``(left, right)`` result) and ``combine(left, right)``.
    """

    def __init__(self, splitters):
        self.splitters = splitters

    def split(self, value):
        """Return ``(left, right)`` for the dict ``value``.

        ``left`` starts as a shallow copy of ``value``.  Every configured key
        present in it is handed to its child splitter: the child's left
        result replaces the entry (or the entry is dropped when the child
        returns ``unset``), and the child's right result is stored in
        ``right``.  Keys without a splitter stay in ``left`` untouched.
        """
        left = value.copy()
        right = {}
        for key, splitter in self.splitters.items():
            item = left.pop(key, unset)
            if item is unset:
                continue
            kept, extracted = splitter.split(item)
            if kept is not unset:
                left[key] = kept
            right[key] = extracted
        return left, right

    def combine(self, left, right):
        """Merge a ``(left, right)`` pair back into a single dict.

        Starts from a shallow copy of ``left``.  Every key in ``right`` is
        set to its child splitter's ``combine(left.get(key), right_value)``.
        Raises ``KeyError`` if ``right`` holds a key without a splitter.
        """
        result = left.copy()
        for key, value in right.items():
            result[key] = self.splitters[key].combine(left.get(key), value)
        return result
class SequenceSplitter(object):
    """Apply a single child splitter to every item of a sequence."""

    def __init__(self, splitter):
        self.splitter = splitter

    def split(self, value):
        """Split each item of ``value``.

        Returns two lists of equal length holding the child splitter's left
        and right results, in item order.
        """
        halves = [self.splitter.split(item) for item in value]
        left = [first for first, _ in halves]
        right = [second for _, second in halves]
        return left, right

    def combine(self, left, right):
        """Recombine two equally long sequences item by item."""
        assert len(left) == len(right)
        return [
            self.splitter.combine(first, second)
            for first, second in zip(left, right)
        ]
class ValueExtractor(object):
    """Leaf splitter that moves a whole value to the right-hand side.

    ``encoder`` is applied to the value when splitting; by default it
    returns its argument unchanged.
    """

    def __init__(self, encoder=lambda i: i):
        self.encoder = encoder

    def split(self, value):
        """Return ``(unset, encoder(value))``: nothing stays on the left."""
        extracted = self.encoder(value)
        return unset, extracted

    def combine(self, left, right):
        """Return ``right`` unchanged; ``left`` is ignored."""
        return right
def rewrite(data):
    """Split ``data`` into a ``(shared, unique)`` pair.

    A fresh ``Encoder`` and a tree of splitters are built for every call.
    The listed keys are extracted from ``data``; ``tags``, ``extra``, message
    ``params``, frame ``vars`` and the Http/User interfaces are additionally
    passed through ``encoder.encode``.

    ``shared`` is the left result of the split (what no extractor claimed).
    ``unique`` is the result of ``inline`` applied to a dict holding the
    encoder's ``bindings`` and the right result of the split under ``data``.
    """
    encoder = Encoder()
    encode = encoder.encode

    # Nested splitters, built from the innermost level outwards.
    frame_splitter = MappingSplitter({
        'vars': ValueExtractor(encode),
    })
    stacktrace_splitter = MappingSplitter({
        'frames': SequenceSplitter(frame_splitter),
    })
    exception_splitter = MappingSplitter({
        'values': SequenceSplitter(
            MappingSplitter({
                'stacktrace': stacktrace_splitter,
            })
        ),
    })
    message_splitter = MappingSplitter({
        'params': ValueExtractor(encode),
    })

    splitter = MappingSplitter({
        'id': ValueExtractor(),
        'project': ValueExtractor(),
        'release': ValueExtractor(),
        'message': ValueExtractor(),
        'datetime': ValueExtractor(),
        'tags': ValueExtractor(encode),
        'extra': ValueExtractor(encode),
        'received': ValueExtractor(),
        'modules': ValueExtractor(),
        'sentry.interfaces.Message': message_splitter,
        'sentry.interfaces.Exception': exception_splitter,
        'sentry.interfaces.http.Http': ValueExtractor(encode),
        'sentry.interfaces.user.User': ValueExtractor(encode),
    })

    shared, unique = splitter.split(data)
    bundle = {
        'bindings': encoder.bindings(),
        'data': unique,
    }
    return shared, inline(encoder, bundle)
def dump(name='/dev/stdin'):
    """Rewrite the JSON document in ``name`` and print both halves to stdout.

    The ``shared`` and ``unique`` results of ``rewrite`` are each written as
    JSON with an indent of 2, followed by a newline.  Reads ``/dev/stdin``
    when no name is given.
    """
    # Context manager so the file handle is closed once parsing is done.
    with open(name) as source:
        payload = json.load(source)
    shared, unique = rewrite(payload)
    for part in (shared, unique):
        sys.stdout.write(json.dumps(part, indent=2))
        sys.stdout.write('\n')
def stats(*names):
    """Print one line of size statistics per file in ``names``.

    After the file name come two groups of five columns, first for plain
    JSON lengths and then for zlib-compressed lengths: original size,
    shared size, shared/original ratio, unique size, unique/original ratio.
    """
    def zlen(text):
        # json.dumps emits ASCII-only text by default, so encoding neither
        # fails nor changes the length; zlib needs bytes on Python 3.
        return len(zlib.compress(text.encode('utf-8')))

    template = ' '.join(['{}\t{}\t{:.2%}\t{}\t{:.2%}'] * 2) + '\n'

    for name in names:
        with open(name) as source:
            sys.stdout.write('{}\t'.format(name))
            data = json.load(source)

        original = json.dumps(data)
        shared, unique = map(json.dumps, rewrite(data))

        # Compress each document once and reuse the sizes below.
        original_z = zlen(original)
        shared_z = zlen(shared)
        unique_z = zlen(unique)

        sys.stdout.write(
            template.format(
                len(original),
                len(shared),
                float(len(shared)) / len(original),
                len(unique),
                float(len(unique)) / len(original),
                original_z,
                shared_z,
                float(shared_z) / original_z,
                unique_z,
                float(unique_z) / original_z,
            ),
        )
if __name__ == '__main__':
    # The first command-line argument names the sub-command; every remaining
    # argument is forwarded to it positionally.
    commands = {'dump': dump, 'stats': stats}
    command = commands[sys.argv[1]]
    command(*sys.argv[2:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment