tkaemming · April 21, 2016 01:25
diff --git a/.gitignore b/.gitignore
 *.pyc
diff --git a/__init__.py b/__init__.py
diff --git a/bindings.py b/bindings.py
 import itertools
 import pprint
 import json
 import sys
 import zlib
 from collections import (
    defaultdict,
    Mapping,
    Sequence,
    Set,
 )


 # Unique sentinel object.  Nothing in the visible part of this module reads it.
 unset = object()


 # Flags stored as the first element of every encoded tuple.
 TYPE_REFERENCE = 0  # (flag, identifier): points at an entry of the bindings table
 TYPE_VALUE = 1  # leaf value (neither a mapping nor a sequence/set)
 TYPE_MAPPING = 2  # payload is a tuple of (key, value) pairs
 TYPE_SEQUENCE = 3  # payload is a tuple of items (sequences and sets)


 class Encoder(object):
    """Deduplicating encoder that turns nested values into reference tuples.

    Every distinct ``(flag, type name, value, annotations)`` signature seen by
    :meth:`encode` is given a string identifier (a counter starting at
    ``"0"``) and a usage count.  :meth:`encode` only ever returns
    ``(TYPE_REFERENCE, identifier)`` tuples; the signatures themselves are
    exposed through :meth:`bindings` and the counts through
    :meth:`statistics`.
    """

    # String types that must be treated as leaves rather than sequences.
    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
    try:
        _string_types = (basestring,)
    except NameError:
        _string_types = (str,)

    def __init__(self):
        sequence = itertools.count()

        def make_value():
            # Called by the defaultdict the first time a signature is looked
            # up: hand out the next identifier with a usage count of zero.
            identifier = str(next(sequence))
            return identifier, 0

        # Maps signature -> (identifier, usage count).
        self.__identifiers = defaultdict(make_value)

    def bindings(self):
        """Return ``{identifier: (flag, type name, value, annotations)}``.

        The annotations of each signature are converted from a frozenset to a
        tuple.
        """
        table = {}
        for signature, (identifier, _) in self.__identifiers.items():
            flag, type_name, value, annotations = signature
            table[identifier] = (flag, type_name, value, tuple(annotations))
        return table

    def statistics(self):
        """Return ``{identifier: usage count}`` for every known signature."""
        return dict(self.__identifiers.values())

    def encode(self, value, preprocess=None, postprocess=None):
        """Encode ``value`` and return a ``(TYPE_REFERENCE, identifier)`` tuple.

        ``preprocess`` and ``postprocess`` are optional hooks called as
        ``hook(value, key)`` and must return ``(value, annotations)`` where
        ``annotations`` is a set.  ``preprocess`` sees the raw value,
        ``postprocess`` sees the already converted one (a tuple of references
        or a leaf).  ``key`` is the mapping key a value was stored under, and
        ``None`` for mapping keys, sequence items and the top-level value.
        """
        if preprocess is None:
            preprocess = lambda value, key=None: (value, set())

        if postprocess is None:
            postprocess = lambda value, key=None: (value, set())

        string_types = self._string_types

        def process(value, key=None):
            # Captured before the hooks run, so it names the original type.
            type_name = type(value).__name__

            value, updates = preprocess(value, key)
            annotations = frozenset() | updates

            if isinstance(value, Mapping):
                flag = TYPE_MAPPING
                value = tuple((process(k), process(v, k)) for k, v in value.items())
            elif isinstance(value, (Sequence, Set)) and not isinstance(value, string_types):
                flag = TYPE_SEQUENCE
                value = tuple(process(v) for v in value)
            else:
                flag = TYPE_VALUE
                # Strings are kept untouched: on Python 2, ``str()`` of a
                # non-ASCII ``unicode`` value raises UnicodeEncodeError.
                if not isinstance(value, string_types):
                    value = str(value)

            value, updates = postprocess(value, key)
            annotations = annotations | updates

            signature = (flag, type_name, value, annotations)
            identifier, count = self.__identifiers[signature]
            self.__identifiers[signature] = (identifier, count + 1)
            return (TYPE_REFERENCE, identifier)

        return process(value)

 def decode(payload):
    """Rebuild plain Python data from an encoded ``(bindings, value)`` pair.

    ``payload`` is a two-item sequence: the bindings table (identifier ->
    encoded tuple) and the encoded value to decode.  References are resolved
    through the table, mappings become dicts and sequences become lists.
    Leaf values are returned exactly as stored, so the recorded type name is
    not used to restore the original type.

    Raises ``AssertionError`` when an encoded tuple carries an unknown flag.
    """
    # Unpacked here rather than in the signature: tuple parameters are
    # Python-2-only syntax.
    bindings, value = payload

    def process(encoded):
        fields = list(encoded)
        flag = fields.pop(0)
        if flag == TYPE_REFERENCE:
            identifier, = fields
            return process(bindings[identifier])
        elif flag == TYPE_VALUE:
            _type_name, content, _annotations = fields
            return content
        elif flag == TYPE_MAPPING:
            _type_name, content, _annotations = fields
            return {process(k): process(v) for (k, v) in content}
        elif flag == TYPE_SEQUENCE:
            _type_name, content, _annotations = fields
            return [process(item) for item in content]
        else:
            raise AssertionError('unexpected flag')

    return process(value)


 def rewrite(encoder, payload):
    """Encode the variable data of an event ``payload`` and inline the result.

    ``payload['extra']`` and the ``vars`` of every frame of every entry in
    ``payload['sentry.interfaces.Exception']['values']`` are replaced by the
    references returned from ``encoder.encode``.  ``payload`` is modified in
    place.  A frame without ``vars`` gets the encoding of an empty dict.

    Returns the result of ``inline`` applied to
    ``{'data': payload, 'bindings': encoder.bindings()}``.

    Raises ``KeyError`` when ``extra``, the exception interface, or a
    ``stacktrace``/``frames`` entry is missing.
    """
    def rewrite_frame(frame):
        frame['vars'] = encoder.encode(frame.get('vars', {}))
        return frame

    def rewrite_stacktrace(stacktrace):
        # A real list (not a lazy ``map`` object) so the result stays
        # JSON-serialisable on every Python version.
        stacktrace['frames'] = [rewrite_frame(frame) for frame in stacktrace['frames']]
        return stacktrace

    def rewrite_exception(exception):
        exception['stacktrace'] = rewrite_stacktrace(exception['stacktrace'])
        return exception

    payload['extra'] = encoder.encode(payload['extra'])
    exceptions = payload['sentry.interfaces.Exception']
    exceptions['values'] = [rewrite_exception(item) for item in exceptions['values']]

    return inline(encoder, {
        'data': payload,
        # Taken from the encoder only after every encode() call above has run.
        'bindings': encoder.bindings(),
    })


 def inline(encoder, data):
    """Fold bindings that are used exactly once into the entry that uses them.

    ``data`` is a dict holding a ``'bindings'`` table (identifier -> encoded
    tuple).  Inside sequence items and mapping values, a reference whose
    identifier has a usage count of 1 in ``encoder.statistics()`` is replaced
    by the referenced entry itself, and that entry is then dropped from the
    table.  Mapping keys are left as references and ``data['data']`` is not
    touched.

    The table is modified in place and ``data`` is returned.
    """
    statistics = encoder.statistics()
    bindings = data['bindings']

    remove = set()

    def expand(encoded):
        fields = list(encoded)
        flag = fields.pop(0)
        if flag == TYPE_REFERENCE:
            identifier, = fields
            if statistics[identifier] == 1:
                remove.add(identifier)
                # Expand the target as well: it may not have been visited
                # yet, and its own single-use children are about to be
                # removed from the table.
                return expand(bindings[identifier])
        elif flag == TYPE_SEQUENCE:
            type_name, items, annotations = fields
            return flag, type_name, tuple(expand(item) for item in items), annotations
        elif flag == TYPE_MAPPING:
            type_name, pairs, annotations = fields
            pairs = tuple((key, expand(item)) for key, item in pairs)
            return flag, type_name, pairs, annotations
        return (flag,) + tuple(fields)

    # Iterate over a snapshot of the keys because entries are reassigned.
    for key in list(bindings):
        bindings[key] = expand(bindings[key])

    for identifier in remove:
        bindings.pop(identifier)

    return data


 def dump(name):
    """Rewrite the JSON event stored in file ``name`` and print the result.

    The output is written to stdout as JSON indented by two spaces, followed
    by a newline.
    """
    encoder = Encoder()
    # Context manager so the file is closed even if parsing fails.
    with open(name) as handle:
        payload = json.load(handle)
    sys.stdout.write(json.dumps(rewrite(encoder, payload), indent=2))
    sys.stdout.write('\n')


 def stats(*names):
    """Print one tab-separated size report line per JSON file in ``names``.

    Columns: file name, length of the original JSON, length of the rewritten
    JSON, their ratio, zlib-compressed length of the original,
    zlib-compressed length of the rewritten JSON, and their ratio.
    """
    for name in names:
        # Context manager so the file is closed even if parsing fails.
        with open(name) as handle:
            sys.stdout.write('{}\t'.format(name))
            payload = json.load(handle)

        # Serialise before rewrite(), which modifies ``payload`` in place.
        original = json.dumps(payload)
        processed = json.dumps(rewrite(Encoder(), payload))

        original_size = len(original)
        processed_size = len(processed)
        # json.dumps emits ASCII by default, so encoding does not change the
        # data; it only guarantees that zlib receives bytes.
        original_packed = len(zlib.compress(original.encode('utf-8')))
        processed_packed = len(zlib.compress(processed.encode('utf-8')))

        sys.stdout.write('{}\t{}\t{:.2%}\t{}\t{}\t{:.2%}\n'.format(
            original_size,
            processed_size,
            float(processed_size) / original_size,
            original_packed,
            processed_packed,
            float(processed_packed) / original_packed,
        ))


 if __name__ == '__main__':
    # Usage: <script> (dump|stats) [file ...]
    # The first argument selects the command; the rest are passed to it.
    action, arguments = sys.argv[1], sys.argv[2:]
    command = {'dump': dump, 'stats': stats}[action]
    command(*arguments)
diff --git a/split.py b/split.py
 import pprint
 import json
 import sys
 import zlib

 from bindings import Encoder, inline

 # Sentinel meaning "no value".  ValueExtractor.split returns it as the left
 # half, and MappingSplitter.split uses it both as the default for a missing
 # key and to detect a left half that must not be stored.
 unset = object()


 class MappingSplitter(object):
    """Split a mapping key by key, delegating each configured key to a splitter."""

    def __init__(self, splitters):
        # key -> object exposing ``split(value)`` and ``combine(left, right)``.
        self.splitters = splitters

    def split(self, value):
        """Return ``(left, right)`` dicts for the mapping ``value``.

        ``left`` starts as a shallow copy of ``value``.  Each configured key
        that is present is removed from it and passed to its splitter: the
        splitter's first result is stored back in ``left`` unless it is the
        ``unset`` sentinel, and its second result is always stored in
        ``right``.
        """
        remaining = value.copy()
        extracted = {}

        for name, child in self.splitters.iteritems():
            candidate = remaining.pop(name, unset)
            if candidate is unset:
                # Key absent from the input: nothing to split.
                continue

            outcome = child.split(candidate)
            if outcome[0] is not unset:
                remaining[name] = outcome[0]
            extracted[name] = outcome[1]

        return remaining, extracted

    def combine(self, left, right):
        """Merge ``right`` back into a copy of ``left``.

        Every key of ``right`` is recombined by its own splitter, which
        receives ``left.get(key)`` and the right-hand value.
        """
        merged = left.copy()
        merged.update(
            (name, self.splitters[name].combine(left.get(name), part))
            for name, part in right.iteritems()
        )
        return merged


 class SequenceSplitter(object):
    """Apply one splitter to every item of a sequence."""

    def __init__(self, splitter):
        # Object exposing ``split(item)`` and ``combine(left, right)``.
        self.splitter = splitter

    def split(self, value):
        """Return two lists holding the left and right halves of each item."""
        halves = ([], [])

        for element in value:
            kept, extracted = self.splitter.split(element)
            halves[0].append(kept)
            halves[1].append(extracted)

        return halves

    def combine(self, left, right):
        """Recombine two equally long lists item by item into a new list."""
        assert len(left) == len(right)
        return [self.splitter.combine(l, r) for l, r in zip(left, right)]


 class ValueExtractor(object):
    """Leaf splitter that moves the whole value to the right-hand side."""

    def __init__(self, encoder=lambda i: i):
        # Callable applied to the value when it is extracted; the default
        # returns its argument unchanged.
        self.encoder = encoder

    def split(self, value):
        """Return ``(unset, self.encoder(value))``; nothing stays on the left."""
        extracted = self.encoder(value)
        return unset, extracted

    def combine(self, left, right):
        """Return ``right`` unchanged; ``left`` is ignored."""
        return right


 def rewrite(data):
    """Split the event ``data`` and encode the extracted part.

    Returns ``(shared, document)``.  ``shared`` is what remains of the event
    after the configured keys have been extracted.  ``document`` is the
    result of ``inline`` applied to a dict holding the encoder's
    ``'bindings'`` and the extracted values under ``'data'``.  Keys given a
    plain ``ValueExtractor()`` are extracted verbatim; the others go through
    ``Encoder.encode``.
    """
    encoder = Encoder()
    encode = encoder.encode

    # Exception interface, built inside-out:
    # values[] -> stacktrace -> frames[] -> vars.
    frame = MappingSplitter({'vars': ValueExtractor(encode)})
    stacktrace = MappingSplitter({'frames': SequenceSplitter(frame)})
    exception = MappingSplitter({'stacktrace': stacktrace})

    splitter = MappingSplitter({
        'id': ValueExtractor(),
        'project': ValueExtractor(),
        'release': ValueExtractor(),
        'message': ValueExtractor(),
        'datetime': ValueExtractor(),
        'tags': ValueExtractor(encode),
        'extra': ValueExtractor(encode),
        'received': ValueExtractor(),
        'modules': ValueExtractor(),
        'sentry.interfaces.Message': MappingSplitter({
            'params': ValueExtractor(encode),
        }),
        'sentry.interfaces.Exception': MappingSplitter({
            'values': SequenceSplitter(exception),
        }),
        'sentry.interfaces.http.Http': ValueExtractor(encode),
        'sentry.interfaces.user.User': ValueExtractor(encode),
    })

    shared, unique = splitter.split(data)
    document = {
        'bindings': encoder.bindings(),
        'data': unique,
    }
    return shared, inline(encoder, document)


 def dump(name='/dev/stdin'):
    """Split the JSON event in file ``name`` and print both halves.

    The two values returned by ``rewrite`` are written to stdout one after
    the other, each as JSON indented by two spaces and followed by a newline.
    ``name`` defaults to ``/dev/stdin``.
    """
    # Context manager so the file is closed even if parsing fails.
    with open(name) as handle:
        event = json.load(handle)

    shared, unique = rewrite(event)
    for data in (shared, unique):
        sys.stdout.write(json.dumps(data, indent=2))
        sys.stdout.write('\n')


 def stats(*names):
    """Print one size report line per JSON file in ``names``.

    Each line starts with the file name and a tab, followed by two groups of
    five tab-separated columns joined by a space.  The first group uses
    plain JSON lengths and the second uses zlib-compressed lengths.  Each
    group holds: original, shared, shared/original, unique, unique/original.
    """
    def zlen(value):
        # json.dumps emits ASCII by default, so encoding does not change the
        # data; it only guarantees that zlib receives bytes.
        return len(zlib.compress(value.encode('utf-8')))

    def columns(original_size, shared_size, unique_size):
        # One five-column group of the report.
        return (
            original_size,
            shared_size,
            float(shared_size) / original_size,
            unique_size,
            float(unique_size) / original_size,
        )

    for name in names:
        # Context manager so the file is closed even if parsing fails.
        with open(name) as handle:
            sys.stdout.write('{}\t'.format(name))
            data = json.load(handle)

        original = json.dumps(data)
        shared, unique = map(json.dumps, rewrite(data))

        # Every size is measured once and then reused for the ratios.
        values = (
            columns(len(original), len(shared), len(unique)) +
            columns(zlen(original), zlen(shared), zlen(unique))
        )

        sys.stdout.write(
            (' '.join(['{}\t{}\t{:.2%}\t{}\t{:.2%}'] * 2) + '\n').format(*values),
        )


 if __name__ == '__main__':
    # Usage: <script> (dump|stats) [file ...]
    # The first argument selects the command; the rest are passed to it.
    action, arguments = sys.argv[1], sys.argv[2:]
    command = {'dump': dump, 'stats': stats}[action]
    command(*arguments)
	import itertools
	import pprint
	import json
	import sys
	import zlib
	from collections import (
	defaultdict,
	Mapping,
	Sequence,
	Set,
	)


	# Unique sentinel object.  Nothing in the visible part of this module reads it.
	unset = object()


	# Flags stored as the first element of every encoded tuple.
	TYPE_REFERENCE = 0  # (flag, identifier): points at an entry of the bindings table
	TYPE_VALUE = 1  # leaf value (neither a mapping nor a sequence/set)
	TYPE_MAPPING = 2  # payload is a tuple of (key, value) pairs
	TYPE_SEQUENCE = 3  # payload is a tuple of items (sequences and sets)


	class Encoder(object):
	    """Deduplicating encoder that turns nested values into reference tuples.

	    Every distinct ``(flag, type name, value, annotations)`` signature seen
	    by :meth:`encode` is given a string identifier (a counter starting at
	    ``"0"``) and a usage count.  :meth:`encode` only ever returns
	    ``(TYPE_REFERENCE, identifier)`` tuples; the signatures themselves are
	    exposed through :meth:`bindings` and the counts through
	    :meth:`statistics`.
	    """

	    # String types that must be treated as leaves rather than sequences.
	    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
	    try:
	        _string_types = (basestring,)
	    except NameError:
	        _string_types = (str,)

	    def __init__(self):
	        sequence = itertools.count()

	        def make_value():
	            # Called by the defaultdict the first time a signature is
	            # looked up: hand out the next identifier with a count of zero.
	            identifier = str(next(sequence))
	            return identifier, 0

	        # Maps signature -> (identifier, usage count).
	        self.__identifiers = defaultdict(make_value)

	    def bindings(self):
	        """Return ``{identifier: (flag, type name, value, annotations)}``.

	        The annotations of each signature are converted from a frozenset to
	        a tuple.
	        """
	        table = {}
	        for signature, (identifier, _) in self.__identifiers.items():
	            flag, type_name, value, annotations = signature
	            table[identifier] = (flag, type_name, value, tuple(annotations))
	        return table

	    def statistics(self):
	        """Return ``{identifier: usage count}`` for every known signature."""
	        return dict(self.__identifiers.values())

	    def encode(self, value, preprocess=None, postprocess=None):
	        """Encode ``value`` and return a ``(TYPE_REFERENCE, identifier)`` tuple.

	        ``preprocess`` and ``postprocess`` are optional hooks called as
	        ``hook(value, key)`` and must return ``(value, annotations)`` where
	        ``annotations`` is a set.  ``preprocess`` sees the raw value,
	        ``postprocess`` sees the already converted one (a tuple of
	        references or a leaf).  ``key`` is the mapping key a value was
	        stored under, and ``None`` for mapping keys, sequence items and the
	        top-level value.
	        """
	        if preprocess is None:
	            preprocess = lambda value, key=None: (value, set())

	        if postprocess is None:
	            postprocess = lambda value, key=None: (value, set())

	        string_types = self._string_types

	        def process(value, key=None):
	            # Captured before the hooks run, so it names the original type.
	            type_name = type(value).__name__

	            value, updates = preprocess(value, key)
	            annotations = frozenset() | updates

	            if isinstance(value, Mapping):
	                flag = TYPE_MAPPING
	                value = tuple((process(k), process(v, k)) for k, v in value.items())
	            elif isinstance(value, (Sequence, Set)) and not isinstance(value, string_types):
	                flag = TYPE_SEQUENCE
	                value = tuple(process(v) for v in value)
	            else:
	                flag = TYPE_VALUE
	                # Strings are kept untouched: on Python 2, ``str()`` of a
	                # non-ASCII ``unicode`` value raises UnicodeEncodeError.
	                if not isinstance(value, string_types):
	                    value = str(value)

	            value, updates = postprocess(value, key)
	            annotations = annotations | updates

	            signature = (flag, type_name, value, annotations)
	            identifier, count = self.__identifiers[signature]
	            self.__identifiers[signature] = (identifier, count + 1)
	            return (TYPE_REFERENCE, identifier)

	        return process(value)

	def decode(payload):
	    """Rebuild plain Python data from an encoded ``(bindings, value)`` pair.

	    ``payload`` is a two-item sequence: the bindings table (identifier ->
	    encoded tuple) and the encoded value to decode.  References are
	    resolved through the table, mappings become dicts and sequences become
	    lists.  Leaf values are returned exactly as stored, so the recorded
	    type name is not used to restore the original type.

	    Raises ``AssertionError`` when an encoded tuple carries an unknown flag.
	    """
	    # Unpacked here rather than in the signature: tuple parameters are
	    # Python-2-only syntax.
	    bindings, value = payload

	    def process(encoded):
	        fields = list(encoded)
	        flag = fields.pop(0)
	        if flag == TYPE_REFERENCE:
	            identifier, = fields
	            return process(bindings[identifier])
	        elif flag == TYPE_VALUE:
	            _type_name, content, _annotations = fields
	            return content
	        elif flag == TYPE_MAPPING:
	            _type_name, content, _annotations = fields
	            return {process(k): process(v) for (k, v) in content}
	        elif flag == TYPE_SEQUENCE:
	            _type_name, content, _annotations = fields
	            return [process(item) for item in content]
	        else:
	            raise AssertionError('unexpected flag')

	    return process(value)


	def rewrite(encoder, payload):
	    """Encode the variable data of an event ``payload`` and inline the result.

	    ``payload['extra']`` and the ``vars`` of every frame of every entry in
	    ``payload['sentry.interfaces.Exception']['values']`` are replaced by the
	    references returned from ``encoder.encode``.  ``payload`` is modified in
	    place.  A frame without ``vars`` gets the encoding of an empty dict.

	    Returns the result of ``inline`` applied to
	    ``{'data': payload, 'bindings': encoder.bindings()}``.

	    Raises ``KeyError`` when ``extra``, the exception interface, or a
	    ``stacktrace``/``frames`` entry is missing.
	    """
	    def rewrite_frame(frame):
	        frame['vars'] = encoder.encode(frame.get('vars', {}))
	        return frame

	    def rewrite_stacktrace(stacktrace):
	        # A real list (not a lazy ``map`` object) so the result stays
	        # JSON-serialisable on every Python version.
	        stacktrace['frames'] = [rewrite_frame(frame) for frame in stacktrace['frames']]
	        return stacktrace

	    def rewrite_exception(exception):
	        exception['stacktrace'] = rewrite_stacktrace(exception['stacktrace'])
	        return exception

	    payload['extra'] = encoder.encode(payload['extra'])
	    exceptions = payload['sentry.interfaces.Exception']
	    exceptions['values'] = [rewrite_exception(item) for item in exceptions['values']]

	    return inline(encoder, {
	        'data': payload,
	        # Taken from the encoder only after every encode() call above has run.
	        'bindings': encoder.bindings(),
	    })


	def inline(encoder, data):
	    """Fold bindings that are used exactly once into the entry that uses them.

	    ``data`` is a dict holding a ``'bindings'`` table (identifier -> encoded
	    tuple).  Inside sequence items and mapping values, a reference whose
	    identifier has a usage count of 1 in ``encoder.statistics()`` is
	    replaced by the referenced entry itself, and that entry is then dropped
	    from the table.  Mapping keys are left as references and
	    ``data['data']`` is not touched.

	    The table is modified in place and ``data`` is returned.
	    """
	    statistics = encoder.statistics()
	    bindings = data['bindings']

	    remove = set()

	    def expand(encoded):
	        fields = list(encoded)
	        flag = fields.pop(0)
	        if flag == TYPE_REFERENCE:
	            identifier, = fields
	            if statistics[identifier] == 1:
	                remove.add(identifier)
	                # Expand the target as well: it may not have been visited
	                # yet, and its own single-use children are about to be
	                # removed from the table.
	                return expand(bindings[identifier])
	        elif flag == TYPE_SEQUENCE:
	            type_name, items, annotations = fields
	            return flag, type_name, tuple(expand(item) for item in items), annotations
	        elif flag == TYPE_MAPPING:
	            type_name, pairs, annotations = fields
	            pairs = tuple((key, expand(item)) for key, item in pairs)
	            return flag, type_name, pairs, annotations
	        return (flag,) + tuple(fields)

	    # Iterate over a snapshot of the keys because entries are reassigned.
	    for key in list(bindings):
	        bindings[key] = expand(bindings[key])

	    for identifier in remove:
	        bindings.pop(identifier)

	    return data


	def dump(name):
	    """Rewrite the JSON event stored in file ``name`` and print the result.

	    The output is written to stdout as JSON indented by two spaces,
	    followed by a newline.
	    """
	    encoder = Encoder()
	    # Context manager so the file is closed even if parsing fails.
	    with open(name) as handle:
	        payload = json.load(handle)
	    sys.stdout.write(json.dumps(rewrite(encoder, payload), indent=2))
	    sys.stdout.write('\n')


	def stats(*names):
	    """Print one tab-separated size report line per JSON file in ``names``.

	    Columns: file name, length of the original JSON, length of the
	    rewritten JSON, their ratio, zlib-compressed length of the original,
	    zlib-compressed length of the rewritten JSON, and their ratio.
	    """
	    for name in names:
	        # Context manager so the file is closed even if parsing fails.
	        with open(name) as handle:
	            sys.stdout.write('{}\t'.format(name))
	            payload = json.load(handle)

	        # Serialise before rewrite(), which modifies ``payload`` in place.
	        original = json.dumps(payload)
	        processed = json.dumps(rewrite(Encoder(), payload))

	        original_size = len(original)
	        processed_size = len(processed)
	        # json.dumps emits ASCII by default, so encoding does not change
	        # the data; it only guarantees that zlib receives bytes.
	        original_packed = len(zlib.compress(original.encode('utf-8')))
	        processed_packed = len(zlib.compress(processed.encode('utf-8')))

	        sys.stdout.write('{}\t{}\t{:.2%}\t{}\t{}\t{:.2%}\n'.format(
	            original_size,
	            processed_size,
	            float(processed_size) / original_size,
	            original_packed,
	            processed_packed,
	            float(processed_packed) / original_packed,
	        ))


	if __name__ == '__main__':
	    # Usage: <script> (dump|stats) [file ...]
	    # The first argument selects the command; the rest are passed to it.
	    command = {
	        'dump': dump,
	        'stats': stats,
	    }[sys.argv[1]]
	    command(*sys.argv[2:])