# Quick and dirty Brat-to-CSV conversion.

from __future__ import print_function
import csv
import io
import re
import sys

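# The tokenise module is not bundled with this script: copy
# server/src/gtbtokenize.py and server/src/tokenise.py from Brat so that
# they are importable from here.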
from tokenise import gtb_token_boundary_gen


def read_annot(fname):
    """Read a Brat .ann file into token-level lookup tables.

    Lines starting with 'T' are parsed as ``ident label start end text``.
    The annotated text is re-tokenized and each token is recorded under its
    absolute start offset (annotation start + token start).  Any other line
    is used only if it has exactly four whitespace-separated fields of the
    form ``<id> Level <ident> <value>``; everything else is ignored.

    The file is decoded as UTF-8, the same way the script reads the review
    text, so that token offsets inside the annotated text are counted in
    characters rather than bytes.

    Returns a pair ``(ann, level)``:
      ann   -- dict mapping absolute token start to
               ``(label, absolute token end, ident)``
      level -- dict mapping annotation ident to the value of its
               'Level' line

    Raises ValueError if a 'T' line has fewer than five fields or
    non-integer offsets.
    """
    ann = {}
    level = {}
    with io.open(fname, encoding='utf-8') as f:
        for ln in f:
            if ln.startswith('T'):
                ident, label, start, end, text = ln.split(None, 4)
                # ``end`` is not needed below; it is still converted so that
                # a malformed span fails loudly instead of being mis-parsed.
                start, end = int(start), int(end)
                for t_start, t_end in gtb_token_boundary_gen(text):
                    # Index annotations by token start only, because it's
                    # too hard to get the tokenizer to behave just like it
                    # does in Brat and the ends tend to go wrong.  The end
                    # is stored as an absolute offset, like the key.
                    ann[start + int(t_start)] = (label,
                                                 start + int(t_end),
                                                 ident)
                continue

            # Non-'T' lines: only "<id> Level <ident> <value>" is of interest.
            fields = ln.split()
            if len(fields) == 4 and fields[1] == 'Level':
                level[fields[2]] = fields[3]
    return ann, level


def _csv_cell(value):
    """Return *value* in the form csv.writer expects on this Python version.

    On Python 2 the csv module works on byte strings, so unicode values are
    encoded as UTF-8.  On Python 3 the value is returned unchanged, because
    csv.writer would otherwise write the repr of a bytes object (b'...').
    """
    if sys.version_info[0] < 3 and isinstance(value, type(u'')):
        return value.encode('utf-8')
    return value


# Command line: the raw review text followed by the two annotators' files.
if len(sys.argv) != 4:
    print("usage: %s review.txt user1.ann user2.ann" % sys.argv[0],
          file=sys.stderr)
    sys.exit(1)
txt_name, ann1_name, ann2_name = sys.argv[1:]
with io.open(txt_name, encoding='utf-8') as f:
    text = f.read()

# (start, end) offsets of every token in the review text.
tok_bound = list(gtb_token_boundary_gen(text))

ann1, level1 = read_annot(ann1_name)
ann2, level2 = read_annot(ann2_name)

# Placeholder used for tokens that an annotator did not tag.
_NO_ANNOT = ('', 0, '')

wr = csv.writer(sys.stdout, dialect='excel')
# Token offsets are not emitted; add 'Start'/'End' here and start/end to
# the rows below if they are needed.
wr.writerow(['Token', 'Label1', 'Level1', 'Label2', 'Level2'])
for start, end in tok_bound:
    # Annotations are looked up by token start only (see read_annot).
    label1, _, ident1 = ann1.get(start, _NO_ANNOT)
    label2, _, ident2 = ann2.get(start, _NO_ANNOT)
    row = [text[start:end],
           label1, level1.get(ident1, ''),
           label2, level2.get(ident2, '')]
    wr.writerow([_csv_cell(cell) for cell in row])