# Quick and dirty Brat-to-CSV conversion.

from __future__ import print_function

import csv
import io
import re
import sys

# Copy server/src/{gtbtokenize,tokenise}.py from Brat next to this script.
from tokenise import gtb_token_boundary_gen


def read_annot(fname):
    """Read a Brat .ann file; return labels indexed by token start, plus 'Level' attributes."""
    ann = {}
    level = {}
    with open(fname) as f:
        for ln in f:
            if ln.startswith('T'):
                # Entity line: "T1<TAB>Label start end<TAB>text".
                ident, label, start, end, text = ln.split(None, 4)
                start, end = int(start), int(end)
                for t_start, t_end in gtb_token_boundary_gen(text):
                    # Index annotations by token start only, because it's
                    # too hard to get the tokenizer to behave just like it
                    # does in Brat and the ends tend to go wrong.
                    #ann[(start + int(t_start), start + int(t_end))] = label
                    ann[start + int(t_start)] = (label, int(t_end), ident)
            else:
                # Attribute line, e.g. "A1<TAB>Level T1 High": keep the
                # 'Level' value for the entity it refers to.
                try:
                    _, lev, ident, value = ln.split()
                    if lev != 'Level':
                        continue
                    level[ident] = value
                except ValueError:
                    pass
    return ann, level


if len(sys.argv) != 4:
    print("usage: %s review.txt user1.ann user2.ann" % sys.argv[0],
          file=sys.stderr)
    sys.exit(1)

txt_name, ann1_name, ann2_name = sys.argv[1:]

with io.open(txt_name, encoding='utf-8') as f:
    text = f.read()

tok_bound = list(gtb_token_boundary_gen(text))

ann1, level1 = read_annot(ann1_name)
ann2, level2 = read_annot(ann2_name)

wr = csv.writer(sys.stdout, dialect='excel')
wr.writerow(['Token',
             #'Start', 'End',
             'Label1', 'Level1', 'Label2', 'Level2'])
# One row per token: the token text plus each annotator's label and Level.
for start, end in tok_bound:
    label1, end1, ident1 = ann1.get(start, ('', 0, ''))
    label2, end2, ident2 = ann2.get(start, ('', 0, ''))
    lvl1 = level1.get(ident1, '')
    lvl2 = level2.get(ident2, '')
    # Encode to UTF-8 bytes because the Python 2 csv module expects byte strings.
    wr.writerow([text[start:end].encode('utf-8'),
                 #start, end,
                 label1, lvl1, label2, lvl2])
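
# Example invocation (the script and file names below are placeholders;
# substitute whatever your files are actually called):
#
#   python brat2csv.py review.txt annotator1.ann annotator2.ann > agreement.csv
#
# The resulting CSV has one row per token of review.txt, carrying each
# annotator's entity label and its 'Level' attribute (blank where a token is
# unannotated), e.g. for comparing the two annotators side by side in a
# spreadsheet.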