Created
December 29, 2022 11:37
-
-
Save DrDub/9413410626b5a77d8f1f576f6447d64e to your computer and use it in GitHub Desktop.
Python UIMA-CPP Concept code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# this is a concept file showcasing what a deep Python-UIMACPP could enable | |
from uima import AnalysisEngine, AnalysisEngineType | |
from uima.framework import buildPipeline, TypeMapper, SetFeature, Remote | |
from uima.index import Index, AnnotationIndex | |
from uima.typesystem.fs import ( | |
TOP, | |
Annotation, | |
FSFloatArray, | |
FSString, | |
FSBoolean, | |
FSFloat, | |
) | |
from nltk.uima import PunkTokenizer, NEChunkParser | |
from uima.wrappers import SpacyAnnotator | |
from uima.wrappers import BertAnnotator | |
import numpy as np | |
# custom type system | |
class MyToken(Annotation):
    """A token annotation in the custom type system; no extra features."""
class MySentence(Annotation):
    """A sentence annotation carrying a dense embedding and a relevance score."""

    # Was `FloatArray()` -- that name is never defined; the type system import
    # above provides `FSFloatArray`. Declared without a call for consistency
    # with the other feature declarations in this file (Score, Source, ...).
    Embedding = FSFloatArray  # sentence embedding vector (filled by BertAnnotator)
    Score = FSFloat  # similarity to the global query embedding (filled by MyAE)
class MyNER(Annotation):
    """A named-entity annotation tagged with its producing annotator."""

    Source = FSString  # which annotator produced this NER (e.g. "spaCy")
    Selected = FSBoolean  # set True by MyAE for NERs inside top-scored sentences
# Module-level state shared across pipeline runs: the query document's
# embedding, captured on the first process() call and reused afterwards.
# HACK: deliberate global mutable state -- the point of this concept file
# is that such hacks should be *possible* in a deep Python binding.
MAIN_EMBEDDING: list = []
@AnalysisEngineType(
    input=[MySentence],
    output=[MySentence.Score, MyNER.Selected],
    indexes=[
        Index(
            "SelectedIndex",
            type_="sorted",
            fs=MyNER,
            key=MyNER.Selected,
            comparator="inverted",
        ),
        Index(
            "ScoredIndex",
            # was `type="_sorted"` -- keyword and value typo; the sibling
            # SelectedIndex shows the intended spelling `type_="sorted"`.
            type_="sorted",
            fs=MySentence,
            # was `key=FSFloat` -- an index key is a feature of `fs`, not a
            # bare type; the feature being sorted on is MySentence.Score.
            key=MySentence.Score,
            comparator="inverted",
        ),
    ],
)
class MyAE(AnalysisEngine):
    """Scores sentences against a global query embedding and marks NERs.

    First process() call (query document): captures the sentence embedding
    into the module-level MAIN_EMBEDDING. Subsequent calls: scores every
    sentence by dot product with the query embedding and sets Selected=True
    on NERs covered by the `top_sentences` best-scored sentences.
    """

    def __init__(self, top_sentences):
        super().__init__()
        # How many top-scored sentences have their NERs selected.
        self.top_sentences = top_sentences

    def process(self, cas):
        global MAIN_EMBEDDING
        if MAIN_EMBEDDING:
            # Scoring pass: dot product of each sentence embedding with the
            # stored query embedding.
            for sentence in cas.indices[AnnotationIndex(MySentence)]:
                sentence[MySentence.Score] = np.dot(
                    sentence[MySentence.Embedding], MAIN_EMBEDDING
                )
            # was `cas.addToIndexes` -- missing call parentheses, so the
            # statement was a no-op attribute access.
            cas.addToIndexes()
            # ScoredIndex is sorted inverted, so a prefix slice yields the
            # highest-scoring sentences.
            for sentence in cas.indices["ScoredIndex"][: self.top_sentences]:
                # was `MyNer` (NameError) -- the type is spelled MyNER.
                for ner in sentence.subiterator(cas.indices[AnnotationIndex(MyNER)]):
                    ner[MyNER.Selected] = True
        else:
            # First pass: remember the query embedding. If the query document
            # has several sentences, the last one wins (concept-code behavior).
            for sentence in cas.indices[AnnotationIndex(MySentence)]:
                # was `clone(...)` -- undefined name; list() takes a copy so
                # the stored vector is decoupled from the CAS feature array.
                MAIN_EMBEDDING = list(sentence[MySentence.Embedding])
# NOTE(review): `nltk`, `spacy` and `bert` are used below as bare module
# names but never imported at file top (only `nltk.uima` submodule names
# are); this is concept code sketching the intended type-mapping API.
pipeline = buildPipeline(
    [
        # NLTK tokens
        TypeMapper(output={nltk.Token: MyToken}).wrap(PunkTokenizer()),
        # spaCy sentence boundaries and NER
        SetFeature({MyNER.Source: "spaCy"}).wrap(
            TypeMapper(output={spacy.Sentence: MySentence, spacy.NER: MyNER}).wrap(
                SpacyAnnotator({"load": "en"})
            )
        ),
        # NLTK NERs over spaCy sentences
        # was Source: "spaCy" -- copy-paste error from the block above; this
        # wrapper produces NLTK entities, so the provenance label is "NLTK".
        SetFeature({MyNER.Source: "NLTK"}).wrap(
            TypeMapper(
                input={MyToken: nltk.Token, MySentence: nltk.Sentence},
                output={nltk.NamedEntity: MyNER},
            ).wrap(NEChunkParser())
        ),
        # BERT embeddings over spaCy sentences
        # here only text and sentences go over the wire and only embeddings come back
        TypeMapper(
            input={bert.Text: MySentence},
            output={bert.Text.FullEmbedding: MySentence.Embedding},
        ).wrap(
            Remote(server="http://localhost:8000", protocol="zmq").wrap(
                BertAnnotator({"model": "uncased_L-12_H-768_A-12"})
            )
        ),
        # Compute embedding distance to a global query and select NERs inside
        MyAE(2),
    ]
)
# Run the pipeline twice on one CAS: first pass stores the query embedding,
# second pass scores/selects against it (see MyAE.process).
aCas = pipeline.newCAS()
aCas.setDocumentText("query sentence")  # (internally resets the cas)
pipeline.process(aCas)  # compute embedding for query
aCas.setDocumentText("very long text with NERs")
pipeline.process(aCas)  # compute selected NERs and embeddings
# get the tokens of the NERs out for further use
selected = []
for ner in aCas.indices["SelectedIndex"]:
    if not ner[
        MyNER.Selected
    ]:  # as the index is sorted, we can stop after the true values
        break
    # something could be done with the 'Source' field over here...
    selected.append(
        [
            token.coveredText()
            # was `cas.indices[...]` -- `cas` is undefined at module level
            # (it is only a parameter of MyAE.process); the CAS here is aCas.
            for token in ner.subiterator(aCas.indices[AnnotationIndex(MyToken)])
        ]
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment