Last active
August 2, 2022 13:19
-
-
Save afparsons/b40d84f1e20b21fa53bab7b20dd8957b to your computer and use it in GitHub Desktop.
Spacy: Tabular View of Token Attributes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# standard library imports | |
from operator import attrgetter | |
from typing import Union, Generator | |
# third-party library imports | |
from pandas import DataFrame | |
from spacy.tokens import Token, Span, Doc | |
def analyze_tokens(
    doclike: Union[Doc, Span],
    *attributes
) -> DataFrame:
    """
    Build a tabular view of token attributes for a ``Doc`` or ``Span``.

    Every token in ``doclike`` contributes one record holding its ``text``
    followed by each requested attribute. The result is transposed before
    being returned, so each row is an attribute and each column is a token
    (labelled by its position in ``doclike``).

    Args:
        doclike: The ``Doc`` or ``Span`` whose tokens are inspected.
        *attributes: Names of token attributes to include after ``text``.
            Dotted names are resolved by ``operator.attrgetter``, so
            ``'_.custom'`` reads ``token._.custom``.

    Returns:
        A ``DataFrame`` indexed by ``('text', *attributes)``. Attribute values
        that are generators are materialised into tuples so that their
        contents can be displayed.

    Example:
        `analyze_tokens(doc, 'like_num', 'lemma_', 'pos_', 'children', '_.custom')`
    """
    columns = ('text', *attributes)
    # One getter per column. ``attrgetter('text')`` on its own returns a bare
    # value rather than a 1-tuple, so a single multi-name getter would make
    # ``analyze_tokens(doc)`` iterate over the characters of ``token.text``.
    getters = tuple(attrgetter(column) for column in columns)
    rows = [
        tuple(
            tuple(value) if isinstance(value, Generator) else value
            for value in (getter(token) for getter in getters)
        )
        for token in doclike
    ]
    return DataFrame(data=rows, columns=columns).T
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This returns a Pandas DataFrame.
There is nothing revolutionary about this function; it is simply a convenient way to inspect the tokens of a given `Doc` or `Span` in, for example, a Jupyter notebook.