Last active
January 23, 2023 22:04
-
-
Save afparsons/34ddca6428e4a74a70670d49b1ba5af2 to your computer and use it in GitHub Desktop.
spaCy 3.x RegularExpressionMatcher: Patterns and on_match pattern handlers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
A quick excerpt demonstrating usage of a custom `RegularExpressionMatcher` for spaCy 3.
This is from one of my personal projects (HaleyNLP/Irnerius). Module-level imports and other code blocks have been elided.
""" | |
@registry.misc('haleynlp.common.extraction.handler.on_match.bibliography._european_union_ecli')
def _european_union_ecli(
    span: Span,
    match: re.Match,
) -> None:
    """
    Record an ECLI citation found by a regular expression pattern.

    This is an `on_match` pattern handler, registered under the same
    registry name that the `'on_match'` entry of the ECLI matching rule
    refers to.

    Args:
        span (Span):
            The spaCy Span in which the regular expression match was found.
        match (re.Match):
            The regular expression match that was found.

    Returns:
        None. The citation is added to `span.doc._.annotations.citations`
        instead of being returned.
    """
    # NOTE: the majority of this handler has been omitted from the excerpt.
    # In short, the full version assembles an `Annotation` object from the
    # regular expression `Match`; the `...` below stands in for that code.
    # The `_.annotations` extension is created by another custom component
    # that is not shown in this example.
    citation: Annotation = ...
    citations = span.doc._.annotations.citations
    citations.add(citation)
# Matching rules for bibliography citations. Each rule is a dict with:
#   'key'      - an identifier for the rule,
#   'patterns' - a list of compiled regular expressions,
#   'on_match' - the registry name of the handler for a match
#                (presumably resolved through the registry by the matcher;
#                that code is not part of this excerpt).
MATCHING_RULES: Tuple = (
    {
        'key': '_european_union_ecli',
        'patterns': [
            re.compile(
                # `re.VERBOSE` ignores the whitespace and `#` comments below.
                pattern=r"""
                    (?P<ECLI>
                        (?P<IDENTIFIER>ECLI):            # literal "ECLI" prefix
                        (?P<COUNTRY>[A-Z]{2}):           # two uppercase letters
                        (?P<COURT>[A-Z][A-Z0-9]{0,6}):   # a letter, then up to six letters or digits
                        (?P<YEAR>\d{4}):                 # four digits
                        # One to 25 digits or dots, required to END in a digit
                        # so that a sentence-final full stop directly after a
                        # citation is not captured as part of the case number.
                        (?P<CASE>[\d.]{0,24}\d)
                    )
                """,
                flags=re.VERBOSE,
            )
        ],
        'on_match': 'haleynlp.common.extraction.handler.on_match.bibliography._european_union_ecli',
    },
    # ...other elements for different citation types, like United States Acts, Code, Laws, and court cases...
)
@registry.misc('haleynlp.en.component.config.bibliography')
def create_patterns_bibliography() -> Tuple:
    """
    Supply the bibliography matching rules.

    Returns:
        Tuple: The module-level `MATCHING_RULES` tuple itself (not a copy).
    """
    bibliography_rules: Tuple = MATCHING_RULES
    return bibliography_rules
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment