Created
April 10, 2023 11:07
-
-
Save dynamicguy/afc2147b79bbd283a7a91b53eca5e299 to your computer and use it in GitHub Desktop.
remove standard noise from text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def text_cleaner(text):
    """Strip HTML markup from *text* and return lower-cased plain text.

    The substitutions below are applied one after another, in order:
    whitespace is collapsed, ``<br>`` and closing block tags become
    newlines, the ``<head>`` section is dropped, each ``<a href="...">``
    element is replaced by its URL, and every remaining tag is removed.
    Finally, trailing whitespace is stripped and the result is lower-cased
    (this includes any URLs kept from links).

    Note that whitespace directly after a ``>`` and spaces/tabs directly
    before a tag are removed, so words separated only by a tag and spaces
    end up joined.

    Args:
        text: The HTML (or HTML-ish) string to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    # Function-scope import so the function works even when the enclosing
    # module has no top-level ``import re``.
    import re

    # Order matters: whitespace is normalised first so that the newlines
    # introduced by the <br>/</div>/</p> rules survive, and links are
    # rewritten before the generic "remove remaining tags" rule runs.
    rules = (
        (r'>\s+', '>'),                    # drop whitespace right after any '>'
        (r'\s+', ' '),                     # collapse whitespace runs (incl. newlines)
        (r'\s*<br\s*/?>\s*', '\n'),        # <br> -> single newline
        (r'</(div)\s*>\s*', '\n'),         # </div> -> single newline
        (r'</(p|h\d)\s*>\s*', '\n\n'),     # </p>, </h1>.. -> blank line
        (r'<head>.*<\s*(/head|body)[^>]*>', ''),  # drop <head> .. </head> or <body>
        # Non-greedy '.*?' so each link is matched on its own; a greedy
        # '.*' would swallow everything between the first <a> and the
        # last </a> on the same line.
        (r'<a\s+href="([^"]+)"[^>]*>.*?</a>', r'\1'),  # show URL instead of link text
        (r'[ \t]*<[^<]*?/?>', ''),         # remove remaining tags
        (r'^\s+', ''),                     # strip whitespace at the very start
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text.rstrip().lower()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment