Last active
April 13, 2017 16:51
-
-
Save averagehuman/0ea1455c2fe76c0b5eac to your computer and use it in GitHub Desktop.
Markup twitter hashtags and usernames within a html fragment
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
The twitter-text-python library (https://pypi.python.org/pypi/twitter-text-python) can be used
to urlify text containing @<username>s and #<hashtag>s. It is a bit trickier if you want to do
the same with HTML, but BeautifulSoup makes it straightforward.
"""
from bs4 import BeautifulSoup, NavigableString | |
from ttp import ttp | |
# Bound ``parse`` method of a single ttp parser instance; transform_html()
# reads the ``.html`` attribute of the result it returns.
parse_text = ttp.Parser().parse

# Tags whose text should be left as is (the list is not exhaustive).
EXCLUDE_TAGS = frozenset(('a', 'style', 'script', 'title', 'link'))
def transform_html(html, parser='html.parser'):
    """Markup twitter hashtags and usernames within a html fragment.

    Every plain text node of *html* is passed to ``parse_text`` and replaced
    by the markup found in the result's ``html`` attribute. Text that sits
    inside a tag named in ``EXCLUDE_TAGS`` (at any depth) is left untouched,
    and so are comments, doctypes and other special string nodes.

    :param html: the HTML fragment (or document) to transform.
    :param parser: name of the BeautifulSoup tree builder used for both the
        input and the generated snippets. It defaults to ``'html.parser'``,
        which keeps a fragment a fragment instead of wrapping it in
        ``<html>``/``<body>`` and makes the output independent of which
        optional parsers happen to be installed.
    :returns: the transformed markup as a unicode string.

    >>> fragment = '<p>my name is @SorenKQuotes and my site is <a href="http://site.com/index.html#link">here</a></p>'
    >>> for a in BeautifulSoup(fragment, 'html.parser').find_all('a'):
    ...     print(a)
    <a href="http://site.com/index.html#link">here</a>
    >>> fragment = transform_html(fragment)
    >>> for a in BeautifulSoup(fragment, 'html.parser').find_all('a'):
    ...     print(a)
    <a href="https://twitter.com/SorenKQuotes">@SorenKQuotes</a>
    <a href="http://site.com/index.html#link">here</a>
    """
    soup = BeautifulSoup(html, parser)
    # The soup object itself comes first so that text at the top level of a
    # fragment (which has no enclosing tag) is processed as well. The list
    # is built up front, so tags inserted below are never visited.
    for tag in [soup] + soup.find_all(True):
        # Skip excluded tags and everything nested inside them; linkifying
        # '<a><b>@name</b></a>' would otherwise produce nested anchors.
        lineage = [tag] + list(tag.parents)
        if any(node.name.lower() in EXCLUDE_TAGS for node in lineage):
            continue
        # Iterate over a snapshot: replace_with() mutates tag.contents.
        for child in list(tag.contents):
            # Exact type check on purpose: Comment, Doctype, CData, etc. are
            # NavigableString subclasses and must not become visible text.
            if type(child) is not NavigableString:
                continue
            # NOTE(review): child holds the entity-decoded text. Confirm that
            # ttp escapes it in `.html`; otherwise text such as '&lt;b&gt;'
            # would be re-parsed here as live markup.
            child.replace_with(BeautifulSoup(parse_text(child).html, parser))
    # decode() returns text on both Python 2 and 3; the `unicode` builtin
    # used previously does not exist on Python 3.
    return soup.decode()
if __name__ == '__main__':
    # Run the doctests embedded in this module. The exit status is non-zero
    # when any example fails, so shells and CI jobs can detect a failure.
    import doctest
    import sys

    sys.exit(1 if doctest.testmod().failed else 0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment