Skip to content

Instantly share code, notes, and snippets.

@ppope
Forked from tokestermw/preprocess-twitter.py
Last active May 13, 2021 14:32

Revisions

  1. ppope renamed this gist Feb 20, 2018. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  2. ppope revised this gist Feb 20, 2018. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion preprocess-twitter.py
    Original file line number Diff line number Diff line change
    @@ -64,4 +64,4 @@ def re_sub(pattern, repl):
    if text == "test":
    text = "I TEST alllll kinds of #hashtags and #HASHTAGS, @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!"
    tokens = tokenize(text)
    print tokens
    print(tokens)
  3. ppope revised this gist Feb 20, 2018. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions preprocess-twitter.py
    Original file line number Diff line number Diff line change
    @@ -12,15 +12,15 @@
    """

    import sys
    import re
    import regex as re

    FLAGS = re.MULTILINE | re.DOTALL

    def hashtag(text):
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
    result = "<hashtag> {} <allcaps>".format(hashtag_body)
    result = " {} ".format(hashtag_body.lower())
    else:
    result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
    return result
    @@ -40,12 +40,12 @@ def re_sub(pattern, repl):
    return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\S+", hashtag)
  4. ppope revised this gist Feb 20, 2018. No changes.
  5. @tokestermw tokestermw revised this gist May 7, 2015. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion preprocess-twitter.py
    Original file line number Diff line number Diff line change
    @@ -56,7 +56,7 @@ def re_sub(pattern, repl):
    # text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps)
    text = re_sub(r"([A-Z]){2,}", allcaps)

    return text
    return text.lower()


    if __name__ == '__main__':
  6. @tokestermw tokestermw revised this gist May 7, 2015. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion preprocess-twitter.py
    Original file line number Diff line number Diff line change
    @@ -53,7 +53,7 @@ def re_sub(pattern, repl):
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")

    ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
    # text = re_sub(r"([^ a-z0-9()<>'`\-]){2,}", allcaps)
    # text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps)
    text = re_sub(r"([A-Z]){2,}", allcaps)

    return text
  7. @tokestermw tokestermw created this gist May 7, 2015.
    67 changes: 67 additions & 0 deletions preprocess-twitter.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,67 @@
    """
    preprocess-twitter.py
    python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)"
    Script for preprocessing tweets by Romain Paulus
    with small modifications by Jeffrey Pennington
    with translation to Python by Motoki Wu
    Translation of Ruby script to create features for GloVe vectors for Twitter data.
    http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
    """

    import sys
    import re

    FLAGS = re.MULTILINE | re.DOTALL

    def hashtag(text):
        """Rewrite one matched hashtag into GloVe-style tokens.

        Args:
            text: regex match object whose matched text begins with ``#``.

        Returns:
            If ``str.isupper()`` is true for the hashtag body, the string
            ``"<hashtag> body <allcaps>"`` with the body lower-cased.
            Otherwise ``"<hashtag>"`` followed by the body broken in front
            of every ASCII capital letter, all joined by single spaces.
        """
        body = text.group()[1:]  # drop the leading '#'
        if body.isupper():
            # Lower-case the body here: left in upper case it would be matched
            # again by a later all-caps substitution and tagged twice.
            return "<hashtag> {} <allcaps>".format(body.lower())
        # Equivalent to splitting on the zero-width pattern (?=[A-Z]), but it
        # never yields an empty leading piece (which produced a double space
        # after <hashtag>) and does not rely on empty-match splitting, which
        # ``re.split`` only supports from Python 3.7.
        words = re.findall(r"[A-Z][^A-Z]*|[^A-Z]+", body)
        return " ".join(["<hashtag>"] + words)

    def allcaps(text):
        """Lower-case a matched all-caps run and tag it with ``<allcaps>``.

        Args:
            text: regex match object covering the upper-case run.

        Returns:
            The matched text in lower case, followed by ``" <allcaps>"``.
        """
        return "{} <allcaps>".format(text.group().lower())


    def tokenize(text):
        """Normalize a raw tweet into GloVe-Twitter style text.

        URLs, @mentions, emoticons, hearts, numbers, hashtags, repeated
        punctuation, elongated words and all-caps words are replaced by (or
        annotated with) placeholder tokens such as ``<url>`` or ``<allcaps>``.
        The substitutions are applied one after another, so their order
        matters.

        Args:
            text: the tweet as a single string.

        Returns:
            The rewritten tweet as a single string (not a list of tokens).
        """
        # Different regex parts for smiley faces
        eyes = r"[8:=;]"
        nose = r"['`\-]?"

        # Helper so the substitutions below are less repetitive; it always
        # operates on the current value of ``text``.
        def re_sub(pattern, repl):
            return re.sub(pattern, repl, text, flags=FLAGS)

        text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
        text = re_sub(r"@\w+", "<user>")
        text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
        text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
        text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
        text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
        # Pad slashes only after the emoticon rules have run: doing it first
        # turns ":/" into ": / ", which can no longer match <neutralface>.
        text = re_sub(r"/", " / ")
        # Hearts must be handled before numbers, or the "3" becomes <number>.
        text = re_sub(r"<3", "<heart>")
        text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
        text = re_sub(r"#\S+", hashtag)
        text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
        text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")

        # The Ruby script tags almost everything as <allcaps>
        # (pattern ([^ a-z0-9()<>'`\-]){2,}), so the selection is limited here
        # to runs of two or more ASCII capitals.
        text = re_sub(r"([A-Z]){2,}", allcaps)

        return text


    if __name__ == '__main__':
        # Exactly one argument is expected: the tweet text, or the literal
        # word "test" to run the built-in sample sentence.
        if len(sys.argv) != 2:
            sys.exit('usage: python preprocess-twitter.py "<tweet text>"|test')
        text = sys.argv[1]
        if text == "test":
            text = "I TEST alllll kinds of #hashtags and #HASHTAGS, @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!"
        tokens = tokenize(text)
        # Function-call form: valid on Python 3 and prints the same single
        # string on Python 2.
        print(tokens)