ppope · May 13, 2021 14:32 · Feb 20, 2018 · Feb 20, 2018 · Feb 20, 2018 · Feb 20, 2018
diff --git a/preprocess-twitter.py → preprocess_twitter.py b/preprocess-twitter.py → preprocess_twitter.py
diff --git a/preprocess-twitter.py b/preprocess-twitter.py
@@ -64,4 +64,4 @@ def re_sub(pattern, repl):
     if text == "test":
         text = "I TEST alllll kinds of #hashtags and #HASHTAGS, @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!"
     tokens = tokenize(text)
-    print tokens
+    print(tokens)
diff --git a/preprocess-twitter.py b/preprocess-twitter.py
@@ -12,15 +12,15 @@
 """
 
 import sys
-import re
+import regex as re
 
 FLAGS = re.MULTILINE | re.DOTALL
 
 def hashtag(text):
     text = text.group()
     hashtag_body = text[1:]
     if hashtag_body.isupper():
-        result = "<hashtag> {} <allcaps>".format(hashtag_body)
+        result = " {} ".format(hashtag_body.lower())
     else:
         result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
     return result
@@ -40,12 +40,12 @@ def re_sub(pattern, repl):
         return re.sub(pattern, repl, text, flags=FLAGS)
 
     text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
-    text = re_sub(r"/"," / ")
     text = re_sub(r"@\w+", "<user>")
     text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
     text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
     text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
     text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
+    text = re_sub(r"/"," / ")
     text = re_sub(r"<3","<heart>")
     text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
     text = re_sub(r"#\S+", hashtag)

diff --git a/preprocess-twitter.py b/preprocess-twitter.py
@@ -56,7 +56,7 @@ def re_sub(pattern, repl):
     # text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps)
     text = re_sub(r"([A-Z]){2,}", allcaps)
 
-    return text
+    return text.lower()
 
 
 if __name__ == '__main__':

diff --git a/preprocess-twitter.py b/preprocess-twitter.py
@@ -53,7 +53,7 @@ def re_sub(pattern, repl):
     text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")
 
     ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
-    # text = re_sub(r"([^ a-z0-9()<>'`\-]){2,}", allcaps)
+    # text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps)
     text = re_sub(r"([A-Z]){2,}", allcaps)
 
     return text

diff --git a/preprocess-twitter.py b/preprocess-twitter.py
@@ -0,0 +1,67 @@
+"""
+preprocess-twitter.py
+
+python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)"
+
+Script for preprocessing tweets by Romain Paulus
+with small modifications by Jeffrey Pennington
+with translation to Python by Motoki Wu
+
+Translation of Ruby script to create features for GloVe vectors for Twitter data.
+http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
+"""
+
+import sys
+import re
+
+FLAGS = re.MULTILINE | re.DOTALL
+
+def hashtag(text):
+    text = text.group()
+    hashtag_body = text[1:]
+    if hashtag_body.isupper():
+        result = "<hashtag> {} <allcaps>".format(hashtag_body)
+    else:
+        result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
+    return result
+
+def allcaps(text):
+    text = text.group()
+    return text.lower() + " <allcaps>"
+
+
+def tokenize(text):
+    # Different regex parts for smiley faces
+    eyes = r"[8:=;]"
+    nose = r"['`\-]?"
+
+    # function so code less repetitive
+    def re_sub(pattern, repl):
+        return re.sub(pattern, repl, text, flags=FLAGS)
+
+    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
+    text = re_sub(r"/"," / ")
+    text = re_sub(r"@\w+", "<user>")
+    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
+    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
+    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
+    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
+    text = re_sub(r"<3","<heart>")
+    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
+    text = re_sub(r"#\S+", hashtag)
+    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
+    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")
+
+    ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
+    # text = re_sub(r"([^ a-z0-9()<>'`\-]){2,}", allcaps)
+    text = re_sub(r"([A-Z]){2,}", allcaps)
+
+    return text
+
+
+if __name__ == '__main__':
+    _, text = sys.argv
+    if text == "test":
+        text = "I TEST alllll kinds of #hashtags and #HASHTAGS, @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!"
+    tokens = tokenize(text)
+    print tokens