Revisions
ppope renamed this gist
Feb 20, 2018. 1 changed file with 0 additions and 0 deletions.
File renamed without changes.
ppope revised this gist
Feb 20, 2018. 1 changed file with 1 addition and 1 deletion.

@@ -64,4 +64,4 @@ def re_sub(pattern, repl):
     if text == "test":
         text = "I TEST alllll kinds of #hashtags and #HASHTAGS, @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!"
     tokens = tokenize(text)
-    print tokens
+    print(tokens)
ppope revised this gist
Feb 20, 2018. 1 changed file with 3 additions and 3 deletions.

@@ -12,15 +12,15 @@
 """

 import sys
-import re
+import regex as re

 FLAGS = re.MULTILINE | re.DOTALL

 def hashtag(text):
     text = text.group()
     hashtag_body = text[1:]
     if hashtag_body.isupper():
-        result = "<hashtag> {} <allcaps>".format(hashtag_body)
+        result = " {} ".format(hashtag_body.lower())
     else:
         result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
     return result
@@ -40,12 +40,12 @@ def re_sub(pattern, repl):
         return re.sub(pattern, repl, text, flags=FLAGS)

     text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
-    text = re_sub(r"/"," / ")
     text = re_sub(r"@\w+", "<user>")
     text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
     text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
     text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
     text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
+    text = re_sub(r"/"," / ")
     text = re_sub(r"<3","<heart>")
     text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
     text = re_sub(r"#\S+", hashtag)
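The net effect of the hashtag() change above is easiest to see side by side: an all-uppercase hashtag body, which used to be wrapped in <hashtag> ... <allcaps> tags, is now simply lowercased and padded with spaces (the revision also swaps the stdlib re module for the third-party regex package, imported under the same name). A minimal sketch of the two format strings before and after this revision, applied to a body taken from the gist's own test sentence:

# Illustration only: "old" and "new" are the isupper() branch of hashtag()
# before and after this revision, applied to a sample hashtag body.
hashtag_body = "HASHTAGS"

old = "<hashtag> {} <allcaps>".format(hashtag_body)   # '<hashtag> HASHTAGS <allcaps>'
new = " {} ".format(hashtag_body.lower())             # ' hashtags '
print(old)
print(new)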
ppope revised this gist
Feb 20, 2018. No changes.
tokestermw revised this gist
May 7, 2015. 1 changed file with 1 addition and 1 deletion.

@@ -56,7 +56,7 @@ def re_sub(pattern, repl):
     # text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps)
     text = re_sub(r"([A-Z]){2,}", allcaps)

-    return text
+    return text.lower()


 if __name__ == '__main__':
tokestermw revised this gist
May 7, 2015. 1 changed file with 1 addition and 1 deletion.

@@ -53,7 +53,7 @@ def re_sub(pattern, repl):
     text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")

     ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
-    # text = re_sub(r"([^ a-z0-9()<>'`\-]){2,}", allcaps)
+    # text = re_sub(r"([^a-z0-9()<>'`\-]){2,}", allcaps)
     text = re_sub(r"([A-Z]){2,}", allcaps)

     return text
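The ## comment carried through these hunks explains the intent: instead of tagging every long non-lowercase run the way the Ruby script does, only runs of two or more uppercase letters are handed to allcaps(), which lowercases the run and appends an <allcaps> marker. A minimal sketch of that behaviour, reusing the gist's allcaps() on a fragment of its own test sentence:

import re

def allcaps(text):
    # Same as the gist's allcaps(): lowercase the matched run and tag it.
    text = text.group()
    return text.lower() + " <allcaps>"

# Only the run of 2+ uppercase letters is tagged; the lone "I" is left untouched.
print(re.sub(r"([A-Z]){2,}", allcaps, "I TEST alllll kinds of #hashtags"))
# -> I test <allcaps> alllll kinds of #hashtags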
tokestermw created this gist
May 7, 2015.

@@ -0,0 +1,67 @@
"""
preprocess-twitter.py

python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)"

Script for preprocessing tweets by Romain Paulus
with small modifications by Jeffrey Pennington
with translation to Python by Motoki Wu

Translation of Ruby script to create features for GloVe vectors for Twitter data.
http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
"""

import sys
import re

FLAGS = re.MULTILINE | re.DOTALL

def hashtag(text):
    text = text.group()
    hashtag_body = text[1:]
    if hashtag_body.isupper():
        result = "<hashtag> {} <allcaps>".format(hashtag_body)
    else:
        result = " ".join(["<hashtag>"] + re.split(r"(?=[A-Z])", hashtag_body, flags=FLAGS))
    return result

def allcaps(text):
    text = text.group()
    return text.lower() + " <allcaps>"


def tokenize(text):
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"/"," / ")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"<3","<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\S+", hashtag)
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")

    ## -- I just don't understand why the Ruby script adds <allcaps> to everything so I limited the selection.
    # text = re_sub(r"([^ a-z0-9()<>'`\-]){2,}", allcaps)
    text = re_sub(r"([A-Z]){2,}", allcaps)

    return text


if __name__ == '__main__':
    _, text = sys.argv
    if text == "test":
        text = "I TEST alllll kinds of #hashtags and #HASHTAGS, @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!"
    tokens = tokenize(text)
    print tokens
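For completeness, a minimal usage sketch against this first revision. Assumptions: the file is saved locally under an importable name such as preprocess_twitter.py (the hyphenated gist filename cannot be imported directly), and it is run under Python 2, since the bare print tokens statement is Python 2 syntax; the later revisions above adapt the script for Python 3.

# Python 2 sketch; preprocess_twitter.py is an assumed local copy of this gist.
from preprocess_twitter import tokenize

tweet = "Loving the new #GloVe vectors from @stanfordnlp :) http://nlp.stanford.edu/projects/glove/ <3"
print tokenize(tweet)  # the tweet rewritten with <hashtag>, <user>, <smile>, <url>, <heart>, ... markers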