Skip to content

Instantly share code, notes, and snippets.

@driscoll
Created October 18, 2011 00:36

Revisions

  1. driscoll created this gist Oct 18, 2011.
    95 changes: 95 additions & 0 deletions parsegnip.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,95 @@
    #!/usr/bin/python
    """ TODO
    Need better search strategy because
    'ows' is matching '#throwstrikes'
    Blacklist?
    What's up with these blank lines?
    "No text field found."
    Need a progress bar of some sort; a full run takes forever.
    """

    import sys
    import re

    # Captures the (still JSON-escaped) value of the first "text":"..." pair
    # found in the searched string.  The value may itself contain backslash
    # escapes such as \" , so escape pairs are consumed whole; stopping at the
    # first double quote would truncate any tweet that quotes something.
    RE_TEXT_FIELD = re.compile(r'"text":"((?:[^"\\]|\\.)*)"')

    # Terms that contain a keyword only by accident (the file's TODO notes
    # 'ows' matching '#throwstrikes'); meant to be excluded from matching.
    BLACKLIST = ['throwstrikes']

    def die():
        """Print the command-line usage message and terminate the program.

        Raises:
            SystemExit: always, with exit status 1.
        """
        # A single parenthesised argument parses the same way as a Python 2
        # print statement and as a Python 3 print() call, so the script no
        # longer fails to compile on Python 3.
        print("Usage: {0} keywordfn.txt gnipfn.txt outputfn.json".format(sys.argv[0]))
        sys.exit(1)

    def get_keywords(filename):
        """Read search keywords from a text file, one keyword per line.

        Each line is stripped of surrounding whitespace and lower-cased.
        Lines that are empty after stripping are skipped.

        Args:
            filename: path of the keyword file.

        Returns:
            List of keywords, in file order.

        If the file cannot be read, an error message is printed and die()
        is called, which exits the program.
        """
        keywords = []
        try:
            with open(filename, 'r') as keywordf:
                for line in keywordf:
                    keyword = line.strip().lower()
                    # The empty string is a substring of every string, so a
                    # blank line in the keyword file would otherwise turn into
                    # a keyword that "matches" every single tweet.
                    if keyword:
                        keywords.append(keyword)
        except (IOError, OSError, UnicodeError):
            # Only I/O and decoding problems are reported as a read error; a
            # bare except would also swallow KeyboardInterrupt and real bugs.
            print("Error reading keyword file.")
            die()
        return keywords

    def parse_tweets(keywords, gnipfn, outfn, verbose=False):
        """Scan a file of tweets for keywords and save the matching lines.

        Every line of gnipfn is searched with RE_TEXT_FIELD for its text
        field.  The text is lower-cased, every BLACKLIST term is blanked out
        of it, and each keyword is then looked up as a plain substring.  A
        line matching at least one keyword is appended unchanged to outfn.

        Args:
            keywords: iterable of keyword strings; empty strings are ignored.
            gnipfn: path of the input file, one tweet record per line.
            outfn: path of the output file; opened in append mode.
            verbose: when true, print each matched keyword and the text of
                each matching tweet.

        Returns:
            Dict mapping each keyword to the number of lines it matched.  A
            line that matches several keywords is counted once per keyword.

        If either file cannot be opened, an error message is printed and
        die() is called, which exits the program.
        """
        try:
            outputf = open(outfn, 'a')
        except (IOError, OSError):
            print("Error opening output file.")
            die()

        try:
            gnipf = open(gnipfn, 'r')
        except (IOError, OSError):
            outputf.close()
            print("Error opening file with tweets from Gnip.")
            die()

        frequency = dict((kw, 0) for kw in keywords)
        # Keywords are matched case-insensitively: both sides are lower-cased.
        # The original keyword is kept as the key into the frequency table.
        search_terms = [(kw, kw.lower()) for kw in keywords if kw]
        blacklist = [term.lower() for term in BLACKLIST if term]

        try:
            for line in gnipf:
                # Blank lines carry no tweet; skip them quietly instead of
                # reporting each one as a missing text field.
                if not line.strip():
                    continue

                # Extract the text once per line, not once per keyword.
                match = RE_TEXT_FIELD.search(line)
                if match is None:
                    print(line)
                    print("No text field found.")
                    continue
                text = match.group(1)

                # Blank out blacklisted terms so a keyword that occurs only
                # inside one of them (e.g. 'ows' in 'throwstrikes') does not
                # count.  A space is substituted so the neighbouring fragments
                # cannot fuse into a new accidental match.
                haystack = text.lower()
                for term in blacklist:
                    haystack = haystack.replace(term, ' ')

                found = False
                for kw, needle in search_terms:
                    if needle in haystack:
                        frequency[kw] += 1
                        if verbose:
                            print(kw)
                        found = True

                if found:
                    if verbose:
                        print(text)
                    outputf.write(line)
        finally:
            # Close both files even if the scan is interrupted; closing also
            # flushes the output, so no per-line flush is needed.
            gnipf.close()
            outputf.close()
        return frequency

    def report_freq(keywfreq):
        """Print a table of keyword:frequency pairs to standard output.

        The first line reports the sum of all frequencies in keywfreq; it is
        followed by one "keyword: count" row per entry, sorted by keyword so
        the output is the same on every run.

        Args:
            keywfreq: dict mapping keyword strings to integer counts.
        """
        print("Found {0} total matching tweets.".format(sum(keywfreq.values())))
        # Iterating the sorted keys works on both Python 2 and 3, unlike
        # dict.iteritems(), which only exists on Python 2.
        for kw in sorted(keywfreq):
            print("{0:16}: {1}".format(kw, keywfreq[kw]))


    if __name__ == '__main__':
        # Script entry point.  Three positional arguments are required:
        # the keyword file, the Gnip tweet file and the output file.
        if not len(sys.argv) >= 4:
            die()

        keyword_path, gnip_path, output_path = sys.argv[1:4]
        wanted = get_keywords(keyword_path)
        keywfreq = parse_tweets(wanted, gnip_path, output_path, verbose=False)
        report_freq(keywfreq)