onegram play
from one_gram_reader import *
import matplotlib.pyplot as plt
import numpy as np
# Returns total occurrences of the given word by adding up the counts for each year as given by word_data.
def total_occurrences(word_data, word):
    total = 0
    counts = word_data.get(word, [])  # returns an empty list if the word is not in word_data
    # Go through the (year, count) pairs and add up the counts.
    for year, count in counts:
        total += count
    return total
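# Example (a minimal sketch with made-up numbers, assuming the word_data layout
# produced by read_entire_word_file below: word -> list of (year, count) tuples):
#   sample = {"quest": [(1900, 12), (1901, 30)]}
#   total_occurrences(sample, "quest")    # -> 42
#   total_occurrences(sample, "missing")  # -> 0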
# Returns a list of length 26
# corresponding to the relative frequency of each letter in the dataset given by word_data.
def count_letters(word_data):
    total_letters = 0
    letter_counts = {}
    alphabet = [chr(i) for i in range(ord('a'), ord('z') + 1)]  # list of letters in the alphabet, a to z
    letter_count_list = []
    # Go through each word in word_data and add the count of each letter to letter_counts.
    # Also count the total number of letters in total_letters.
    for word in word_data:
        occurrences = total_occurrences(word_data, word)
        total_letters += occurrences * len(word)
        # Go through the letters of each word and add them to letter_counts.
        for letter in word:
            letter_counts[letter] = letter_counts.get(letter, 0) + occurrences
    # Normalize letter_counts by total_letters.
    for letter in letter_counts:
        letter_counts[letter] /= float(total_letters)
    # Convert letter_counts into a list of counts in alphabetical order.
    for letter in alphabet:
        letter_count_list += [letter_counts.get(letter, 0)]
    return letter_count_list
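# Example (hypothetical two-word dataset; the returned frequencies are fractions
# of all letters, so they sum to 1 over the letters that appear):
#   sample = {"to": [(1900, 3)], "go": [(1900, 1)]}
#   count_letters(sample)   # 'g' -> 0.125, 'o' -> 0.5, 't' -> 0.375, all others 0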
# Plots frequencies of letters in English, calculated from word_data.
def bar_plot_of_letter_frequencies(word_data):
    letter_counts = count_letters(word_data)
    alphabet = [chr(i) for i in range(ord('A'), ord('Z') + 1)]
    x_pos = np.arange(len(alphabet))
    plt.bar(x_pos, letter_counts, 1, align='center')
    plt.xticks(x_pos, alphabet)
    plt.xlim([-.5, len(alphabet)])
    plt.ylabel('Frequency')
    plt.xlabel('Letter')
    plt.show()
# Creates a log-log plot of the total occurrences of each word vs the rank of that word,
# and annotates the occurrences of the words in words.
def plot_aggregate_counts(word_data, words):
    word_occurrences = []
    annotated_words = {}
    annotated_counts = []
    annotated_ranks = []
    # Remove any words from the annotation list that don't show up in the dataset.
    # (Build a new list rather than removing items from words while iterating over it.)
    words = [word for word in words if word in word_data]
    # Create a list of the total occurrences of each word.
    for word in word_data:
        word_occurrences += [total_occurrences(word_data, word)]
        if word in words:
            annotated_words[word] = total_occurrences(word_data, word)
    # Sort them by rank, and make a corresponding list of ranks.
    word_occurrences.sort(reverse=True)
    ranks = range(1, len(word_occurrences) + 1)
    # Label the annotated words on the plot, and build lists to plot them as stars.
    for word in words:
        annotated_counts += [annotated_words[word]]
        annotated_ranks += [word_occurrences.index(annotated_words[word]) + 1]
        plt.annotate(word, xy=(1.3 * word_occurrences.index(annotated_words[word]), 1.1 * annotated_words[word]))
    # Adjust the limits of the graph so that it is tight around the data, and plot the data as a line.
    plt.xlim([1, 1.1 * max(ranks)])
    plt.loglog(ranks, word_occurrences)
    # When plotting a large dataset the individual points are not useful, whereas with a small one they can be,
    # so plot those points only if there are fewer than 100 of them.
    if len(ranks) < 100:
        plt.loglog(ranks, word_occurrences, 'g.', ms=12)
    # Plot the annotated words as stars.
    plt.loglog(annotated_ranks, annotated_counts, 'r*', ms=12)
    plt.xlabel("Rank of Word")
    plt.ylabel("Total Occurrences")
    plt.show()
# Returns the n most common words in word_data as a list of (word, count) tuples,
# ordered from most to least common.
def most_common_words(word_data, n):
    top_words = []
    word_count_pairs = []
    for word in word_data:
        word_count_pairs += [(word, total_occurrences(word_data, word))]
    # Sort the (word, count) pairs by count, largest first. Sorting the pairs rather than
    # the bare counts avoids returning the same word twice when two counts are equal.
    ranked_pairs = sorted(word_count_pairs, key=lambda pair: pair[1], reverse=True)
    for i in range(min(n, len(ranked_pairs))):
        top_words += [ranked_pairs[i]]
    return top_words
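# Example (hypothetical data):
#   sample = {"a": [(1900, 5)], "of": [(1900, 3)], "the": [(1900, 9)]}
#   most_common_words(sample, 2)   # -> [('the', 9), ('a', 5)]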
# Returns the number of occurrences of the given word in the given year,
# or 0 if there is no entry for that word and year.
def get_occurrences_in_year(word_data, word, year):
    counts = word_data.get(word, [])
    for entry_year, count in counts:
        if entry_year == year:
            return count
    return 0
# Returns the average word length in the data for the year given.
def get_average_word_length(word_data, year):
    total_words = 0
    total_letters = 0
    # Go through each word in word_data, get its number of occurrences in the year,
    # and update total_words and total_letters accordingly.
    for word in word_data:
        occurrences = get_occurrences_in_year(word_data, word, year)
        total_words += occurrences
        total_letters += occurrences * len(word)
    # If there is no data for the year, total_words is 0; return 0 rather than dividing by it.
    if total_words == 0:
        return 0
    return float(total_letters) / total_words
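# Example (hypothetical data): in 1900, "to" appears 3 times and "quest" once,
# so the average length is (3*2 + 1*5) / 4 = 2.75 letters.
#   sample = {"to": [(1900, 3)], "quest": [(1900, 1)]}
#   get_average_word_length(sample, 1900)   # -> 2.75
#   get_average_word_length(sample, 1950)   # -> 0 (no data for that year)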
# Plots the average word length for each year in year_range for which there is data.
def plot_average_word_length(word_data, year_range):
    year_list = []
    length_list = []
    # Go through the years and get the average word length for each.
    for year in range(year_range[0], year_range[1] + 1):
        length = get_average_word_length(word_data, year)
        # No data is represented by length = 0, so don't include those years.
        if length:
            length_list += [length]
            year_list += [year]
    # Plot everything.
    plt.plot(year_list, length_list)
    plt.plot(year_list, length_list, 'r.')
    plt.xlabel('Year')
    plt.ylabel('Average Word Length')
    plt.show()
## some test code
word_data = read_entire_word_file("words_that_start_with_q.csv")
# word_data = read_entire_word_file("all_words.csv")
# word_data = read_entire_word_file("very_short.csv")
print word_data['question']
plot_average_word_length(word_data, [1600, 2008])
#
# word_data = read_entire_word_file("words_that_start_with_q.csv")
# #word_data = read_entire_word_file("all_words.csv")
# print len(word_data)
# plot_aggregate_counts(word_data, ["quest", "questions", "he"])
#
one_gram_reader.py (the reader module imported above):
import csv

# Given a word and a year range, returns two lists:
# years lists all the years the word was found within the year range,
# counts contains the number of occurrences of the word during each of those years,
# and indexes in each list correspond to each other.
# If you're not sure that your input is alphabetical, comment out the elif statement and the break.
# With an alphabetical word_file, as all of the Google-sourced ones are, the elif will save some time.
def read_word_file(word, year_range, word_file):
    word_found = False
    years = []
    counts = []
    # Using the "with ... as" construction automates the f.close() operation.
    with open(word_file, "rb") as f:
        csv_reader = csv.reader(f, delimiter='\t')
        for row in csv_reader:
            if row[0] == word and int(row[1]) >= year_range[0] and int(row[1]) <= year_range[1]:
                word_found = True
                years += [int(row[1])]
                counts += [int(row[2])]
            elif word_found:  # comment out this block if you're not sure that your input is alphabetical
                break
    return years, counts
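# Example (assumes word_file is tab-delimited with rows of the form
# word <tab> year <tab> count, as the reader above expects; output shown is illustrative):
#   years, counts = read_word_file("quest", [1900, 1910], "words_that_start_with_q.csv")
#   years[0], counts[0]   # e.g. (1900, <count for "quest" in 1900>)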
# Creates a dictionary indexed on years, where each value is the total
# number of words recorded from that year.
def read_total_counts(total_file):
    count_dict = {}
    with open(total_file, "rb") as f:
        csv_reader = csv.reader(f, delimiter=',')
        for row in csv_reader:
            count_dict[int(row[0])] = int(row[1])
    return count_dict
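# Example (assumes total_file is comma-delimited with rows of the form year,total_count;
# the filename here is hypothetical and only for illustration):
#   totals = read_total_counts("total_counts.csv")
#   totals.get(1900, 0)   # total number of words recorded for 1900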
# Reads the file specified in word_file and returns a dictionary of words mapped to
# lists of tuples, where each tuple is a year and count pair for that word.
def read_entire_word_file(word_file):
    word_data = {}
    with open(word_file, 'rb') as f:
        csv_reader = csv.reader(f, delimiter='\t')
        for row in csv_reader:
            word_data[row[0]] = word_data.get(row[0], []) + [(int(row[1]), int(row[2]))]
    return word_data
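# Example (same tab-delimited word/year/count layout as read_word_file above;
# the filename and key come from the test code in the main script):
#   word_data = read_entire_word_file("words_that_start_with_q.csv")
#   word_data["question"]   # -> list of (year, count) tuples for "question"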