Last active
November 28, 2023 03:14
-
-
Save ftfarias/4bae08b493bcbf287ac212e132ef5143 to your computer and use it in GitHub Desktop.
Bigram Detector
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
from collections import Counter | |
def calculate_mutual_information(words, bigrams):
    """Score every word and bigram by summed pointwise mutual information.

    A word and a bigram are considered associated when the word is one of
    the bigram's tokens. For each associated pair the score is::

        log2(joint / (p_word * p_bigram))

    where ``p_word = word_count / total_words``,
    ``p_bigram = bigram_count / total_bigrams`` and
    ``joint = bigram_count / total_words``.

    Args:
        words: Iterable of individual (hashable) word tokens.
        bigrams: Iterable of bigrams. Each bigram is either a
            whitespace-separated string such as ``"Darth Vader"`` or a
            hashable sequence of tokens such as ``("Darth", "Vader")``.

    Returns:
        A dict holding one entry per distinct word (inserted first) and one
        entry per distinct bigram. Each value is the float sum of the pair
        scores over all associated pairs; entries without any association
        score ``0.0``. If a bigram key equals a word key, the bigram score
        replaces the word score.
    """
    # Count the frequency of individual words and bigrams.
    word_counts = Counter(words)
    bigram_counts = Counter(bigrams)

    total_words = sum(word_counts.values())
    total_bigrams = sum(bigram_counts.values())

    # Tokenise each distinct bigram once. Membership is tested against whole
    # tokens: a plain `word in "Darth Vader"` is a substring test and would
    # wrongly associate words such as "a" or "art" with the bigram.
    bigram_tokens = {
        bigram: frozenset(bigram.split() if isinstance(bigram, str) else bigram)
        for bigram in bigram_counts
    }

    def pair_score(word_count, bigram_count):
        """Return the mutual-information term for one (word, bigram) pair."""
        word_probability = word_count / total_words
        bigram_probability = bigram_count / total_bigrams
        joint_probability = bigram_count / total_words
        return math.log2(
            joint_probability / (word_probability * bigram_probability)
        )

    mutual_information = {}

    # Words: sum the pair score over every bigram that contains the word.
    # The explicit 0.0 start keeps unmatched entries as floats.
    for word, word_count in word_counts.items():
        mutual_information[word] = sum(
            (
                pair_score(word_count, bigram_count)
                for bigram, bigram_count in bigram_counts.items()
                if word in bigram_tokens[bigram]
            ),
            0.0,
        )

    # Bigrams: sum the pair score over every known word among its tokens.
    # Iterating word_counts (not the token set) keeps summation order
    # deterministic.
    for bigram, bigram_count in bigram_counts.items():
        tokens = bigram_tokens[bigram]
        mutual_information[bigram] = sum(
            (
                pair_score(word_count, bigram_count)
                for word, word_count in word_counts.items()
                if word in tokens
            ),
            0.0,
        )

    return mutual_information
""" | |
In this function, the words parameter represents a list of individual words, | |
and the bigrams parameter represents a list of word pairs (bigrams). | |
The function uses the Counter class from the collections module to count the | |
frequency of each word and bigram. | |
The mutual information is calculated for each word by iterating over the word | |
counts and bigram counts. For each word, the function calculates the word | |
probability and then iterates over the bigrams to check if the word is present in them. | |
If the word is present in a bigram, the mutual information is calculated using the | |
formula log2(joint_probability / (word_probability * bigram_probability)). | |
The function returns a dictionary whose keys are the individual words and the
bigrams, and whose values are their corresponding mutual information scores.
You can use this function as follows: | |
""" | |
# Demo corpus: six single-word tokens and two candidate bigrams.
words = [
    "Tatooine",
    "Palpatine",
    "Darth",
    "Vader",
    "Han",
    "Solo",
]
bigrams = [
    "Darth Vader",
    "Han Solo",
]

# Score every word and bigram, then print one line per entry. Every entry,
# bigrams included, is printed with the "Word:" label.
mutual_info_scores = calculate_mutual_information(words, bigrams)
for word, score in mutual_info_scores.items():
    print(f"Word: {word}, Mutual Information: {score}")
""" | |
Word: Tatooine, Mutual Information: 0.0 | |
Word: Palpatine, Mutual Information: 0.0 | |
Word: Darth, Mutual Information: 1.0 | |
Word: Vader, Mutual Information: 1.0 | |
Word: Han, Mutual Information: 1.0 | |
Word: Solo, Mutual Information: 1.0 | |
Word: Darth Vader, Mutual Information: 2.0
Word: Han Solo, Mutual Information: 2.0
In this example, the words "Darth," "Vader," "Han," and "Solo" | |
show mutual information scores of 1.0, indicating a strong | |
association with the bigrams. The words "Tatooine" and "Palpatine" | |
have a mutual information score of 0.0, indicating no | |
association with the given bigrams. | |
""" | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment