Created
February 17, 2015 11:35
-
-
Save guenodz/d5add59b31114a3a3c66 to your computer and use it in GitHub Desktop.
a simple implementation of TF-IDF algorithm in Java.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.guendouz.textclustering.preprocessing; | |
import java.util.Arrays; | |
import java.util.List; | |
/**
 * A simple TF-IDF (term frequency - inverse document frequency) calculator.
 *
 * <p>Documents are represented as lists of already-tokenized words. Terms are
 * matched against words case-insensitively.
 *
 * @author Mohamed Guendouz
 */
public class TFIDFCalculator {

    /**
     * Computes the term frequency: the number of words in {@code doc} equal to
     * {@code term} (ignoring case), divided by the total number of words.
     *
     * @param doc  list of strings
     * @param term String represents a term
     * @return term frequency of term in document, or {@code 0.0} if the
     *         document is empty
     */
    public double tf(List<String> doc, String term) {
        // Guard against 0.0 / 0, which would yield NaN for an empty document.
        if (doc.isEmpty()) {
            return 0.0;
        }
        double occurrences = 0;
        for (String word : doc) {
            if (term.equalsIgnoreCase(word)) {
                occurrences++;
            }
        }
        return occurrences / doc.size();
    }

    /**
     * Computes the inverse document frequency: the natural logarithm of the
     * number of documents divided by the number of documents that contain
     * {@code term} (ignoring case).
     *
     * @param docs list of list of strings represents the dataset
     * @param term String represents a term
     * @return the inverse document frequency of term in documents, or
     *         {@code 0.0} if no document contains the term (this includes an
     *         empty dataset)
     */
    public double idf(List<List<String>> docs, String term) {
        double containingDocs = 0;
        for (List<String> doc : docs) {
            for (String word : doc) {
                if (term.equalsIgnoreCase(word)) {
                    containingDocs++;
                    // One hit is enough; each document is counted at most once.
                    break;
                }
            }
        }
        // Dividing by zero here would not throw (the operands are doubles) but
        // would produce Infinity or NaN, which then poisons tfIdf with NaN.
        if (containingDocs == 0) {
            return 0.0;
        }
        return Math.log(docs.size() / containingDocs);
    }

    /**
     * Computes the TF-IDF weight as {@code tf(doc, term) * idf(docs, term)}.
     *
     * @param doc  a text document
     * @param docs all documents
     * @param term term
     * @return the TF-IDF of term
     */
    public double tfIdf(List<String> doc, List<List<String>> docs, String term) {
        return tf(doc, term) * idf(docs, term);
    }

    /**
     * Small demonstration: prints the TF-IDF of "ipsum" for the first of three
     * sample documents.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> doc1 = Arrays.asList("Lorem", "ipsum", "dolor", "ipsum", "sit", "ipsum");
        List<String> doc2 = Arrays.asList("Vituperata", "incorrupte", "at", "ipsum", "pro", "quo");
        List<String> doc3 = Arrays.asList("Has", "persius", "disputationi", "id", "simul");
        List<List<String>> documents = Arrays.asList(doc1, doc2, doc3);

        TFIDFCalculator calculator = new TFIDFCalculator();
        double tfidf = calculator.tfIdf(doc1, documents, "ipsum");
        System.out.println("TF-IDF (ipsum) = " + tfidf);
    }
}
return Math.log(docs.size() / n);
cast it..
return Math.log((double) docs.size() / (double) n);
Everything all right
Thanks.
This is a great demo, thanks for putting this up
Thank you, just getting into text mining and this is very helpful.
Hi,
I'm currently looking into TF-IDF for the first time.
One detail question:
In line 39 you are using Math.log, which returns the natural logarithm (base e) (https://docs.oracle.com/javase/7/docs/api/java/lang/Math.html).
The wikipedia article (https://en.wikipedia.org/wiki/Tf%E2%80%93idf) states that base 10 logarithm should be used, so shouldn't this be changed to:
Math.log10(docs.size() / n)
Kind regards,
Michael
I think you will get an exception on the line `return Math.log(docs.size() / n);` in case n is 0.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
In the idf method you should handle the case where "term" doesn't exist in any document, because you can't divide by 0.