Last active: November 28, 2019 21:43
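This gist indexes a document whose term frequencies are supplied explicitly in the field value, using Lucene's DelimitedTermFrequencyTokenFilter: each whitespace-separated token is written as term|freq, and the filter strips the delimiter and records freq as that term's frequency instead of counting occurrences.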
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.*;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Paths;

public class TestIndexer {

    public static void main(String[] args) throws IOException {
        Directory dir = FSDirectory.open(Paths.get("/tmp/test/"));
        IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(getAnalyzer()));

        // DelimitedTermFrequencyTokenFilter requires a field that indexes
        // frequencies but not positions, hence DOCS_AND_FREQS.
        FieldType fieldType = new FieldType();
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        fieldType.setTokenized(true);
        fieldType.setStored(true);
        fieldType.setOmitNorms(true);
        fieldType.setStoreTermVectorOffsets(false);
        fieldType.setStoreTermVectorPositions(false);
        fieldType.freeze();

        // Each token carries its pre-computed frequency after the '|' delimiter.
        Document document = new Document();
        document.add(new Field("text", "a|10 b|23 c|90", fieldType));
        indexWriter.addDocument(document);
        indexWriter.commit();

        // Open a near-real-time reader on top of the writer and search.
        IndexReader reader = DirectoryReader.open(indexWriter);
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 100);

        System.out.println(searcher.collectionStatistics("text"));
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document doc = searcher.doc(scoreDoc.doc);
            IndexableField f = doc.getField("text");
            System.out.println(f.stringValue());
            System.out.println("SumTotalTermFreq: " + reader.getSumTotalTermFreq("text"));
            System.out.println("SumDocFreq: " + reader.getSumDocFreq("text"));
        }

        reader.close();
        indexWriter.close();
    }

    private static Analyzer getAnalyzer() {
        return new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new WhitespaceTokenizer();
                // Split each "term|freq" token and record freq as the term frequency.
                TokenFilter tokenFilter = new DelimitedTermFrequencyTokenFilter(tokenizer);
                // No-op stop filter (empty stop set), kept only to show filter chaining.
                TokenFilter stopFilter = new StopFilter(tokenFilter, CharArraySet.EMPTY_SET);
                return new TokenStreamComponents(tokenizer, stopFilter);
            }
        };
    }
}
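Expected output (exact formatting of the collection statistics varies by Lucene version): the stored field round-trips as the original string a|10 b|23 c|90, SumTotalTermFreq should be 123 (10 + 23 + 90, the sum of the delimited frequencies rather than the token count), and SumDocFreq should be 3, one document-frequency entry for each of the three distinct terms.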