Created
March 21, 2015 06:12
-
-
Save mocobeta/af6904ffec7f81d71629 to your computer and use it in GitHub Desktop.
kuromoji with neologd
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.lucene.analysis.Analyzer; | |
import org.apache.lucene.analysis.TokenStream; | |
import org.apache.lucene.analysis.ja.JapaneseAnalyzer; | |
import org.apache.lucene.analysis.ja.JapaneseTokenizer; | |
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute; | |
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute; | |
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | |
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; | |
import org.apache.lucene.analysis.util.CharArraySet; | |
import java.io.IOException; | |
import java.io.StringReader; | |
import java.util.Arrays; | |
import java.util.HashSet; | |
public class HelloKuromoji { | |
public static final String[] contents = { | |
"10日放送の「中居正広のミになる図書館」(テレビ朝日系)で、SMAPの中居正広が、篠原信一の過去の勘違いを明かす一幕があった。" | |
}; | |
private Analyzer analyzer = new JapaneseAnalyzer(null, | |
JapaneseTokenizer.Mode.NORMAL, | |
CharArraySet.EMPTY_SET, new HashSet()); | |
public void displayTokenStream() throws IOException { | |
for (String content : contents) { | |
System.out.println("\n" + content); | |
System.out.println("===================================================================="); | |
StringReader reader = new StringReader(content); | |
TokenStream stream = analyzer.tokenStream("", reader); | |
stream.reset(); // must call TokenStream#reset() | |
displayTokens(stream); | |
stream.close(); | |
} | |
} | |
private void displayTokens(TokenStream stream) throws IOException { | |
System.out.println("|テキスト\t|開始\t|終了\t|読み\t\t|品詞"); | |
System.out.println("--------------------------------------------------------------------"); | |
while(stream.incrementToken()) { | |
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class); | |
ReadingAttribute rAtt = stream.getAttribute(ReadingAttribute.class); | |
OffsetAttribute oAtt = stream.getAttribute(OffsetAttribute.class); | |
PartOfSpeechAttribute psAtt = stream.getAttribute(PartOfSpeechAttribute.class); | |
String text = termAtt.toString(); | |
String yomi = rAtt.getReading(); | |
int sOffset = oAtt.startOffset(); | |
int eOffset = oAtt.endOffset(); | |
String pos = psAtt.getPartOfSpeech(); | |
System.out.println( | |
"|" + text + "\t\t" + | |
"|" + Integer.toString(sOffset) + "\t" + | |
"|" + Integer.toString(eOffset) + "\t" + | |
"|" + yomi + "\t\t" + | |
"|" + pos + "\t" | |
); | |
} | |
} | |
public static void main(String[] args) throws IOException { | |
HelloKuromoji test = new HelloKuromoji(); | |
test.displayTokenStream(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment