Skip to content

Instantly share code, notes, and snippets.

@mocobeta
Created March 21, 2015 06:12
Show Gist options
  • Save mocobeta/af6904ffec7f81d71629 to your computer and use it in GitHub Desktop.
Save mocobeta/af6904ffec7f81d71629 to your computer and use it in GitHub Desktop.
kuromoji with neologd
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
/**
 * Small demo that runs Lucene's Kuromoji {@link JapaneseAnalyzer} over the
 * sample sentences in {@link #contents} and prints, for every token, its
 * surface text, start/end offsets, reading and part-of-speech to stdout.
 */
public class HelloKuromoji {

    /** Sample sentences that are analyzed by {@link #displayTokenStream()}. */
    public static final String[] contents = {
        "10日放送の「中居正広のミになる図書館」(テレビ朝日系)で、SMAPの中居正広が、篠原信一の過去の勘違いを明かす一幕があった。"
    };

    /**
     * Analyzer configured with no user dictionary ({@code null}), NORMAL
     * segmentation mode, an empty stop-word set and an empty stop-tag set,
     * so no token is filtered out by stop words or part-of-speech tags.
     */
    private final Analyzer analyzer = new JapaneseAnalyzer(null,
            JapaneseTokenizer.Mode.NORMAL,
            CharArraySet.EMPTY_SET, new HashSet<String>());

    /**
     * Analyzes every entry of {@link #contents} and prints its tokens.
     *
     * @throws IOException if the underlying token stream fails
     */
    public void displayTokenStream() throws IOException {
        for (String content : contents) {
            System.out.println("\n" + content);
            System.out.println("====================================================================");
            // try-with-resources guarantees close() even when tokenization
            // throws; the analyzer reuses its stream components, so an
            // unclosed stream would break the next tokenStream() call.
            try (TokenStream stream = analyzer.tokenStream("", new StringReader(content))) {
                stream.reset(); // must call TokenStream#reset() before incrementToken()
                displayTokens(stream);
                stream.end();   // completes the consumer workflow before close()
            }
        }
    }

    /**
     * Consumes the given stream and prints one table row per token.
     *
     * @param stream a token stream that has already been reset; it must
     *               expose term, reading, offset and part-of-speech attributes
     * @throws IOException if advancing the stream fails
     */
    private void displayTokens(TokenStream stream) throws IOException {
        System.out.println("|テキスト\t|開始\t|終了\t|読み\t\t|品詞");
        System.out.println("--------------------------------------------------------------------");

        // Attribute instances are shared for the lifetime of the stream, so
        // look them up once instead of on every token.
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        ReadingAttribute readingAtt = stream.getAttribute(ReadingAttribute.class);
        OffsetAttribute offsetAtt = stream.getAttribute(OffsetAttribute.class);
        PartOfSpeechAttribute posAtt = stream.getAttribute(PartOfSpeechAttribute.class);

        while (stream.incrementToken()) {
            String text = termAtt.toString();
            String yomi = readingAtt.getReading();
            int sOffset = offsetAtt.startOffset();
            int eOffset = offsetAtt.endOffset();
            String pos = posAtt.getPartOfSpeech();
            System.out.println(
                    "|" + text + "\t\t" +
                    "|" + sOffset + "\t" +
                    "|" + eOffset + "\t" +
                    "|" + yomi + "\t\t" +
                    "|" + pos + "\t"
            );
        }
    }

    /**
     * Entry point: prints the token table for the built-in sample sentences.
     *
     * @param args ignored
     * @throws IOException if analysis fails
     */
    public static void main(String[] args) throws IOException {
        HelloKuromoji test = new HelloKuromoji();
        test.displayTokenStream();
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment