/**
 * Splits text into display-width-limited tokens, line by line.
 *
 * <p>Width is counted in columns: an ASCII character occupies one column and
 * every other character occupies two. The Japanese full stop and comma
 * ("kuten" and "touten") are allowed to hang: when one of them directly
 * follows the last character of a token, it is appended to that token even
 * though the token then exceeds the configured width.
 *
 * <p>Typical use: call {@link #setSourceString(String)} once, then call
 * {@link #popNextToken()} while {@link #hasNextToken()} returns true.
 */
class JapaneseHyphenationTokenizer {
    protected String mSourceString = "";
    protected int mTokenLength = 1;
    protected int mCurrentPositionInLine = 0;
    protected int mCurrentLine = 0;
    protected String[] mLines = new String[0];

    final int kKutenCodePoint = "。".codePointAt(0);
    final int kToutenCodePoint = "、".codePointAt(0);

    /**
     * @param tokenLength maximum width, in columns, of one token (hanging
     *                    punctuation excluded)
     */
    public JapaneseHyphenationTokenizer(int tokenLength) {
        super();
        this.mTokenLength = tokenLength;
    }

    /**
     * Measures the width of {@code target[begin, end)} in columns.
     *
     * <p>Each UTF-16 unit below U+0080 counts as 1. Each surrogate unit also
     * counts as 1, so a complete surrogate pair totals 2. Every other unit
     * counts as 2.
     *
     * @param target string to measure
     * @param begin  index of the first UTF-16 unit to measure (inclusive)
     * @param end    index after the last UTF-16 unit to measure (exclusive)
     * @return the total width; 0 when {@code begin >= end}
     * @throws IndexOutOfBoundsException if a measured index lies outside
     *                                   {@code target}
     */
    public int widthOfStringBetweenIndeces(String target, int begin, int end) {
        int res = 0;
        for (int i = begin; i < end; ++i) {
            res += widthOfChar(target.charAt(i));
        }
        return res;
    }

    /** Width of a single UTF-16 unit; see {@link #widthOfStringBetweenIndeces}. */
    private static int widthOfChar(char c) {
        if (c < 0x80 || Character.isSurrogate(c)) {
            return 1;
        }
        return 2;
    }

    /**
     * @param string string to measure
     * @return the width of the whole string in columns
     */
    public int widthOfString(String string) {
        return widthOfStringBetweenIndeces(string, 0, string.length());
    }

    /**
     * Sets the text to be tokenized and rewinds the tokenizer to its start.
     *
     * <p>The text is split into lines on LF or CRLF. Trailing empty lines are
     * discarded by the split.
     *
     * @param source text to tokenize; must not be null
     * @throws NullPointerException if {@code source} is null
     */
    public void setSourceString(String source) {
        if (source == null) {
            throw new NullPointerException("source");
        }
        this.mSourceString = source;
        mLines = mSourceString.split("\\r?\\n");
        // Rewind so that a reused tokenizer does not start in the middle of
        // (or past the end of) the new text.
        mCurrentLine = 0;
        mCurrentPositionInLine = 0;
    }

    /**
     * Tells whether another token can be extracted.
     *
     * <p>A blank line still yields one (empty) token, so that the caller sees
     * the line break.
     *
     * @return true if {@link #popNextToken()} may be called
     */
    public boolean hasNextToken() {
        if (mLines.length <= mCurrentLine) {
            return false;
        }
        int lineLength = mLines[mCurrentLine].length();
        return mCurrentPositionInLine < lineLength || lineLength == 0;
    }

    /**
     * Finds the end index (exclusive) of the widest run starting at
     * {@code begin} whose width does not exceed {@code stringWidth}.
     *
     * <p>The scan advances by whole code points, so the returned index never
     * falls between the two halves of a surrogate pair that starts at or
     * after {@code begin}.
     *
     * @param target      string to scan
     * @param begin       index at which the run starts
     * @param stringWidth maximum width of the run, in columns
     * @return the end index; {@code target.length()} when the rest of the
     *         string fits; {@code begin} when not even the first code point
     *         fits
     */
    public int seekForEndPositionOfStringWithStringWidth(String target,
            int begin, int stringWidth) {
        int length = target.length();
        int currentWidth = 0;
        int i = begin;
        while (i < length) {
            int next = i + Character.charCount(target.codePointAt(i));
            currentWidth += widthOfStringBetweenIndeces(target, i, next);
            if (currentWidth == stringWidth) {
                // Exact fit: the run ends right after this code point.
                return next;
            }
            if (currentWidth > stringWidth) {
                // This code point overflows the limit: the run ends before it.
                return i;
            }
            i = next;
        }
        return length;
    }

    /**
     * Returns the widest substring starting at {@code begin} whose width does
     * not exceed {@code stringWidth}.
     *
     * @param target      string to cut
     * @param begin       index at which the substring starts
     * @param stringWidth maximum width of the substring, in columns
     * @return the substring; possibly empty
     */
    public String substringWithStringWidth(String target, int begin,
            int stringWidth) {
        return target.substring(
                begin,
                seekForEndPositionOfStringWithStringWidth(target, begin,
                        stringWidth));
    }

    /**
     * Extracts the next token and advances the tokenizer.
     *
     * <p>Every call on a non-empty remainder consumes at least one code
     * point, even when the configured token width is too small to hold it,
     * so a {@code while (hasNextToken())} loop always terminates.
     *
     * @return the next token; an empty string for a blank line
     * @throws ArrayIndexOutOfBoundsException if no token remains
     */
    public String popNextToken() {
        String line = mLines[mCurrentLine];
        int begin = mCurrentPositionInLine;

        // Width of the part of this line that has not been tokenized yet.
        int remainingWidth =
                widthOfStringBetweenIndeces(line, begin, line.length());

        // The remainder fits in one token: emit it and go to the next line.
        if (remainingWidth < mTokenLength) {
            String res = line.substring(begin);
            moveToNextLine();
            return res;
        }

        int end = seekForEndPositionOfStringWithStringWidth(line, begin,
                mTokenLength);

        // The first code point alone is wider than the limit (or the limit is
        // not positive). Take it anyway; otherwise the position would never
        // advance.
        if (end <= begin && begin < line.length()) {
            end = begin + Character.charCount(line.codePointAt(begin));
        }

        // Let a directly following punctuation mark hang on this token.
        if (hasHangingCharacterNextToPositionInString(end - 1, line)) {
            end += 1;
        }

        String res = line.substring(begin, end);
        if (end == line.length()) {
            moveToNextLine();
        } else {
            mCurrentPositionInLine = end;
        }
        return res;
    }

    /** Advances to the beginning of the next line. */
    protected void moveToNextLine() {
        mCurrentLine += 1;
        mCurrentPositionInLine = 0;
    }

    /**
     * Tells whether the character right after {@code position} is a hanging
     * punctuation mark (kuten or touten).
     *
     * @param position index of the character preceding the one to test
     * @param str      string to inspect
     * @return true if {@code str} has a character at {@code position + 1}
     *         and it is one of the two punctuation marks
     */
    protected boolean hasHangingCharacterNextToPositionInString(
            int position, String str) {
        int next = position + 1;
        if (next < 0 || next >= str.length()) {
            return false;
        }
        int targetCodePoint = str.codePointAt(next);
        return targetCodePoint == kKutenCodePoint
                || targetCodePoint == kToutenCodePoint;
    }
}