Last active
January 4, 2016 17:09
-
-
Save gibrown/8652399 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Detect the language of a piece of UTF-8 text.
 *
 * The text is cut to 5000 characters and stripped of symbols, URLs, email
 * addresses and punctuation. A few scripts are then recognised locally by
 * Unicode range (see es_langdetect_by_chars()); any other text of at least
 * 300 bytes is POSTed to the Elasticsearch `_langdetect` endpoint.
 *
 * @param string $text Text to inspect.
 * @return string|false 'km', 'hy', 'am', 'dv' or 'my' for the locally detected
 *                      scripts, otherwise the `language` value of the first
 *                      result returned by `_langdetect` when its probability
 *                      is above 0.5. False when the text is not valid UTF-8,
 *                      has no letters, is too short, or detection fails.
 */
function es_api_detect_lang( $text ) {
	$lang = false;

	//if we can't tell the lang with 5000 characters we probably can't tell the language
	// (encoding is passed explicitly so the cut never depends on mb_internal_encoding)
	$text = mb_substr( $text, 0, 5000, 'UTF-8' );

	//every /u regex below returns NULL/false on malformed UTF-8, so bail out early
	if ( ! preg_match( '//u', $text ) ) {
		return false;
	}

	//replace non-breaking spaces so they don't match the \p{L} char class
	$text = preg_replace( '/[\x{00A0}\x{2007}\x{202F}]/u', ' ', $text );

	//replace unicode symbols: see: http://www.utf8-chartable.de/unicode-utf8-table.pl
	$text = preg_replace( '/[\x{2000}-\x{2BFF}]/u', ' ', $text ); //symbols
	$text = preg_replace( '/[\x{1f000}-\x{1ffff}]/u', ' ', $text ); //symbols, emoticons

	//try and get rid of URLs that the lang detect api will strip out anyways
	// avoid getting ES exceptions for empty posts
	// this regex is simplistic, but should get 99% of cases
	$text = preg_replace( '#(http|https)\:\/\/[a-z0-9\-_.]+\.[a-z]{2,}(/\S*)?#i', ' ', $text );

	//and remove email addresses too
	$text = preg_replace( '#([.0-9a-z_+-]+)@(([0-9a-z-_]+\.)+[0-9a-z]{2,})#i', ' ', $text );

	// /u is required: without it \p{L} is tested against single bytes, so the
	// lead bytes of multibyte non-letters (e.g. 0xC2) are mistaken for letters
	if ( ! preg_match( '/\p{L}/u', $text ) ) {
		return false; //no utf-8 letters, we can't detect anything
	}

	//remove punctuation - leading punctuation seems to cause lang detect to fail
	// so just strip it all
	// /u is required: without it bytes such as 0xBB inside multibyte characters
	// are treated as punctuation and the text gets corrupted
	$text = preg_replace( '/\p{P}/u', ' ', $text );

	//do our own detection of some languages that the langdetect plugin doesn't handle
	// language code => array( unicode range regex, minimum share of letters )
	// checked in this order; the first match wins
	$script_rules = array(
		'km' => array( '/[\x{1780}-\x{17FF}]/u', 0.25 ),
		//Capital O with a tilde occurs a lot in armenian text, so include that in the charset
		'hy' => array( '/[\x{0530}-\x{058F}\x{00D5}]/u', 0.15 ),
		//Amharic - assuming this is the best choice, there are multiple dialects
		// this appears to be the largest
		'am' => array( '/[\x{1200}-\x{137F}]/u', 0.25 ),
		'dv' => array( '/[\x{0780}-\x{07BF}]/u', 0.25 ),
		'my' => array( '/[\x{1000}-\x{109F}]/u', 0.25 ),
	);
	foreach ( $script_rules as $code => $rule ) {
		if ( es_langdetect_by_chars( $text, $rule[0], $rule[1] ) ) {
			return $code;
		}
	}

	//see if we have enough characters to do language detection.
	// Short text is hard to detect so we shouldn't try to get it right
	if ( strlen( $text ) < 300 ) {
		//We use strlen to count number of bytes rather than number of UTF-8 chars.
		// this is a hack to (mostly) adjust for the fact that a Chinese/Japanese word takes fewer characters
		// but actually takes a similar number of bytes.
		// English is average 5 chars/word (== 5 bytes), Chinese is 1.5 chars/word (==3*1.5==4.5 bytes)
		// So this cutoff is about 60 words in both English and Chinese
		// this should probably be made smarter at some point
		return false;
	}

	//else run es lang detect
	$es_client = new \Elastica\Client();
	$es_req = new \Elastica\Request( '_langdetect', 'POST', $text, array(), $es_client->getConnection() );
	$es_resp = $es_req->send();
	if ( $es_resp->isOk() ) {
		$data = $es_resp->getData();
		//only trust the top result, and only when the response actually has one
		if (
			isset( $data['languages'][0]['probability'], $data['languages'][0]['language'] )
			&& $data['languages'][0]['probability'] > 0.5
		) {
			$lang = $data['languages'][0]['language'];
		}
	}

	return $lang;
}
/**
 * Decide whether text belongs to a language purely from a character-class regex.
 *
 * Counts the matches of $unicode_regex in $text and compares that count with
 * the number of Unicode letters (\p{L}) in the text.
 *
 * @param string $text          UTF-8 text to inspect.
 * @param string $unicode_regex Regex (with delimiters and modifiers) matching a
 *                              single character of the target script,
 *                              e.g. '/[\x{1780}-\x{17FF}]/u'.
 * @param float  $percentage    Threshold as a fraction (0.25 means 25%); the
 *                              ratio of matches to letters must exceed it.
 * @return bool True when the ratio is above the threshold; false otherwise,
 *              including when nothing matches or the text has no letters.
 */
function es_langdetect_by_chars( $text, $unicode_regex, $percentage ) {
	$script_char_cnt = preg_match_all( $unicode_regex, $text );
	if ( ! $script_char_cnt ) {
		return false; //no match (0) or regex error (false)
	}

	// /u is required so \p{L} counts Unicode letters rather than individual bytes
	$letter_cnt = preg_match_all( '/\p{L}/u', $text );
	if ( ! $letter_cnt ) {
		return false; //no letters to compare against; also avoids dividing by zero
	}

	//if at least X% of letters are of this language symbols, assume that is the language
	// Choose X% below 50% because there are also spaces, dates, numbers and other extraneous symbols
	return ( $script_char_cnt / $letter_cnt ) > $percentage;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is a hacked up version of our code calling the ES langdetect plugin. I haven't actually run it, just pasted it together as an example.