gibrown · January 4, 2016 17:09 · gibrown · Jan 27, 2014
diff --git a/gistfile1.php b/gistfile1.php
 /**
  * Detect the language of a piece of text.
  *
  * The text is cut to 5000 characters and stripped of symbols, URLs, email
  * addresses and punctuation. A handful of scripts are then recognised locally
  * from their Unicode ranges; everything else is posted to the `_langdetect`
  * endpoint through Elastica.
  *
  * @param string $text Text to analyse; expected to be UTF-8.
  * @return string|false 'km', 'hy', 'am', 'dv', 'my' or the language reported by
  *                      the `_langdetect` endpoint; false when the text is not
  *                      valid UTF-8, has no letters, is too short, or no language
  *                      is reported with a probability above 0.5.
  */
 function es_api_detect_lang( $text ) {
 	$lang = false;

 	//if we can't tell the lang with 5000 characters we probably can't tell the language
 	$text = mb_substr( $text, 0, 5000 );

 	//every pattern below runs in UTF-8 mode and the preg_* functions fail on
 	// malformed UTF-8, so bail out here instead of passing a null subject along
 	if ( 1 !== preg_match( '//u', $text ) ) {
 		return false;
 	}

 	//turn non-breaking spaces into plain spaces
 	$text = preg_replace( '/[\x{00A0}\x{2007}\x{202F}]/u', ' ', $text );

 	//replace unicode symbols: see: http://www.utf8-chartable.de/unicode-utf8-table.pl
 	$text = preg_replace( '/[\x{2000}-\x{2BFF}]/u', ' ', $text ); //symbols
 	$text = preg_replace( '/[\x{1f000}-\x{1ffff}]/u', ' ', $text ); //symbols, emoticons

 	//try and get rid of URLs that the lang detect api will strip out anyways
 	// avoid getting ES exceptions for empty posts
 	// this regex is simplistic, but should get 99% of cases
 	$text = preg_replace( '#(http|https)\:\/\/[a-z0-9\-_.]+\.[a-z]{2,}(/\S*)?#i', ' ', $text );
 	//and remove email addresses too
 	$text = preg_replace( '#([.0-9a-z_+-]+)@(([0-9a-z-_]+\.)+[0-9a-z]{2,})#i', ' ', $text );

 	//the u modifier matters: without it \p{L} is tested against single bytes,
 	// so the lead byte of almost any multi-byte character counts as a "letter"
 	if ( ! is_string( $text ) || ! preg_match( '/\p{L}/u', $text ) ) {
 		return false; //no utf-8 letters, we can't detect anything
 	}

 	//remove punctuation - leading punctuation seems to cause lang detect to fail
 	// so just strip it all
 	// (u modifier required: byte-wise matching would overwrite UTF-8 continuation
 	//  bytes such as 0xA1 or 0xBF and corrupt the text)
 	$text = preg_replace( '/\p{P}/u', ' ', $text );

 	//do our own detection of some languages that the langdetect plugin doesn't handle
 	if ( es_langdetect_by_chars( $text, '/[\x{1780}-\x{17FF}]/u', 0.25 ) ) {
 		return 'km';
 	}
 	//Capital O with a tilde occurs a lot in armenian text, so include that in the charset
 	if ( es_langdetect_by_chars( $text, '/[\x{0530}-\x{058F}\x{00D5}]/u', 0.15 ) ) {
 		return 'hy';
 	}
 	if ( es_langdetect_by_chars( $text, '/[\x{1200}-\x{137F}]/u', 0.25 ) ) {
 		//Amharic - assuming this is the best choice, there are multiple dialects
 		// this appears to be the largest
 		return 'am';
 	}
 	if ( es_langdetect_by_chars( $text, '/[\x{0780}-\x{07BF}]/u', 0.25 ) ) {
 		return 'dv';
 	}
 	if ( es_langdetect_by_chars( $text, '/[\x{1000}-\x{109F}]/u', 0.25 ) ) {
 		return 'my';
 	}

 	//see if we have enough characters to do language detection.
 	// Short text is hard to detect so we shouldn't try to get it right
 	if ( strlen( $text ) < 300 ) {
 		//We use strlen to count number of bytes rather than number of UTF-8 chars.
 		// this is a hack to (mostly) adjust for the fact that a Chinese/Japanese word takes fewer characters
 		// but actually takes a similar number of bytes.
 		// English is average 5 chars/word (== 5 bytes), Chinese is 1.5 chars/word (==3*1.5==4.5 bytes)
 		// So this cutoff is about 60 words in both English and Chinese
 		//  this should probably be made smarter at some point
 		return false;
 	}

 	//else run es lang detect
 	$es_client = new \Elastica\Client();
 	$es_req = new \Elastica\Request( '_langdetect', 'POST', $text, array(), $es_client->getConnection() );
 	$es_resp = $es_req->send();
 	if ( $es_resp->isOk() ) {
 		$data = $es_resp->getData();
 		//only trust the top candidate, and only when the response actually has one
 		if ( isset( $data['languages'][0]['probability'], $data['languages'][0]['language'] )
 			&& $data['languages'][0]['probability'] > 0.5 ) {
 			$lang = $data['languages'][0]['language'];
 		}
 	}

 	return $lang;
 }

 /**
  * Decide whether a text is written mostly in one script, using a regex only.
  *
  * Counts the characters matched by $unicode_regex and compares that count with
  * the number of letters and combining marks in the text.
  *
  * @param string $text          Text to inspect; expected to be UTF-8.
  * @param string $unicode_regex Delimited pattern matching ONE character of the
  *                              target script, presumably a unicode range with
  *                              the u modifier.
  * @param float  $percentage    Fraction (e.g. 0.25) the ratio must exceed.
  *                              Choose it below 0.5 so that mixed-in foreign
  *                              words do not prevent detection.
  * @return bool True when script characters / (letters + marks) > $percentage.
  */
 function es_langdetect_by_chars( $text, $unicode_regex, $percentage ) {
 	$script_char_cnt = preg_match_all( $unicode_regex, $text );
 	if ( ! $script_char_cnt ) {
 		//no character of that script, or the match failed (e.g. malformed UTF-8)
 		return false;
 	}

 	//Count in UTF-8 mode: without the u modifier \p{L} is tested byte by byte,
 	// which miscounts multi-byte text (Hebrew, for instance, counts as almost no letters).
 	// Marks are included because the script ranges contain combining vowel signs too.
 	$char_cnt = preg_match_all( '/[\p{L}\p{M}]/u', $text );
 	if ( ! $char_cnt ) {
 		//only digits/symbols of that script matched - nothing to divide by
 		return false;
 	}

 	return ( $script_char_cnt / $char_cnt ) > $percentage;
 }
	/**
	 * Detect the language of a piece of text.
	 *
	 * The text is cut to 5000 characters and stripped of symbols, URLs, email
	 * addresses and punctuation. A handful of scripts are then recognised locally
	 * from their Unicode ranges; everything else is posted to the `_langdetect`
	 * endpoint through Elastica.
	 *
	 * @param string $text Text to analyse; expected to be UTF-8.
	 * @return string|false 'km', 'hy', 'am', 'dv', 'my' or the language reported by
	 *                      the `_langdetect` endpoint; false when the text is not
	 *                      valid UTF-8, has no letters, is too short, or no language
	 *                      is reported with a probability above 0.5.
	 */
	function es_api_detect_lang( $text ) {
		$lang = false;

		//if we can't tell the lang with 5000 characters we probably can't tell the language
		$text = mb_substr( $text, 0, 5000 );

		//every pattern below runs in UTF-8 mode and the preg_* functions fail on
		// malformed UTF-8, so bail out here instead of passing a null subject along
		if ( 1 !== preg_match( '//u', $text ) ) {
			return false;
		}

		//turn non-breaking spaces into plain spaces
		$text = preg_replace( '/[\x{00A0}\x{2007}\x{202F}]/u', ' ', $text );

		//replace unicode symbols: see: http://www.utf8-chartable.de/unicode-utf8-table.pl
		$text = preg_replace( '/[\x{2000}-\x{2BFF}]/u', ' ', $text ); //symbols
		$text = preg_replace( '/[\x{1f000}-\x{1ffff}]/u', ' ', $text ); //symbols, emoticons

		//try and get rid of URLs that the lang detect api will strip out anyways
		// avoid getting ES exceptions for empty posts
		// this regex is simplistic, but should get 99% of cases
		// (the scheme group must be a real alternation: an escaped "\|" would only
		//  match a literal pipe character and never strip an actual URL)
		$text = preg_replace( '#(http|https)\:\/\/[a-z0-9\-_.]+\.[a-z]{2,}(/\S*)?#i', ' ', $text );
		//and remove email addresses too
		$text = preg_replace( '#([.0-9a-z_+-]+)@(([0-9a-z-_]+\.)+[0-9a-z]{2,})#i', ' ', $text );

		//the u modifier matters: without it \p{L} is tested against single bytes,
		// so the lead byte of almost any multi-byte character counts as a "letter"
		if ( ! is_string( $text ) || ! preg_match( '/\p{L}/u', $text ) ) {
			return false; //no utf-8 letters, we can't detect anything
		}

		//remove punctuation - leading punctuation seems to cause lang detect to fail
		// so just strip it all
		// (u modifier required: byte-wise matching would overwrite UTF-8 continuation
		//  bytes such as 0xA1 or 0xBF and corrupt the text)
		$text = preg_replace( '/\p{P}/u', ' ', $text );

		//do our own detection of some languages that the langdetect plugin doesn't handle
		if ( es_langdetect_by_chars( $text, '/[\x{1780}-\x{17FF}]/u', 0.25 ) ) {
			return 'km';
		}
		//Capital O with a tilde occurs a lot in armenian text, so include that in the charset
		if ( es_langdetect_by_chars( $text, '/[\x{0530}-\x{058F}\x{00D5}]/u', 0.15 ) ) {
			return 'hy';
		}
		if ( es_langdetect_by_chars( $text, '/[\x{1200}-\x{137F}]/u', 0.25 ) ) {
			//Amharic - assuming this is the best choice, there are multiple dialects
			// this appears to be the largest
			return 'am';
		}
		if ( es_langdetect_by_chars( $text, '/[\x{0780}-\x{07BF}]/u', 0.25 ) ) {
			return 'dv';
		}
		if ( es_langdetect_by_chars( $text, '/[\x{1000}-\x{109F}]/u', 0.25 ) ) {
			return 'my';
		}

		//see if we have enough characters to do language detection.
		// Short text is hard to detect so we shouldn't try to get it right
		if ( strlen( $text ) < 300 ) {
			//We use strlen to count number of bytes rather than number of UTF-8 chars.
			// this is a hack to (mostly) adjust for the fact that a Chinese/Japanese word takes fewer characters
			// but actually takes a similar number of bytes.
			// English is average 5 chars/word (== 5 bytes), Chinese is 1.5 chars/word (==3*1.5==4.5 bytes)
			// So this cutoff is about 60 words in both English and Chinese
			// this should probably be made smarter at some point
			return false;
		}

		//else run es lang detect
		$es_client = new \Elastica\Client();
		$es_req = new \Elastica\Request( '_langdetect', 'POST', $text, array(), $es_client->getConnection() );
		$es_resp = $es_req->send();
		if ( $es_resp->isOk() ) {
			$data = $es_resp->getData();
			//only trust the top candidate, and only when the response actually has one
			if ( isset( $data['languages'][0]['probability'], $data['languages'][0]['language'] )
				&& $data['languages'][0]['probability'] > 0.5 ) {
				$lang = $data['languages'][0]['language'];
			}
		}

		return $lang;
	}

	/**
	 * Decide whether a text is written mostly in one script, using a regex only.
	 *
	 * Counts the characters matched by $unicode_regex and compares that count with
	 * the number of letters and combining marks in the text.
	 *
	 * @param string $text          Text to inspect; expected to be UTF-8.
	 * @param string $unicode_regex Delimited pattern matching ONE character of the
	 *                              target script, presumably a unicode range with
	 *                              the u modifier.
	 * @param float  $percentage    Fraction (e.g. 0.25) the ratio must exceed.
	 *                              Choose it below 0.5 so that mixed-in foreign
	 *                              words do not prevent detection.
	 * @return bool True when script characters / (letters + marks) > $percentage.
	 */
	function es_langdetect_by_chars( $text, $unicode_regex, $percentage ) {
		$script_char_cnt = preg_match_all( $unicode_regex, $text );
		if ( ! $script_char_cnt ) {
			//no character of that script, or the match failed (e.g. malformed UTF-8)
			return false;
		}

		//Count in UTF-8 mode: without the u modifier \p{L} is tested byte by byte,
		// which miscounts multi-byte text (Hebrew, for instance, counts as almost no letters).
		// Marks are included because the script ranges contain combining vowel signs too.
		$char_cnt = preg_match_all( '/[\p{L}\p{M}]/u', $text );
		if ( ! $char_cnt ) {
			//only digits/symbols of that script matched - nothing to divide by
			return false;
		}

		return ( $script_char_cnt / $char_cnt ) > $percentage;
	}