Created
August 11, 2018 13:43
-
-
Save gamalan/d50a5fe59a1b79e662ef7d85c72a4586 to your computer and use it in GitHub Desktop.
Multi-Language Stemmer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
try: | |
from nltk import wordpunct_tokenize | |
from nltk.corpus import stopwords | |
except ImportError: | |
print('[!] You need to install nltk (http://nltk.org/index.html)') | |
class LanguageDetection(object):
    """Guess the language of a text by counting NLTK stopword overlaps."""

    def __init__(self):
        try:
            # Make sure the stopwords corpus exists; download it on first use.
            stopwords.fileids()
        except (NameError, LookupError):
            # NameError: the top-level nltk import failed, so `stopwords` is
            #   undefined.
            # LookupError: nltk is installed but the corpora have not been
            #   downloaded yet (the original code only caught NameError, so
            #   this common case crashed instead of auto-downloading).
            import nltk
            nltk.download('stopwords')
            nltk.download('punkt')

    def __calculate_languages_ratios(self, text):
        """
        Calculate the probability of the given text being written in several
        languages and return a dictionary that looks like
        {'french': 2, 'spanish': 4, 'english': 0}.

        @param text: Text whose language is to be detected
        @type text: str
        @return: Dictionary mapping each language to the number of unique
            stopwords of that language seen in the analyzed text
        @rtype: dict
        """
        languages_ratios = {}
        # wordpunct_tokenize() splits all punctuation into separate tokens:
        # >>> wordpunct_tokenize("That's thirty minutes away. I'll be there in ten.")
        # ['That', "'", 's', 'thirty', 'minutes', 'away', '.', 'I', "'", 'll',
        #  'be', 'there', 'in', 'ten', '.']
        tokens = wordpunct_tokenize(text)
        # Build the lowercased word set once; the original rebuilt it inside
        # the per-language loop on every iteration.
        words_set = {word.lower() for word in tokens}
        # Score each language bundled with NLTK by the number of its unique
        # stopwords appearing in the analyzed text.
        for language in stopwords.fileids():
            stopwords_set = set(stopwords.words(language))
            common_elements = words_set.intersection(stopwords_set)
            languages_ratios[language] = len(common_elements)  # language "score"
        return languages_ratios

    # ----------------------------------------------------------------------
    def detect_language(self, text):
        """
        Calculate the probability of the given text being written in several
        languages and return the highest scored one.

        Uses a stopwords-based approach, counting how many unique stopwords
        are seen in the analyzed text.

        @param text: Text whose language is to be detected
        @type text: str
        @return: Most likely language guessed
        @rtype: str
        """
        ratios = self.__calculate_languages_ratios(text)
        # Pick the language whose stopword-overlap score is largest.
        most_rated_language = max(ratios, key=ratios.get)
        return most_rated_language
if __name__ == '__main__':
    # Demo: an Indonesian marketing snippet; expected output is 'indonesian'.
    sample = '''
Dapatkan lebih banyak pelanggan potensial yang siap membeli produk Anda kapan saja.
Buat akun Anda sekarang untuk mulai berkomunikasi secara personal dengan pelanggan Anda
'''
    detector = LanguageDetection()
    print(detector.detect_language(sample))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.feature_extraction.text import CountVectorizer | |
from nltk.stem import SnowballStemmer | |
from Sastrawi.Stemmer import StemmerFactory | |
import numpy as np | |
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer that stems every token, auto-detecting the language.

    Indonesian text is stemmed with Sastrawi; any other detected language
    uses NLTK's SnowballStemmer, falling back to English when Snowball has
    no stemmer for that language.
    """

    def get_stemmer(self, doc):
        """Return a stemmer appropriate for the detected language of *doc*.

        @param doc: Raw document text used for language detection.
        @return: An object exposing ``stem(word)``.
        """
        from .language_detection import LanguageDetection
        lang_detect = LanguageDetection()
        lang = lang_detect.detect_language(doc)
        try:
            if lang == 'indonesian':
                # Snowball has no Indonesian stemmer; use Sastrawi instead.
                factory = StemmerFactory()
                return factory.create_stemmer()
            else:
                return SnowballStemmer(lang)
        except Exception:
            # Detected language unsupported by Snowball — fall back to English.
            return SnowballStemmer('english')

    def build_analyzer(self):
        """Wrap the parent analyzer so every token is stemmed."""
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemmed_analyzer(doc):
            # PERF FIX: the original called self.get_stemmer(doc) inside the
            # comprehension, re-running language detection and constructing a
            # brand-new stemmer for EVERY token. Build it once per document;
            # the output is identical.
            stemmer = self.get_stemmer(doc)
            return [stemmer.stem(w) for w in analyzer(doc)]

        return stemmed_analyzer
class SelectStemmedCountVectorizer(CountVectorizer):
    """CountVectorizer that stems tokens with an explicitly chosen language.

    Unlike StemmedCountVectorizer, the language is fixed at construction via
    ``stemmer_language`` instead of being detected per document.
    """

    def __init__(self, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None,
                 lowercase=True, preprocessor=None, tokenizer=None,
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), analyzer='word',
                 max_df=1.0, min_df=1, max_features=None,
                 vocabulary=None, binary=False, dtype=np.int64,
                 stemmer_language='english'):
        # BUG FIX: the original used super(SelectStemmedCountVectorizer)
        # WITHOUT the second argument — an unbound super — so
        # CountVectorizer.__init__ was never actually called and
        # instantiation failed. The two-argument form binds to `self`.
        super(SelectStemmedCountVectorizer, self).__init__(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, analyzer=analyzer, max_df=max_df,
            min_df=min_df, max_features=max_features, vocabulary=vocabulary,
            binary=binary, dtype=dtype)
        self.stemmer_language = stemmer_language

    def get_stemmer(self):
        """Return the stemmer for ``self.stemmer_language``.

        Indonesian maps to Sastrawi; other languages use SnowballStemmer,
        with an English fallback when the language is unsupported.
        """
        try:
            if self.stemmer_language == 'indonesian':
                factory = StemmerFactory()
                return factory.create_stemmer()
            else:
                return SnowballStemmer(self.stemmer_language)
        except Exception:
            return SnowballStemmer('english')

    def build_analyzer(self):
        """Wrap the parent analyzer so every token is stemmed."""
        analyzer = super(SelectStemmedCountVectorizer, self).build_analyzer()

        def stemmed_analyzer(doc):
            # PERF: one stemmer per document instead of one per token
            # (the original constructed a new stemmer for every word).
            stemmer = self.get_stemmer()
            return [stemmer.stem(w) for w in analyzer(doc)]

        return stemmed_analyzer
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment