@gamalan
Created August 11, 2018 13:43
Multi Language Stemmer

# language_detection.py
import sys

try:
    from nltk import wordpunct_tokenize
    from nltk.corpus import stopwords
except ImportError:
    print('[!] You need to install nltk (http://nltk.org/index.html)')
    sys.exit(1)


class LanguageDetection(object):
    def __init__(self):
        try:
            # Make sure the stopwords corpus exists; if not, download it first.
            stopwords.fileids()
        except LookupError:
            import nltk
            nltk.download('stopwords')
            nltk.download('punkt')

    def __calculate_languages_ratios(self, text):
        """
        Score the given text against every language for which NLTK ships a
        stopword list, and return a dictionary that looks like
        {'french': 2, 'spanish': 4, 'english': 0}.

        @param text: Text whose language is to be detected
        @type text: str
        @return: Dictionary mapping each language to the number of unique
            stopwords of that language seen in the analyzed text
        @rtype: dict
        """
        languages_ratios = {}

        # wordpunct_tokenize() splits all punctuation into separate tokens:
        # >>> wordpunct_tokenize("That's thirty minutes away. I'll be there in ten.")
        # ['That', "'", 's', 'thirty', 'minutes', 'away', '.', 'I', "'", 'll',
        #  'be', 'there', 'in', 'ten', '.']
        tokens = wordpunct_tokenize(text)
        words = [word.lower() for word in tokens]
        words_set = set(words)

        # For each language included in NLTK, count the unique stopwords that
        # appear in the analyzed text; that count is the language's "score".
        for language in stopwords.fileids():
            stopwords_set = set(stopwords.words(language))
            common_elements = words_set.intersection(stopwords_set)
            languages_ratios[language] = len(common_elements)

        return languages_ratios

    # ----------------------------------------------------------------------
    def detect_language(self, text):
        """
        Score the given text against several languages and return the
        highest-scoring one.

        This uses a stopword-based approach, counting how many unique
        stopwords of each language are seen in the analyzed text.

        @param text: Text whose language is to be detected
        @type text: str
        @return: The highest-scoring language
        @rtype: str
        """
        ratios = self.__calculate_languages_ratios(text)
        most_rated_language = max(ratios, key=ratios.get)
        return most_rated_language


if __name__ == '__main__':
    # Indonesian sample text ("Get more potential customers who are ready to
    # buy your product at any time. Create your account now to start
    # communicating personally with your customers.")
    text = '''
    Dapatkan lebih banyak pelanggan potensial yang siap membeli produk Anda kapan saja.
    Buat akun Anda sekarang untuk mulai berkomunikasi secara personal dengan pelanggan Anda
    '''
    language_detection = LanguageDetection()
    language = language_detection.detect_language(text)
    print(language)  # should print 'indonesian'
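
    # A quick extra check (not in the original gist): the same detector on an
    # English sentence should score English stopwords highest.
    english = language_detection.detect_language(
        "That's thirty minutes away. I'll be there in ten.")
    print(english)  # expected: 'english'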

# Second gist file: stemming CountVectorizer variants. It must live in the
# same package as language_detection.py for the relative import below to work.
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import SnowballStemmer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import numpy as np


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer that detects each document's language and stems it accordingly."""

    def get_stemmer(self, doc):
        from .language_detection import LanguageDetection
        lang_detect = LanguageDetection()
        lang = lang_detect.detect_language(doc)
        try:
            if lang == 'indonesian':
                # Sastrawi provides the Indonesian stemmer.
                factory = StemmerFactory()
                return factory.create_stemmer()
            else:
                return SnowballStemmer(lang)
        except Exception:
            # Snowball does not cover every language NLTK has stopwords for;
            # fall back to English.
            return SnowballStemmer('english')

    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemmed_analyzer(doc):
            # Detect the language once per document, not once per token.
            stemmer = self.get_stemmer(doc)
            return [stemmer.stem(w) for w in analyzer(doc)]

        return stemmed_analyzer


class SelectStemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose stemmer language is fixed up front instead of detected."""

    def __init__(self, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None,
                 lowercase=True, preprocessor=None, tokenizer=None,
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), analyzer='word',
                 max_df=1.0, min_df=1, max_features=None,
                 vocabulary=None, binary=False, dtype=np.int64,
                 stemmer_language='english'):
        super(SelectStemmedCountVectorizer, self).__init__(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, analyzer=analyzer, max_df=max_df,
            min_df=min_df, max_features=max_features, vocabulary=vocabulary,
            binary=binary, dtype=dtype)
        self.stemmer_language = stemmer_language

    def get_stemmer(self):
        try:
            if self.stemmer_language == 'indonesian':
                factory = StemmerFactory()
                return factory.create_stemmer()
            else:
                return SnowballStemmer(self.stemmer_language)
        except Exception:
            # Fall back to English for languages Snowball does not support.
            return SnowballStemmer('english')

    def build_analyzer(self):
        analyzer = super(SelectStemmedCountVectorizer, self).build_analyzer()

        def stemmed_analyzer(doc):
            stemmer = self.get_stemmer()
            return [stemmer.stem(w) for w in analyzer(doc)]

        return stemmed_analyzer
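

# A minimal usage sketch, not part of the original gist. It assumes nltk,
# scikit-learn, and PySastrawi are installed; the corpus below is illustrative.
if __name__ == '__main__':
    docs = [
        'Dapatkan lebih banyak pelanggan potensial',
        'Buat akun Anda sekarang',
    ]
    vectorizer = SelectStemmedCountVectorizer(stemmer_language='indonesian')
    matrix = vectorizer.fit_transform(docs)
    print(sorted(vectorizer.vocabulary_))  # stemmed vocabulary terms
    print(matrix.toarray())                # per-document term counts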