# -*- coding: utf-8 -*-
from bertopic import BERTopic
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import pandas as pd
import pickle
import csv

# German stopwords from nltk, downloaded on 2024-02-23
with open("Data/stopwords_german.txt", "r") as f:
    german_stop_words = [l.strip() for l in f.readlines()]
print("Reading data ...", end = "\r") | |
df = pd.read_csv("Data/all_posts.csv") | |
docs = df["text"].tolist() | |
print(f"Read {df.shape[0]} lines") | |
print("Setting up models") | |
# this needs the protobuf package | |
sentence_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2", cache_folder = "Data", device = "cuda") | |
hdbscan_model = HDBSCAN( | |
min_cluster_size=round(df.shape[0]/1_000), | |
min_samples=20, | |
metric="euclidean", | |
cluster_selection_method="eom", | |
prediction_data=True | |
) | |
# Setup dimension reduction; n_neighbors must be an integer
umap_model = UMAP(
    n_neighbors=round(df.shape[0] / 100),
    n_components=5,
    min_dist=0.0,
    metric="cosine"
)
# Create the CountVectorizer model
vectorizer_model = CountVectorizer(
    stop_words=german_stop_words + ["http", "https", "html", "co"],
    lowercase=True
)
# Create the BERTopic model
topic_model = BERTopic(
    embedding_model=sentence_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    calculate_probabilities=True,
    top_n_words=7,
    verbose=True
)
print("Embedding documents ...", end = "\r") | |
try: | |
with open("Output/all_embedded_docs.pkl", "rb") as f: | |
docs_embeddings = pickle.load(f) | |
print("Embeddings read from disk") | |
except: | |
docs_embeddings = sentence_model.encode(docs, show_progress_bar=True) | |
with open("Output/all_embedded_docs.pkl", "wb") as f: | |
pickle.dump(docs_embeddings, f) | |
print("Fitting topic model ...", end = "\r") | |
topics, probabilities = topic_model.fit_transform(docs, embeddings=docs_embeddings) | |
print(f"{len(set(topics))-1} topics found") | |
print("Writing topic data to disk") | |
with open("Output/all_topics.pkl", "wb") as f: | |
pickle.dump((topics, probabilities), f) | |
print("Writing dtm to disk") | |
topic_term_matrix = topic_model.c_tf_idf_ | |
with open("Output/dtm.pkl") as f: | |
pickle.dump((topic_term_matrix)) | |
with open("Output/all_topics.csv", "w") as f: | |
writer = csv.writer(f, dialect="unix") | |
writer.writerow(["uri", "topic"] + [f"topic_{t}" for t in set(topics) if t != -1]) | |
for i in range(len(docs)): | |
writer.writerow([df.loc[i, "id"], topics[i]] + [p for p in probabilities[i]]) | |
with open("Output/all_topic_labels.csv", "w") as f: | |
f.write("topic, label") | |
topic_labels = topic_model.generate_topic_labels(nr_words=7, separator=", ") | |
for l in topic_labels: | |
f.write(f"{l}\n") |