Skip to content

Instantly share code, notes, and snippets.

@csiebler
Last active August 4, 2023 08:40
Show Gist options
  • Save csiebler/fe1b62df47e10ccdcde94fcd475d5ef2 to your computer and use it in GitHub Desktop.
Save csiebler/fe1b62df47e10ccdcde94fcd475d5ef2 to your computer and use it in GitHub Desktop.
Quick sample notebooks for using Azure OpenAI for call transcription insights, and then subsequent topic clustering and naming
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook takes the output from the 00 notebook and performs topic detection using embeddings for clustering and turbo for cluster topic naming."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import openai\n",
"from dotenv import load_dotenv\n",
"from langchain.chat_models import AzureChatOpenAI\n",
"from langchain import LLMChain\n",
"from langchain.prompts.chat import (\n",
"    ChatPromptTemplate,\n",
"    SystemMessagePromptTemplate,\n",
"    HumanMessagePromptTemplate,\n",
")\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"import pandas as pd\n",
"import tiktoken\n",
"\n",
"# Tokenizer for the gpt-35-turbo / text-embedding-ada-002 model family\n",
"encoding = tiktoken.get_encoding('cl100k_base')\n",
"\n",
"# Load environment variables (set OPENAI_API_KEY and OPENAI_API_BASE in .env)\n",
"load_dotenv()\n",
"\n",
"# Configure Azure OpenAI Service API\n",
"openai.api_type = \"azure\"\n",
"openai.api_version = \"2023-05-15\"\n",
"openai.api_base = os.getenv('OPENAI_API_BASE')\n",
"openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
"\n",
"# Embeddings model used to vectorize each call reason\n",
"# (chunk_size=1 sends one text per request, as required by some Azure deployments)\n",
"embeddings = OpenAIEmbeddings(deployment_id=\"text-embedding-ada-002\", chunk_size=1)\n",
"\n",
"# Chat model used to name each cluster; temperature=0 for deterministic output\n",
"llm = AzureChatOpenAI(deployment_name=\"gpt-35-turbo\", temperature=0)\n",
"\n",
"system_message = \"\"\"You are an AI assistant that extracts a common topic from a variety of similar topics.\n",
"You receive a list of topics and then return a common topic. When you reply, you only reply with the common topic, nothing else.\n",
"\"\"\"\n",
"\n",
"system_message_prompt = SystemMessagePromptTemplate.from_template(system_message)\n",
"human_template = \"{text}\"\n",
"human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)\n",
"chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load call_reasons.csv via pandas and load in headers\n",
"# (input file is produced by the preceding 00 notebook; NOTE(review): on_bad_lines='skip'\n",
"# silently drops malformed rows -- confirm this loss is acceptable)\n",
"df = pd.read_csv('results_call_analytics_cleaned.csv', on_bad_lines='skip', header=0, sep=',')\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_embedding(x):\n",
" \"\"\"Return the embedding vector for a single text string via the ada-002 deployment.\"\"\"\n",
" # NOTE(review): prints every input text -- floods output for large datasets; consider tqdm\n",
" print(f\"embedding: {x}\")\n",
" return embeddings.embed_query(x)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# iterate through all calls and embed each reason_for_contact (one API call per row -- slow)\n",
"df = df.assign(reason_embedding=df['reason_for_contact'].apply(lambda x: get_embedding(x)))\n",
"# NOTE(review): this df.head() is mid-cell, so it displays nothing\n",
"df.head()\n",
"\n",
"# pickle the dataframe so the expensive embedding step can be skipped on re-runs\n",
"df.to_pickle(\"call_reasons_embeddings.pkl\")\n",
"\n",
"# NOTE(review): re-reading the file just written is redundant; kept for parity with the next cell\n",
"df = pd.read_pickle(\"call_reasons_embeddings.pkl\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load the dataframe\n",
"# (entry point for resuming a session without re-running the embedding cell above)\n",
"\n",
"df = pd.read_pickle(\"call_reasons_embeddings.pkl\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# train k-means on df embeddings\n",
"# NOTE(review): import mid-notebook; conventionally belongs in the top imports cell\n",
"from sklearn.cluster import KMeans\n",
"\n",
"# number of topic clusters to extract -- tune for your dataset\n",
"n_clusters = 5\n",
"# random_state fixed for reproducible cluster assignments\n",
"kmeans = KMeans(n_clusters=n_clusters, init=\"k-means++\", random_state=42)\n",
"kmeans.fit(df['reason_embedding'].to_list())\n",
"df = df.assign(reason_cluster=kmeans.labels_)\n",
"df.head()\n",
"\n",
"# group df by reason_cluster and print all reason_for_contact for each group\n",
"for reason_cluster, group in df.groupby(\"reason_cluster\"):\n",
" print(f\"Reason cluster: {reason_cluster}\")\n",
" print(group[\"reason_for_contact\"].to_list())\n",
" print(\"\\n\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Name each cluster by showing the LLM a sample of its member topics.\n",
"clusters = {}\n",
"\n",
"# chain is loop-invariant -- build it once\n",
"chain = LLMChain(llm=llm, prompt=chat_prompt)\n",
"\n",
"for i in range(n_clusters):\n",
"    cluster_reasons = df[df['reason_cluster'] == i]['reason_for_contact']\n",
"    # sample at most 15 topics; .sample(15) would raise if a cluster has fewer rows\n",
"    sampled = cluster_reasons.sample(min(15, len(cluster_reasons)))\n",
"    reasons = \"\\n\".join(sampled.values.tolist())\n",
"    print(f\"Topics: {reasons}\")\n",
"    # bugfix: was '\\Common topic:' (invalid escape, literal backslash in the prompt)\n",
"    result = chain.run(f\"Here is list of topics:\\n{reasons}\\nCommon topic:\")\n",
"    print(result)\n",
"\n",
"    # trim result: drop any echoed 'Common topic:' prefix and periods\n",
"    result = result.replace(\"Common topic:\", \"\").replace(\".\", \"\").strip()\n",
"\n",
"    # then assign the result to the df for the corresponding cluster\n",
"    clusters[i] = result\n",
"\n",
"print(clusters)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# make new column standardized_topic in df: map each cluster id to its\n",
"# LLM-generated label (clusters dict covers every label 0..n_clusters-1)\n",
"df = df.assign(standardized_topic=df['reason_cluster'].map(clusters))\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# drop embedding column, we do not need to save this\n",
"df.drop(\"reason_embedding\", axis=1).to_csv('results_call_analytics_cleaned_with_standardized_call_topics.csv', index=False)\n"
]
},
],
"metadata": {
"kernelspec": {
"display_name": "openai-qna-env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment