Skip to content

Instantly share code, notes, and snippets.

@csiebler
Last active August 4, 2023 08:40
Show Gist options
  • Save csiebler/fe1b62df47e10ccdcde94fcd475d5ef2 to your computer and use it in GitHub Desktop.
Save csiebler/fe1b62df47e10ccdcde94fcd475d5ef2 to your computer and use it in GitHub Desktop.
Quick sample notebooks for using Azure OpenAI for call transcription insights, and then subsequent topic clustering and naming
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook takes the output from the 00 notebook and performs topic detection using embeddings for clustering and turbo for cluster topic naming."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import openai\n",
"from dotenv import load_dotenv\n",
"from langchain.chat_models import AzureChatOpenAI\n",
"from langchain import LLMChain\n",
"from langchain.prompts.chat import (\n",
"    ChatPromptTemplate,\n",
"    SystemMessagePromptTemplate,\n",
"    HumanMessagePromptTemplate,\n",
")\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"import pandas as pd\n",
"import tiktoken\n",
"\n",
"# Tokenizer for the gpt-35-turbo / text-embedding-ada-002 model family\n",
"encoding = tiktoken.get_encoding('cl100k_base')\n",
"\n",
"# Load environment variables (set OPENAI_API_KEY and OPENAI_API_BASE in .env)\n",
"load_dotenv()\n",
"\n",
"# Configure Azure OpenAI Service API\n",
"openai.api_type = \"azure\"\n",
"openai.api_version = \"2023-05-15\"\n",
"openai.api_base = os.getenv('OPENAI_API_BASE')\n",
"openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
"\n",
"# Embeddings model used to vectorize each call reason\n",
"# (chunk_size=1 sends one text per request, as required by some Azure deployments)\n",
"embeddings = OpenAIEmbeddings(deployment_id=\"text-embedding-ada-002\", chunk_size=1)\n",
"\n",
"# Chat model used to name each cluster; temperature=0 for deterministic output\n",
"llm = AzureChatOpenAI(deployment_name=\"gpt-35-turbo\", temperature=0)\n",
"\n",
"system_message = \"\"\"You are an AI assistant that extracts a common topic from a variety of similar topics.\n",
"You receive a list of topics and then return a common topic. When you reply, you only reply with the common topic, nothing else.\n",
"\"\"\"\n",
"\n",
"system_message_prompt = SystemMessagePromptTemplate.from_template(system_message)\n",
"human_template = \"{text}\"\n",
"human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)\n",
"chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load call_reasons.csv via pandas and load in headers\n",
"# (input file is produced by the preceding 00 notebook; NOTE(review): on_bad_lines='skip'\n",
"# silently drops malformed rows -- confirm this loss is acceptable)\n",
"df = pd.read_csv('results_call_analytics_cleaned.csv', on_bad_lines='skip', header=0, sep=',')\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_embedding(x):\n",
" \"\"\"Return the embedding vector for a single text string via the ada-002 deployment.\"\"\"\n",
" # NOTE(review): prints every input text -- floods output for large datasets; consider tqdm\n",
" print(f\"embedding: {x}\")\n",
" return embeddings.embed_query(x)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# iterate through all calls and embed each reason_for_contact (one API call per row -- slow)\n",
"df = df.assign(reason_embedding=df['reason_for_contact'].apply(lambda x: get_embedding(x)))\n",
"# NOTE(review): this df.head() is mid-cell, so it displays nothing\n",
"df.head()\n",
"\n",
"# pickle the dataframe so the expensive embedding step can be skipped on re-runs\n",
"df.to_pickle(\"call_reasons_embeddings.pkl\")\n",
"\n",
"# NOTE(review): re-reading the file just written is redundant; kept for parity with the next cell\n",
"df = pd.read_pickle(\"call_reasons_embeddings.pkl\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load the dataframe\n",
"# (entry point for resuming a session without re-running the embedding cell above)\n",
"\n",
"df = pd.read_pickle(\"call_reasons_embeddings.pkl\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# train k-means on df embeddings\n",
"# NOTE(review): import mid-notebook; conventionally belongs in the top imports cell\n",
"from sklearn.cluster import KMeans\n",
"\n",
"# number of topic clusters to extract -- tune for your dataset\n",
"n_clusters = 5\n",
"# random_state fixed for reproducible cluster assignments\n",
"kmeans = KMeans(n_clusters=n_clusters, init=\"k-means++\", random_state=42)\n",
"kmeans.fit(df['reason_embedding'].to_list())\n",
"df = df.assign(reason_cluster=kmeans.labels_)\n",
"df.head()\n",
"\n",
"# group df by reason_cluster and print all reason_for_contact for each group\n",
"for reason_cluster, group in df.groupby(\"reason_cluster\"):\n",
" print(f\"Reason cluster: {reason_cluster}\")\n",
" print(group[\"reason_for_contact\"].to_list())\n",
" print(\"\\n\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Name each cluster by showing the LLM a sample of its member topics.\n",
"clusters = {}\n",
"\n",
"# chain is loop-invariant -- build it once\n",
"chain = LLMChain(llm=llm, prompt=chat_prompt)\n",
"\n",
"for i in range(n_clusters):\n",
"    cluster_reasons = df[df['reason_cluster'] == i]['reason_for_contact']\n",
"    # sample at most 15 topics; .sample(15) would raise if a cluster has fewer rows\n",
"    sampled = cluster_reasons.sample(min(15, len(cluster_reasons)))\n",
"    reasons = \"\\n\".join(sampled.values.tolist())\n",
"    print(f\"Topics: {reasons}\")\n",
"    # bugfix: was '\\Common topic:' (invalid escape, literal backslash in the prompt)\n",
"    result = chain.run(f\"Here is list of topics:\\n{reasons}\\nCommon topic:\")\n",
"    print(result)\n",
"\n",
"    # trim result: drop any echoed 'Common topic:' prefix and periods\n",
"    result = result.replace(\"Common topic:\", \"\").replace(\".\", \"\").strip()\n",
"\n",
"    # then assign the result to the df for the corresponding cluster\n",
"    clusters[i] = result\n",
"\n",
"print(clusters)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# make new column standardized_topic in df: map each cluster id to its\n",
"# LLM-generated label (clusters dict covers every label 0..n_clusters-1)\n",
"df = df.assign(standardized_topic=df['reason_cluster'].map(clusters))\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# drop embedding column, we do not need to save this\n",
"df.drop(\"reason_embedding\", axis=1).to_csv('results_call_analytics_cleaned_with_standardized_call_topics.csv', index=False)\n"
]
},
],
"metadata": {
"kernelspec": {
"display_name": "openai-qna-env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment