Last active
August 4, 2023 08:40
-
-
Save csiebler/fe1b62df47e10ccdcde94fcd475d5ef2 to your computer and use it in GitHub Desktop.
Quick sample notebooks for using Azure OpenAI for call transcription insights, and then subsequent topic clustering and naming
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"This notebook takes call transcripts and extracts insights from them. You will need to adapt the first few cells to load your data correctly and format to a string in the following format:\n", | |
"\n", | |
"```\n", | |
"agent: ...\\n\n", | |
"customer: ...\\n\n", | |
"and so on\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Use %pip (not !pip) so the packages are installed into the running kernel's environment\n", | |
"%pip install openai --upgrade\n", | |
"%pip install langchain --upgrade" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import pandas as pd\n", | |
"\n", | |
"# Load the raw call transcripts: semicolon-separated, first row is the header;\n", | |
"# malformed rows are skipped rather than raising an error (on_bad_lines='skip').\n", | |
"df = pd.read_csv('Transcriptions.csv', on_bad_lines='skip', header=0, sep=';')\n", | |
"\n", | |
"# Preview the first rows to sanity-check the parse\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"conversations = {}\n", | |
"\n", | |
"# Group the rows into one conversation per ConnectionID; each turn records\n", | |
"# the speaker (agent vs. customer) and the transcribed text.\n", | |
"# NOTE: the dict key 'cannel' (sic, typo for 'channel') is read downstream by\n", | |
"# convert_call_to_string, so it is kept as-is for compatibility.\n", | |
"# NOTE(review): 'Canale' is compared against the string '1' -- confirm the CSV\n", | |
"# column is parsed as str, not int, or every turn will be labeled 'customer'.\n", | |
"for index, row in df.iterrows():\n", | |
"    conversation_id = row['ConnectionID']\n", | |
"    if conversation_id not in conversations:\n", | |
"        conversations[conversation_id] = []\n", | |
"    conversations[conversation_id].append({\n", | |
"        'cannel': \"agent\" if row['Canale'] == '1' else \"customer\",\n", | |
"        'text': row['Transcription']\n", | |
"    })\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# How many distinct conversations were loaded\n", | |
"n_conversations = len(conversations)\n", | |
"print(n_conversations)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def convert_call_to_string(id):\n", | |
"    \"\"\"Render one conversation as 'speaker: text' lines, one turn per line.\"\"\"\n", | |
"    return \"\".join(f\"{turn['cannel']}: {turn['text']}\\n\" for turn in conversations[id])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import openai\n", | |
"from dotenv import load_dotenv\n", | |
"from langchain.chat_models import AzureChatOpenAI\n", | |
"from langchain import LLMChain\n", | |
"from langchain.prompts.chat import (\n", | |
"    ChatPromptTemplate,\n", | |
"    SystemMessagePromptTemplate,\n", | |
"    HumanMessagePromptTemplate,\n", | |
")\n", | |
"import json\n", | |
"import tiktoken\n", | |
"\n", | |
"# Tokenizer used to estimate prompt length before sending to the model\n", | |
"encoding = tiktoken.get_encoding('cl100k_base')\n", | |
"\n", | |
"# Load environment variables (set OPENAI_API_KEY and OPENAI_API_BASE in .env)\n", | |
"load_dotenv()\n", | |
"\n", | |
"# Configure Azure OpenAI Service API\n", | |
"openai.api_type = \"azure\"\n", | |
"openai.api_version = \"2023-05-15\"\n", | |
"openai.api_base = os.getenv('OPENAI_API_BASE')\n", | |
"openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n", | |
"\n", | |
"# Init LLM (temperature=0 for deterministic extraction)\n", | |
"llm = AzureChatOpenAI(deployment_name=\"gpt-35-turbo\", temperature=0)\n", | |
"\n", | |
"system_message = \"\"\"You are an AI assistant that extracts information from contact center call transcriptions between an agent and a customer. You extract the following information:\n", | |
"\n", | |
"- reason for contact, using a max of 3 words (key: reason_for_contact)\n", | |
"- call summary, a brief description of what the customer and the agent said during the call (key: call_summary)\n", | |
"- customer sentiment, which can be one of the following: positive, mixed, negative, n/a (key: customer_sentiment)\n", | |
"- whether the customer's problem was solved during the call, can be yes, no, partially, n/a (key: problem_solved)\n", | |
"\n", | |
"When asked to analyze a transcript, you answer in JSON using the keys from above, nothing else. You answer brief. For example, a reason_for_contact could be 'service interruption'. Make sure customer_sentiment is only positive, mixed, negative, or n/a. Other values are not allowed. For problem_solved, only use yes, no, partially, n/a.\n", | |
"ONLY reply in JSON using the keys mentioned above, nothing else.\"\"\"\n", | |
"\n", | |
"# Build the chat prompt: fixed system instructions + the transcript as the human turn\n", | |
"system_message_prompt = SystemMessagePromptTemplate.from_template(system_message)\n", | |
"human_template=\"{text}\"\n", | |
"human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)\n", | |
"chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Analyze each call with the LLM and collect the extracted fields into df2.\n", | |
"columns = ['call_id', 'reason_for_contact', 'call_summary', 'customer_sentiment', 'problem_solved']\n", | |
"records = []\n", | |
"\n", | |
"# Chain is loop-invariant; build it once instead of per call\n", | |
"chain = LLMChain(llm=llm, prompt=chat_prompt)\n", | |
"\n", | |
"for id in conversations:\n", | |
"\n", | |
"    # Depending on the data, this will look different\n", | |
"    c = convert_call_to_string(id)\n", | |
"\n", | |
"    # TODO: Use bigger model in case call is longer than max length (e.g. gpt-35-turbo-16k or gpt-4-32k)\n", | |
"    tokens = encoding.encode(c)\n", | |
"    if len(tokens) > 3000:\n", | |
"        print(f\"Skipped call with id {id} because it is too long ({len(tokens)} tokens)\")\n", | |
"        continue\n", | |
"    try:\n", | |
"        result = chain.run(f\"Here is a new call transcript:\\n{c}\\nJSON:\")\n", | |
"        print(result)\n", | |
"        # parse results\n", | |
"        r = json.loads(result)\n", | |
"\n", | |
"        # Check if result was empty, mostly happened because call was empty or unusable\n", | |
"        if 'reason_for_contact' not in r or len(r['reason_for_contact']) < 1:\n", | |
"            continue\n", | |
"        # save call results (append to a list; building the DataFrame once at the\n", | |
"        # end avoids quadratic pd.concat growth inside the loop)\n", | |
"        records.append([id, r['reason_for_contact'], r['call_summary'], r['customer_sentiment'], r['problem_solved']])\n", | |
"    except Exception as e:\n", | |
"        # Log and skip instead of silently swallowing every error (a bare\n", | |
"        # 'except:' would also eat KeyboardInterrupt)\n", | |
"        print(f\"Skipped call {id}: {e}\")\n", | |
"        continue\n", | |
"\n", | |
"    # Checkpoint every 10 processed calls; stop early after ~10 (debug limit)\n", | |
"    if len(records) % 10 == 0:\n", | |
"        pd.DataFrame(records, columns=columns).to_csv('results_call_analytics.csv', index=False)\n", | |
"    if len(records) > 10:\n", | |
"        break\n", | |
"\n", | |
"df2 = pd.DataFrame(records, columns=columns)\n", | |
"df2.to_csv('results_call_analytics.csv', index=False)\n", | |
"df2.head()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Normalize the categorical answers to lowercase so grouping is consistent\n", | |
"\n", | |
"df2['customer_sentiment'] = df2['customer_sentiment'].str.lower()\n", | |
"df2['problem_solved'] = df2['problem_solved'].str.lower()\n", | |
"\n", | |
"df2.to_csv('results_call_analytics_cleaned.csv', index=False)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "openai-qna-env", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.9" | |
}, | |
"orig_nbformat": 4 | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment