Last active
August 4, 2023 08:40
-
-
Save csiebler/fe1b62df47e10ccdcde94fcd475d5ef2 to your computer and use it in GitHub Desktop.
Quick sample notebooks for using Azure OpenAI for call transcription insights, and then subsequent topic clustering and naming
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"This notebook takes call transcripts and extracts insights from them. You will need to adapt the first few cells to load your data correctly and format to a string in the following format:\n", | |
"\n", | |
"```\n", | |
"agent: ...\\n\n", | |
"customer: ...\\n\n", | |
"and so on\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Use %pip (not !pip) so the packages are installed into the running kernel's environment\n", | |
"%pip install openai --upgrade\n", | |
"%pip install langchain --upgrade" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import pandas as pd\n", | |
"\n", | |
"# Load the raw call transcripts: semicolon-separated, first row is the header;\n", | |
"# malformed rows are skipped rather than raising an error (on_bad_lines='skip').\n", | |
"df = pd.read_csv('Transcriptions.csv', on_bad_lines='skip', header=0, sep=';')\n", | |
"\n", | |
"# Preview the first rows to sanity-check the parse\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"conversations = {}\n", | |
"\n", | |
"# Group the rows into one conversation per ConnectionID; each turn records\n", | |
"# the speaker (agent vs. customer) and the transcribed text.\n", | |
"# NOTE: the dict key 'cannel' (sic, typo for 'channel') is read downstream by\n", | |
"# convert_call_to_string, so it is kept as-is for compatibility.\n", | |
"# NOTE(review): 'Canale' is compared against the string '1' -- confirm the CSV\n", | |
"# column is parsed as str, not int, or every turn will be labeled 'customer'.\n", | |
"for index, row in df.iterrows():\n", | |
"    conversation_id = row['ConnectionID']\n", | |
"    if conversation_id not in conversations:\n", | |
"        conversations[conversation_id] = []\n", | |
"    conversations[conversation_id].append({\n", | |
"        'cannel': \"agent\" if row['Canale'] == '1' else \"customer\",\n", | |
"        'text': row['Transcription']\n", | |
"    })\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# How many distinct conversations were loaded\n", | |
"n_conversations = len(conversations)\n", | |
"print(n_conversations)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def convert_call_to_string(id):\n", | |
"    \"\"\"Render one conversation as 'speaker: text' lines, one turn per line.\"\"\"\n", | |
"    return \"\".join(f\"{turn['cannel']}: {turn['text']}\\n\" for turn in conversations[id])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import openai\n", | |
"from dotenv import load_dotenv\n", | |
"from langchain.chat_models import AzureChatOpenAI\n", | |
"from langchain import LLMChain\n", | |
"from langchain.prompts.chat import (\n", | |
"    ChatPromptTemplate,\n", | |
"    SystemMessagePromptTemplate,\n", | |
"    HumanMessagePromptTemplate,\n", | |
")\n", | |
"import json\n", | |
"import tiktoken\n", | |
"\n", | |
"# Tokenizer used to estimate prompt length before sending to the model\n", | |
"encoding = tiktoken.get_encoding('cl100k_base')\n", | |
"\n", | |
"# Load environment variables (set OPENAI_API_KEY and OPENAI_API_BASE in .env)\n", | |
"load_dotenv()\n", | |
"\n", | |
"# Configure Azure OpenAI Service API\n", | |
"openai.api_type = \"azure\"\n", | |
"openai.api_version = \"2023-05-15\"\n", | |
"openai.api_base = os.getenv('OPENAI_API_BASE')\n", | |
"openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n", | |
"\n", | |
"# Init LLM (temperature=0 for deterministic extraction)\n", | |
"llm = AzureChatOpenAI(deployment_name=\"gpt-35-turbo\", temperature=0)\n", | |
"\n", | |
"system_message = \"\"\"You are an AI assistant that extracts information from contact center call transcriptions between an agent and a customer. You extract the following information:\n", | |
"\n", | |
"- reason for contact, using a max of 3 words (key: reason_for_contact)\n", | |
"- call summary, a brief description of what the customer and the agent said during the call (key: call_summary)\n", | |
"- customer sentiment, which can be one of the following: positive, mixed, negative, n/a (key: customer_sentiment)\n", | |
"- whether the customer's problem was solved during the call, can be yes, no, partially, n/a (key: problem_solved)\n", | |
"\n", | |
"When asked to analyze a transcript, you answer in JSON using the keys from above, nothing else. You answer brief. For example, a reason_for_contact could be 'service interruption'. Make sure customer_sentiment is only positive, mixed, negative, or n/a. Other values are not allowed. For problem_solved, only use yes, no, partially, n/a.\n", | |
"ONLY reply in JSON using the keys mentioned above, nothing else.\"\"\"\n", | |
"\n", | |
"# Build the chat prompt: fixed system instructions + the transcript as the human turn\n", | |
"system_message_prompt = SystemMessagePromptTemplate.from_template(system_message)\n", | |
"human_template=\"{text}\"\n", | |
"human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)\n", | |
"chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Analyze each call with the LLM and collect the extracted fields into df2.\n", | |
"columns = ['call_id', 'reason_for_contact', 'call_summary', 'customer_sentiment', 'problem_solved']\n", | |
"records = []\n", | |
"\n", | |
"# Chain is loop-invariant; build it once instead of per call\n", | |
"chain = LLMChain(llm=llm, prompt=chat_prompt)\n", | |
"\n", | |
"for id in conversations:\n", | |
"\n", | |
"    # Depending on the data, this will look different\n", | |
"    c = convert_call_to_string(id)\n", | |
"\n", | |
"    # TODO: Use bigger model in case call is longer than max length (e.g. gpt-35-turbo-16k or gpt-4-32k)\n", | |
"    tokens = encoding.encode(c)\n", | |
"    if len(tokens) > 3000:\n", | |
"        print(f\"Skipped call with id {id} because it is too long ({len(tokens)} tokens)\")\n", | |
"        continue\n", | |
"    try:\n", | |
"        result = chain.run(f\"Here is a new call transcript:\\n{c}\\nJSON:\")\n", | |
"        print(result)\n", | |
"        # parse results\n", | |
"        r = json.loads(result)\n", | |
"\n", | |
"        # Check if result was empty, mostly happened because call was empty or unusable\n", | |
"        if 'reason_for_contact' not in r or len(r['reason_for_contact']) < 1:\n", | |
"            continue\n", | |
"        # save call results (append to a list; building the DataFrame once at the\n", | |
"        # end avoids quadratic pd.concat growth inside the loop)\n", | |
"        records.append([id, r['reason_for_contact'], r['call_summary'], r['customer_sentiment'], r['problem_solved']])\n", | |
"    except Exception as e:\n", | |
"        # Log and skip instead of silently swallowing every error (a bare\n", | |
"        # 'except:' would also eat KeyboardInterrupt)\n", | |
"        print(f\"Skipped call {id}: {e}\")\n", | |
"        continue\n", | |
"\n", | |
"    # Checkpoint every 10 processed calls; stop early after ~10 (debug limit)\n", | |
"    if len(records) % 10 == 0:\n", | |
"        pd.DataFrame(records, columns=columns).to_csv('results_call_analytics.csv', index=False)\n", | |
"    if len(records) > 10:\n", | |
"        break\n", | |
"\n", | |
"df2 = pd.DataFrame(records, columns=columns)\n", | |
"df2.to_csv('results_call_analytics.csv', index=False)\n", | |
"df2.head()\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Normalize the categorical answers to lowercase so grouping is consistent\n", | |
"\n", | |
"df2['customer_sentiment'] = df2['customer_sentiment'].str.lower()\n", | |
"df2['problem_solved'] = df2['problem_solved'].str.lower()\n", | |
"\n", | |
"df2.to_csv('results_call_analytics_cleaned.csv', index=False)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "openai-qna-env", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.9" | |
}, | |
"orig_nbformat": 4 | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment