Skip to content

Instantly share code, notes, and snippets.

@jyouturner
Last active February 20, 2024 01:51
Show Gist options
  • Save jyouturner/8897c343bb19efba6ff0070d759816f4 to your computer and use it in GitHub Desktop.
Save jyouturner/8897c343bb19efba6ff0070d759816f4 to your computer and use it in GitHub Desktop.
langchain_chat_with_bigquery.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyNLI0RUubo6DDkCKEyaxAVF",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/jyouturner/8897c343bb19efba6ff0070d759816f4/langchain_chat_with_bigquery.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"This notebook demonstrates querying GCP BigQuery dataset with Langchain agent.\n",
"\n",
"It is inspired by https://www.datascienceengineer.com/blog/post-chat-with-bigquery\n",
"\n",
"Note: must specifify langchain and sqlalchemy package version (only centain versions work...)\n",
"\n",
"Example dataset is from https://github.com/datasciencedojo/datasets/tree/master/WorldDBTables\n"
],
"metadata": {
"id": "zl57OZQq2xhi"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Cit21L1acRuQ"
},
"outputs": [],
"source": [
"!pip install langchain==0.1.6 langchain-community==0.0.18 google-cloud-bigquery"
]
},
{
"cell_type": "code",
"source": [
"!pip install openai tiktoken tabulate sqlalchemy==1.4.51 sqlalchemy-bigquery==1.9.0"
],
"metadata": {
"id": "vyEO91b_gJ7D"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!pip install pyodbc"
],
"metadata": {
"id": "S3J9cX5Upy4v"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from google.cloud import bigquery\n",
"from sqlalchemy import *\n",
"from sqlalchemy.engine import create_engine\n",
"from sqlalchemy.schema import *\n",
"import os\n",
"from langchain.agents import create_sql_agent\n",
"from langchain.agents.agent_toolkits import SQLDatabaseToolkit\n",
"from langchain.sql_database import SQLDatabase\n",
"from langchain.llms.openai import OpenAI\n",
"from langchain.agents import AgentExecutor"
],
"metadata": {
"id": "uO5HxJh5n5u4"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from google.colab import auth\n",
"auth.authenticate_user()"
],
"metadata": {
"id": "WvxBWbPTdkmE"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# enter your GCP project id\n",
"#@title Enter Project Id, DataSet name amd Region\n",
"project_id = \"\" #@param {type:\"string\"}\n",
"dataset_name = \"world_data2\" #@param {type:\"string\"}\n",
"region = \"us-east1\" #@param {type:\"string\"}"
],
"metadata": {
"id": "G_rYABd3drHf"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# create dataset\n",
"!bq --project_id={project_id} mk -d --location={region} {dataset_name}\n"
],
"metadata": {
"id": "LLceky4-yN64"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# download the csv files\n",
"!curl -L https://raw.githubusercontent.com/datasciencedojo/datasets/master/WorldDBTables/CityTable.csv -o CityTable.csv\n",
"!curl -L https://raw.githubusercontent.com/datasciencedojo/datasets/master/WorldDBTables/CountryTable.csv -o CountryTable.csv\n",
"!curl -L https://raw.githubusercontent.com/datasciencedojo/datasets/master/WorldDBTables/LanguageTable.csv -o LanguageTable.csv"
],
"metadata": {
"id": "5__FAXH8z006"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# upload CSV files\n",
"!bq --project_id={project_id} load --autodetect --source_format=CSV {dataset_name}.city CityTable.csv\n",
"!bq --project_id={project_id} load --autodetect --source_format=CSV {dataset_name}.country CountryTable.csv\n",
"!bq --project_id={project_id} load --autodetect --source_format=CSV {dataset_name}.language LanguageTable.csv\n"
],
"metadata": {
"id": "fNX0fpL_yzAz"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from google.cloud import bigquery\n",
"client = bigquery.Client(project=project_id)"
],
"metadata": {
"id": "Qlszlzy4dyM0"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"sqlalchemy_url = f'bigquery://{project_id}/{dataset_name}'"
],
"metadata": {
"id": "wb0907xJgkgR"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# we are using OpenAI, make sure to add OpenAI API Key in the secrets section\n",
"from google.colab import userdata\n",
"os.environ[\"OPENAI_API_KEY\"] = userdata.get('OPENAI_API_KEY')"
],
"metadata": {
"id": "qdKneGncottE"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"db = SQLDatabase.from_uri(sqlalchemy_url)\n",
"llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-instruct\")\n",
"toolkit = SQLDatabaseToolkit(db=db, llm=llm)\n",
"agent_executor = create_sql_agent(\n",
" llm=llm,\n",
" toolkit=toolkit,\n",
" verbose=True,\n",
" top_k=10\n",
")"
],
"metadata": {
"id": "dqbUPseKo7_1"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"agent_executor.run(\"show me the top 5 countries with the most population, with their capital cities\")"
],
"metadata": {
"collapsed": true,
"id": "fwuL7UfLscTA"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment