Last active
February 20, 2024 01:51
-
-
Save jyouturner/8897c343bb19efba6ff0070d759816f4 to your computer and use it in GitHub Desktop.
langchain_chat_with_bigquery.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"authorship_tag": "ABX9TyNLI0RUubo6DDkCKEyaxAVF", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/jyouturner/8897c343bb19efba6ff0070d759816f4/langchain_chat_with_bigquery.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"This notebook demonstrates querying GCP BigQuery dataset with Langchain agent.\n", | |
"\n", | |
"It is inspired by https://www.datascienceengineer.com/blog/post-chat-with-bigquery\n", | |
"\n", | |
"Note: must specifify langchain and sqlalchemy package version (only centain versions work...)\n", | |
"\n", | |
"Example dataset is from https://github.com/datasciencedojo/datasets/tree/master/WorldDBTables\n" | |
], | |
"metadata": { | |
"id": "zl57OZQq2xhi" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "Cit21L1acRuQ" | |
}, | |
"outputs": [], | |
"source": [ | |
"!pip install langchain==0.1.6 langchain-community==0.0.18 google-cloud-bigquery" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!pip install openai tiktoken tabulate sqlalchemy==1.4.51 sqlalchemy-bigquery==1.9.0" | |
], | |
"metadata": { | |
"id": "vyEO91b_gJ7D" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!pip install pyodbc" | |
], | |
"metadata": { | |
"id": "S3J9cX5Upy4v" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from google.cloud import bigquery\n", | |
"from sqlalchemy import *\n", | |
"from sqlalchemy.engine import create_engine\n", | |
"from sqlalchemy.schema import *\n", | |
"import os\n", | |
"from langchain.agents import create_sql_agent\n", | |
"from langchain.agents.agent_toolkits import SQLDatabaseToolkit\n", | |
"from langchain.sql_database import SQLDatabase\n", | |
"from langchain.llms.openai import OpenAI\n", | |
"from langchain.agents import AgentExecutor" | |
], | |
"metadata": { | |
"id": "uO5HxJh5n5u4" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from google.colab import auth\n", | |
"auth.authenticate_user()" | |
], | |
"metadata": { | |
"id": "WvxBWbPTdkmE" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# enter your GCP project id\n", | |
"#@title Enter Project Id, DataSet name amd Region\n", | |
"project_id = \"\" #@param {type:\"string\"}\n", | |
"dataset_name = \"world_data2\" #@param {type:\"string\"}\n", | |
"region = \"us-east1\" #@param {type:\"string\"}" | |
], | |
"metadata": { | |
"id": "G_rYABd3drHf" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# create dataset\n", | |
"!bq --project_id={project_id} mk -d --location={region} {dataset_name}\n" | |
], | |
"metadata": { | |
"id": "LLceky4-yN64" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# download the csv files\n", | |
"!curl -L https://raw.githubusercontent.com/datasciencedojo/datasets/master/WorldDBTables/CityTable.csv -o CityTable.csv\n", | |
"!curl -L https://raw.githubusercontent.com/datasciencedojo/datasets/master/WorldDBTables/CountryTable.csv -o CountryTable.csv\n", | |
"!curl -L https://raw.githubusercontent.com/datasciencedojo/datasets/master/WorldDBTables/LanguageTable.csv -o LanguageTable.csv" | |
], | |
"metadata": { | |
"id": "5__FAXH8z006" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# upload CSV files\n", | |
"!bq --project_id={project_id} load --autodetect --source_format=CSV {dataset_name}.city CityTable.csv\n", | |
"!bq --project_id={project_id} load --autodetect --source_format=CSV {dataset_name}.country CountryTable.csv\n", | |
"!bq --project_id={project_id} load --autodetect --source_format=CSV {dataset_name}.language LanguageTable.csv\n" | |
], | |
"metadata": { | |
"id": "fNX0fpL_yzAz" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from google.cloud import bigquery\n", | |
"client = bigquery.Client(project=project_id)" | |
], | |
"metadata": { | |
"id": "Qlszlzy4dyM0" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"sqlalchemy_url = f'bigquery://{project_id}/{dataset_name}'" | |
], | |
"metadata": { | |
"id": "wb0907xJgkgR" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# we are using OpenAI, make sure to add OpenAI API Key in the secrets section\n", | |
"from google.colab import userdata\n", | |
"os.environ[\"OPENAI_API_KEY\"] = userdata.get('OPENAI_API_KEY')" | |
], | |
"metadata": { | |
"id": "qdKneGncottE" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"db = SQLDatabase.from_uri(sqlalchemy_url)\n", | |
"llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo-instruct\")\n", | |
"toolkit = SQLDatabaseToolkit(db=db, llm=llm)\n", | |
"agent_executor = create_sql_agent(\n", | |
" llm=llm,\n", | |
" toolkit=toolkit,\n", | |
" verbose=True,\n", | |
" top_k=10\n", | |
")" | |
], | |
"metadata": { | |
"id": "dqbUPseKo7_1" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"agent_executor.run(\"show me the top 5 countries with the most population, with their capital cities\")" | |
], | |
"metadata": { | |
"collapsed": true, | |
"id": "fwuL7UfLscTA" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment