Last active
May 9, 2023 16:23
-
-
Save orcaman/6f5be32ad9fb919d56f19d7b88fc91e6 to your computer and use it in GitHub Desktop.
LangChain Retrieval Question/Answering
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain.utilities import WikipediaAPIWrapper | |
import dotenv | |
from langchain.embeddings import OpenAIEmbeddings | |
from langchain.vectorstores import Chroma | |
from langchain import OpenAI | |
from langchain.chains import RetrievalQA | |
import langchain | |
def api_client() -> WikipediaAPIWrapper:
    """Build and return a new ``WikipediaAPIWrapper`` with default settings."""
    wrapper = WikipediaAPIWrapper()
    return wrapper
def get_wikipedia_search_term_from_command_line():
    """Return the Wikipedia search term given as the first CLI argument.

    Returns:
        ``sys.argv[1]`` unchanged (no stripping or splitting is done here).

    Raises:
        SystemExit: with status 1 when the argument is missing. The usage
            message is written to stderr so it does not mix with the
            program's regular output on stdout.
    """
    if len(sys.argv) < 2:
        print("Missing argument: wikipedia search term", file=sys.stderr)
        sys.exit(1)
    return sys.argv[1]
def load_wikipedia_page(wikipedia_search_term: str) -> list[langchain.schema.Document]:
    """Load the documents for one Wikipedia search term.

    A client is obtained from ``api_client()`` and its ``load`` method is
    called with the search term; whatever ``load`` returns is passed back.
    """
    wikipedia_client = api_client()
    return wikipedia_client.load(wikipedia_search_term)
def split_documents_into_chunks(documents, chunk_size=800, chunk_overlap=0):
    """Split ``documents`` into chunks with a ``CharacterTextSplitter``.

    Args:
        documents: the documents handed to ``split_documents``.
        chunk_size: forwarded to the splitter as ``chunk_size``.
        chunk_overlap: forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``CharacterTextSplitter.split_documents``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return CharacterTextSplitter(**splitter_options).split_documents(documents)
def get_query_from_command_line():
    """Return the question given as the second CLI argument.

    Returns:
        ``sys.argv[2]`` unchanged.

    Raises:
        SystemExit: with status 1 when the argument is missing. The usage
            message is written to stderr so it does not mix with the
            program's regular output on stdout.
    """
    if len(sys.argv) < 3:
        print("Missing argument: query", file=sys.stderr)
        sys.exit(1)
    return sys.argv[2]
def load_environment_variables():
    """Call ``dotenv.load_dotenv()`` with its default arguments.

    The return value of ``load_dotenv`` is discarded.
    """
    dotenv.load_dotenv()
def prepare_model_embedding(texts):
    """Index ``texts`` in Chroma and return a ``RetrievalQA`` chain over them.

    The OpenAI key is read from the ``OPENAI_API_KEY`` environment variable
    (a ``KeyError`` propagates if it is not set) and passed to
    ``OpenAIEmbeddings``. The resulting vector store's retriever is combined
    with a default ``OpenAI()`` LLM.
    """
    api_key = os.environ['OPENAI_API_KEY']
    vector_store = Chroma.from_documents(
        texts,
        OpenAIEmbeddings(openai_api_key=api_key),
    )
    return RetrievalQA.from_chain_type(
        llm=OpenAI(),
        retriever=vector_store.as_retriever(),
    )
def main():
    """Answer a question from the command line using Wikipedia pages as context.

    Expects two command-line arguments: a Wikipedia search term (several
    terms may be given separated by commas) and the question. The pages for
    every term are loaded, split into chunks, indexed, and the question is
    run through the resulting retrieval chain. The terms, the question and
    the answer are printed to stdout.

    Raises:
        SystemExit: with status 1 when an argument is missing or when no
            documents were loaded for any of the search terms.
    """
    load_environment_variables()

    # Read both arguments before doing any work, so a missing query is
    # reported immediately instead of after pages were fetched and indexed.
    wikipedia_search_term = get_wikipedia_search_term_from_command_line()
    query = get_query_from_command_line()

    # split(',') returns a one-element list when there is no comma, so the
    # single-term and multi-term cases share the same loop.
    documents = []
    for raw_term in wikipedia_search_term.split(','):
        term = raw_term.strip()
        if not term:
            # Skip blanks produced by input such as "a,,b" or "a,".
            continue
        documents.extend(load_wikipedia_page(term))

    if not documents:
        # Nothing to index; stop before building the vector store.
        print('No Wikipedia documents found for:', wikipedia_search_term, file=sys.stderr)
        sys.exit(1)

    texts = split_documents_into_chunks(documents)
    chain = prepare_model_embedding(texts)

    print('\n\n\n\n\n-----------------')
    print('wikipedia search terms:', wikipedia_search_term)
    print('question:', query)
    print('answer:', chain.run(query))
    print('-----------------\n\n')
# Run the command-line workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()
requirements.txt:
openai==0.27.6
langchain==0.0.161
python-dotenv==1.0.0
nltk==3.8.1
unstructured==0.6.3
pdfminer.six==20221105
chromadb==0.3.21
tiktoken==0.3.3
wikipedia==1.4.0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
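To try it (the first argument is one or more Wikipedia search terms separated by commas; the second is the question to ask):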
python wikipedia_learner.py 'Oded Menashe, Eden Harel' 'When was Oded Menashe, the Israeli presenter, born? Who is he married to?'