import copy
import time
import os
from ragas.testset import TestsetGenerator
from ragas import EvaluationDataset, evaluate, RunConfig
from ragas.metrics import (
    LLMContextRecall,
    Faithfulness,
    FactualCorrectness,
    ResponseRelevancy,
    ContextEntityRecall,
    NoiseSensitivity,
)
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain.prompts import ChatPromptTemplate
from langgraph.graph import START, StateGraph
from typing_extensions import TypedDict
from langchain_core.documents import Document
The notebook has two main issues with how it handles the dataset:

- **Generic variable naming** – using `dataset` doesn't clearly indicate the variable's purpose.
- **In-place modification** – the original dataset is modified directly instead of creating copies.

These issues occur in three specific code blocks in the notebook.
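For contrast, here is a minimal sketch of the pattern being corrected (illustrative only, not the notebook's actual cells): a single generic `dataset` variable that each evaluation pass mutates in place, so the baseline responses are lost once the second pass runs.

```python
# Illustrative anti-pattern only – not the notebook's actual code.
dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

for test_row in dataset:  # baseline pass mutates the original test set in place
    response = graph.invoke({"question": test_row.eval_sample.user_input})
    test_row.eval_sample.response = response["response"]

for test_row in dataset:  # reranked pass then overwrites the baseline responses
    response = graph.invoke({"question": test_row.eval_sample.user_input})
    test_row.eval_sample.response = response["response"]
```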
# Improved – Descriptive variable name
generator = TestsetGenerator(
    llm=LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1")),
    embedding_model=LangchainEmbeddingsWrapper(OpenAIEmbeddings())
)
ragas_dataset = generator.generate_with_langchain_docs(docs, testset_size=10)
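Here `docs` is assumed to come from an earlier cell that loads the source documents; a minimal sketch using the already-imported `DirectoryLoader` (the `data/` path and glob pattern are placeholders, not the notebook's actual values):

```python
# Assumed earlier step: load the documents the test set is generated from.
# The path and glob below are placeholders.
loader = DirectoryLoader("data/", glob="**/*.md")
docs = loader.load()
```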
# Improved – Creates copy first and updates references
baseline_dataset = copy.deepcopy(ragas_dataset)
for test_row in baseline_dataset:
    response = graph.invoke({"question": test_row.eval_sample.user_input})
    test_row.eval_sample.response = response["response"]
    test_row.eval_sample.retrieved_contexts = [
        context.page_content for context in response["context"]
    ]

# Convert to evaluation dataset
baseline_evaluation_dataset = EvaluationDataset.from_pandas(
    baseline_dataset.to_pandas()
)
result = evaluate(
    dataset=baseline_evaluation_dataset,
    metrics=[
        LLMContextRecall(),
        Faithfulness(),
        FactualCorrectness(),
        ResponseRelevancy(),
        ContextEntityRecall(),
        NoiseSensitivity()
    ],
    llm=LangchainLLMWrapper(ChatOpenAI(model="gpt-4o")),
    run_config=RunConfig(timeout=300, max_retries=15, max_wait=90, max_workers=8, log_tenacity=True)
)
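Once the evaluation finishes, the per-sample scores can be pulled into a DataFrame for a quick look; a minimal sketch, assuming the `result` object returned above:

```python
# Pull the baseline evaluation scores into a DataFrame for inspection.
baseline_scores = result.to_pandas()
print(baseline_scores.head())
```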
# Improved – Creates a fresh copy and uses a new evaluation dataset
reranked_dataset = copy.deepcopy(ragas_dataset)
for test_row in reranked_dataset:
    response = graph.invoke({"question": test_row.eval_sample.user_input})
    test_row.eval_sample.response = response["response"]
    test_row.eval_sample.retrieved_contexts = [
        context.page_content for context in response["context"]
    ]
    time.sleep(2)  # to help avoid rate limiting

# Create a new evaluation dataset for the reranked results
reranked_evaluation_dataset = EvaluationDataset.from_pandas(
    reranked_dataset.to_pandas()
)
result_reranked = evaluate(
    dataset=reranked_evaluation_dataset,
    metrics=[
        LLMContextRecall(),
        Faithfulness(),
        FactualCorrectness(),
        ResponseRelevancy(),
        ContextEntityRecall(),
        NoiseSensitivity()
    ],
    llm=LangchainLLMWrapper(ChatOpenAI(model="gpt-4o")),
    run_config=RunConfig(timeout=300, max_retries=15, max_wait=90, max_workers=8, log_tenacity=True)
)
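With two separate result objects, the baseline and reranked runs can be compared side by side. A minimal sketch, assuming both results convert cleanly to DataFrames (metric column names vary by Ragas version):

```python
import pandas as pd

# Compare mean metric scores for the two runs; column names depend on
# the installed Ragas version.
comparison = pd.DataFrame({
    "baseline": result.to_pandas().mean(numeric_only=True),
    "reranked": result_reranked.to_pandas().mean(numeric_only=True),
})
print(comparison)
```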
- **Descriptive Naming** – Variable names should clearly indicate their purpose and content. Using `ragas_dataset` immediately communicates the type of data, rather than a generic `dataset`.
- **Data Preservation** – Always preserve the original data by creating copies before modification, especially when:
  - The original will be needed again
  - You want to compare before/after states
  - Different processing steps will be applied
- **Clear Data Flow** – Using distinct variable names at each stage (`ragas_dataset`, `baseline_dataset`, `reranked_dataset`) makes the pipeline's flow explicit and easier to understand.
This solution applies the principle of parsimony by making only the minimum necessary changes to fix the specific issues while maintaining the original logic and functionality of the notebook.