sayakpaul · June 8, 2025 15:46
diff --git a/grade_images_with_gemini.py b/grade_images_with_gemini.py
 from google import genai
 from google.genai import types
 import typing_extensions as typing
 from PIL import Image
 import requests
 import io
 import json
 import os


 # Structured-output schema for one graded aspect: a numeric score together
 # with the model's justification for it.
 class Score(typing.TypedDict):
    # Numeric rating; verifier_prompt.txt asks the model for a value from 0 to 10.
    score: float
    # Short justification for the score (the prompt asks for 1-2 sentences).
    explanation: str


 # Structured-output schema for the full grading of one image. The field names
 # mirror the JSON keys requested in verifier_prompt.txt; each holds a Score.
 class Grading(typing.TypedDict):
    # How well the image matches the text prompt.
    accuracy_to_prompt: Score
    # Uniqueness / imagination beyond a literal reading of the prompt.
    creativity_and_originality: Score
    # Rendering quality: detail, lighting, shading, perspective.
    visual_quality_and_realism: Score
    # Internal consistency of the scene (no visual anomalies).
    consistency_and_cohesion: Score
    # Whether the image conveys the prompt's intended tone or theme.
    emotional_or_thematic_resonance: Score
    # Aggregate of the aspects above (plain or weighted average per the prompt).
    overall_score: Score


 def load_image(path_or_url: str, timeout: float = 30.0) -> Image.Image:
    """Load an image from a local path or an HTTP(S) URL as a PIL Image object.

    Args:
        path_or_url: Filesystem path, or a URL beginning with ``http://`` or
            ``https://``.
        timeout: Seconds to wait on the server before giving up. Only used
            when ``path_or_url`` is a URL.

    Returns:
        The PIL Image object produced by ``Image.open``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On other network failures, including a
            timeout.
    """
    # Match the full scheme so that a local path such as "http_cache/img.png"
    # is not mistaken for a URL.
    if path_or_url.startswith(("http://", "https://")):
        # The whole body is needed in memory anyway, so no streaming; the
        # timeout keeps an unresponsive server from hanging the script.
        response = requests.get(path_or_url, timeout=timeout)
        response.raise_for_status()
        return Image.open(io.BytesIO(response.content))
    return Image.open(path_or_url)


 def convert_to_bytes(path_or_url: str) -> bytes:
    """Return the image at ``path_or_url`` re-encoded as PNG bytes.

    The image is fetched with ``load_image`` and converted to RGB mode before
    being written out in PNG format.
    """
    rgb_image = load_image(path_or_url).convert("RGB")
    # Encode into an in-memory buffer rather than a file on disk.
    buffer = io.BytesIO()
    rgb_image.save(buffer, format="PNG")
    return buffer.getvalue()


 def prepare_inputs(prompt: str, image_path_or_uri: str):
    """Build the API parts for one sample.

    Returns a two-element list: the prompt as a text part, followed by the
    image (loaded from ``image_path_or_uri``) as a PNG bytes part.
    """
    text_part = types.Part.from_text(text=prompt)
    png_bytes = convert_to_bytes(image_path_or_uri)
    image_part = types.Part.from_bytes(data=png_bytes, mime_type="image/png")
    return [text_part, image_part]


 def load_verifier_prompt(path: str = "verifier_prompt.txt") -> str:
    """Load the system prompt Gemini uses when acting as an image-grading verifier.

    Args:
        path: Location of the prompt file. Defaults to ``verifier_prompt.txt``
            in the current working directory.

    Returns:
        The file's text with every triple-double-quote sequence removed (the
        prompt file wraps its content in such delimiters).
    """
    # The prompt contains non-ASCII characters (typographic apostrophes), so
    # decode explicitly as UTF-8 instead of relying on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        return f.read().replace('"""', "")


 # Gemini client; the API key is read from the GEMINI_API_KEY environment variable.
 client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
 system_instruction = load_verifier_prompt()

 # Ask for JSON output shaped as a list of Grading objects, with a fixed seed.
 generation_config = types.GenerateContentConfig(
    seed=1994,
    response_schema=list[Grading],
    response_mime_type="application/json",
    system_instruction=system_instruction,
 )

 # (prompt, image) pairs to grade. The image may be a URL or a local path.
 image_urls = [
    (
        "realistic photo a shiny black SUV car with a mountain in the background.",
        "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/flux-edit-artifacts/assets/car.jpg",
    ),
    (
        "photo a green and funny creature standing in front a lightweight forest.",
        "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/flux-edit-artifacts/assets/green_creature.jpg",
    ),
 ]

 # Flatten every pair into one sequence of parts: text, image, text, image, ...
 inputs = []
 for text, path_or_url in image_urls:
    inputs += prepare_inputs(prompt=text, image_path_or_uri=path_or_url)

 # All samples go to the model in a single user turn.
 # (Single-image variant: pass contents=[prompt_text, load_image(url)] instead.)
 response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=types.Content(role="user", parts=inputs),
    config=generation_config,
 )

 # Persist the parsed result compactly, then echo it pretty-printed.
 with open("results.json", "w") as f:
    f.write(json.dumps(response.parsed))

 print(json.dumps(response.parsed, indent=4))
diff --git a/notes.md b/notes.md
diff --git a/requirements.txt b/requirements.txt
 google-genai
 typing-extensions
 Pillow
 requests
diff --git a/verifier_prompt.txt b/verifier_prompt.txt
 """
 You are a multimodal large-language model tasked with evaluating images
 generated by a text-to-image model. Your goal is to assess each generated
 image based on specific aspects and provide a detailed critique, along with
 a scoring system. The final output should be formatted as a JSON object
 containing individual scores for each aspect and an overall score. The keys
 in the JSON object should be: `accuracy_to_prompt`, `creativity_and_originality`,
 `visual_quality_and_realism`, `consistency_and_cohesion`,
 `emotional_or_thematic_resonance`, and `overall_score`. Below is a comprehensive
 guide to follow in your evaluation process:

 1. Key Evaluation Aspects and Scoring Criteria:
 For each aspect, provide a score from 0 to 10, where 0 represents poor
 performance and 10 represents excellent performance. For each score, include
 a short explanation or justification (1-2 sentences) explaining why that
 score was given. The aspects to evaluate are as follows:

 a) Accuracy to Prompt
 Assess how well the image matches the description given in the prompt.
 Consider whether all requested elements are present and if the scene,
 objects, and setting align accurately with the text. Score: 0 (no
 alignment) to 10 (perfect match to prompt).

 b) Creativity and Originality
 Evaluate the uniqueness and creativity of the generated image. Does the
 model present an imaginative or aesthetically engaging interpretation of the
 prompt? Is there any evidence of creativity beyond a literal interpretation?
 Score: 0 (lacks creativity) to 10 (highly creative and original).

 c) Visual Quality and Realism
 Assess the overall visual quality, including resolution, detail, and realism.
 Look for coherence in lighting, shading, and perspective. Even if the image
 is stylized or abstract, judge whether the visual elements are well-rendered
 and visually appealing. Score: 0 (poor quality) to 10 (high-quality and
 realistic).

 d) Consistency and Cohesion
 Check for internal consistency within the image. Are all elements cohesive
 and aligned with the prompt? For instance, does the perspective make sense,
 and do objects fit naturally within the scene without visual anomalies?
 Score: 0 (inconsistent) to 10 (fully cohesive and consistent).

 e) Emotional or Thematic Resonance
 Evaluate how well the image evokes the intended emotional or thematic tone of
 the prompt. For example, if the prompt is meant to be serene, does the image
 convey calmness? If it’s adventurous, does it evoke excitement? Score: 0
 (no resonance) to 10 (strong resonance with the prompt’s theme).

 2. Overall Score
 After scoring each aspect individually, provide an overall score,
 representing the model’s general performance on this image. This should be
 a weighted average based on the importance of each aspect to the prompt or an
 average of all aspects.
 """
	from google import genai
	from google.genai import types
	import typing_extensions as typing
	from PIL import Image
	import requests
	import io
	import json
	import os


	# Structured-output schema for one graded aspect: a numeric score together
	# with the model's justification for it.
	class Score(typing.TypedDict):
	    # Numeric rating; the verifier prompt asks the model for a value from 0 to 10.
	    score: float
	    # Short justification for the score (the prompt asks for 1-2 sentences).
	    explanation: str


	# Structured-output schema for the full grading of one image. The field names
	# mirror the JSON keys requested in the verifier prompt; each holds a Score.
	class Grading(typing.TypedDict):
	    # How well the image matches the text prompt.
	    accuracy_to_prompt: Score
	    # Uniqueness / imagination beyond a literal reading of the prompt.
	    creativity_and_originality: Score
	    # Rendering quality: detail, lighting, shading, perspective.
	    visual_quality_and_realism: Score
	    # Internal consistency of the scene (no visual anomalies).
	    consistency_and_cohesion: Score
	    # Whether the image conveys the prompt's intended tone or theme.
	    emotional_or_thematic_resonance: Score
	    # Aggregate of the aspects above (plain or weighted average per the prompt).
	    overall_score: Score


	def load_image(path_or_url: str, timeout: float = 30.0) -> Image.Image:
	    """Load an image from a local path or an HTTP(S) URL as a PIL Image object.

	    Args:
	        path_or_url: Filesystem path, or a URL beginning with ``http://`` or
	            ``https://``.
	        timeout: Seconds to wait on the server before giving up. Only used
	            when ``path_or_url`` is a URL.

	    Returns:
	        The PIL Image object produced by ``Image.open``.

	    Raises:
	        requests.HTTPError: If the server responds with an error status.
	        requests.RequestException: On other network failures, including a
	            timeout.
	    """
	    # Match the full scheme so that a local path such as "http_cache/img.png"
	    # is not mistaken for a URL.
	    if path_or_url.startswith(("http://", "https://")):
	        # The whole body is needed in memory anyway, so no streaming; the
	        # timeout keeps an unresponsive server from hanging the script.
	        response = requests.get(path_or_url, timeout=timeout)
	        response.raise_for_status()
	        return Image.open(io.BytesIO(response.content))
	    return Image.open(path_or_url)


	def convert_to_bytes(path_or_url: str) -> bytes:
	    """Return the image at ``path_or_url`` re-encoded as PNG bytes.

	    The image is fetched with ``load_image`` and converted to RGB mode before
	    being written out in PNG format.
	    """
	    image = load_image(path_or_url).convert("RGB")
	    # Encode into an in-memory buffer rather than a file on disk.
	    image_bytes_io = io.BytesIO()
	    image.save(image_bytes_io, format="PNG")
	    return image_bytes_io.getvalue()


	def prepare_inputs(prompt: str, image_path_or_uri: str):
	    """Build the API parts for one sample.

	    Returns a two-element list: the prompt as a text part, followed by the
	    image (loaded from ``image_path_or_uri``) as a PNG bytes part.
	    """
	    inputs = [
	        types.Part.from_text(text=prompt),
	        types.Part.from_bytes(data=convert_to_bytes(image_path_or_uri), mime_type="image/png"),
	    ]
	    return inputs


	def load_verifier_prompt(path: str = "verifier_prompt.txt") -> str:
	    """Load the system prompt Gemini uses when acting as an image-grading verifier.

	    Args:
	        path: Location of the prompt file. Defaults to ``verifier_prompt.txt``
	            in the current working directory.

	    Returns:
	        The file's text with every triple-double-quote sequence removed (the
	        prompt file wraps its content in such delimiters).
	    """
	    # The prompt contains non-ASCII characters (typographic apostrophes), so
	    # decode explicitly as UTF-8 instead of relying on the platform default.
	    with open(path, "r", encoding="utf-8") as f:
	        verifier_prompt = f.read().replace('"""', "")

	    return verifier_prompt


	# Gemini client; the API key is read from the GEMINI_API_KEY environment variable.
	client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
	system_instruction = load_verifier_prompt()

	# Ask for JSON output shaped as a list of Grading objects, with a fixed seed.
	generation_config = types.GenerateContentConfig(
	    system_instruction=system_instruction,
	    response_mime_type="application/json",
	    response_schema=list[Grading],
	    seed=1994,
	)

	# (prompt, image) pairs to grade. The image may be a URL or a local path.
	image_urls = [
	    (
	        "realistic photo a shiny black SUV car with a mountain in the background.",
	        "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/flux-edit-artifacts/assets/car.jpg",
	    ),
	    (
	        "photo a green and funny creature standing in front a lightweight forest.",
	        "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/flux-edit-artifacts/assets/green_creature.jpg",
	    ),
	]

	# Flatten every pair into one sequence of parts: text, image, text, image, ...
	inputs = []
	for text, path_or_url in image_urls:
	    inputs.extend(prepare_inputs(prompt=text, image_path_or_uri=path_or_url))

	# All samples go to the model in a single user turn.
	# (Single-image variant: pass contents=[prompt_text, load_image(url)] instead.)
	response = client.models.generate_content(
	    model="gemini-2.0-flash",
	    contents=types.Content(parts=inputs, role="user"),
	    config=generation_config,
	)

	# Persist the parsed result compactly, then echo it pretty-printed.
	with open("results.json", "w") as f:
	    json.dump(response.parsed, f)

	print(json.dumps(response.parsed, indent=4))
	"""
	You are a multimodal large-language model tasked with evaluating images
	generated by a text-to-image model. Your goal is to assess each generated
	image based on specific aspects and provide a detailed critique, along with
	a scoring system. The final output should be formatted as a JSON object
	containing individual scores for each aspect and an overall score. The keys
	in the JSON object should be: `accuracy_to_prompt`, `creativity_and_originality`,
	`visual_quality_and_realism`, `consistency_and_cohesion`,
	`emotional_or_thematic_resonance`, and `overall_score`. Below is a comprehensive
	guide to follow in your evaluation process:

	1. Key Evaluation Aspects and Scoring Criteria:
	For each aspect, provide a score from 0 to 10, where 0 represents poor
	performance and 10 represents excellent performance. For each score, include
	a short explanation or justification (1-2 sentences) explaining why that
	score was given. The aspects to evaluate are as follows:

	a) Accuracy to Prompt
	Assess how well the image matches the description given in the prompt.
	Consider whether all requested elements are present and if the scene,
	objects, and setting align accurately with the text. Score: 0 (no
	alignment) to 10 (perfect match to prompt).

	b) Creativity and Originality
	Evaluate the uniqueness and creativity of the generated image. Does the
	model present an imaginative or aesthetically engaging interpretation of the
	prompt? Is there any evidence of creativity beyond a literal interpretation?
	Score: 0 (lacks creativity) to 10 (highly creative and original).

	c) Visual Quality and Realism
	Assess the overall visual quality, including resolution, detail, and realism.
	Look for coherence in lighting, shading, and perspective. Even if the image
	is stylized or abstract, judge whether the visual elements are well-rendered
	and visually appealing. Score: 0 (poor quality) to 10 (high-quality and
	realistic).

	d) Consistency and Cohesion
	Check for internal consistency within the image. Are all elements cohesive
	and aligned with the prompt? For instance, does the perspective make sense,
	and do objects fit naturally within the scene without visual anomalies?
	Score: 0 (inconsistent) to 10 (fully cohesive and consistent).

	e) Emotional or Thematic Resonance
	Evaluate how well the image evokes the intended emotional or thematic tone of
	the prompt. For example, if the prompt is meant to be serene, does the image
	convey calmness? If it’s adventurous, does it evoke excitement? Score: 0
	(no resonance) to 10 (strong resonance with the prompt’s theme).

	2. Overall Score
	After scoring each aspect individually, provide an overall score,
	representing the model’s general performance on this image. This should be
	a weighted average based on the importance of each aspect to the prompt or an
	average of all aspects.
	"""