nateraw · July 10, 2025 18:50
diff --git a/gemini_audio_captioning.py b/gemini_audio_captioning.py
 import io, os
 from pathlib import Path

 import torchaudio
 from google import genai
 from pydantic import BaseModel

 # When running in Colab, load the key from the notebook secrets first:
 # from google.colab import userdata
 # os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

 # Module-level Gemini client; the API key is read from the GOOGLE_API_KEY environment variable.
 client = genai.Client(
    api_key=os.environ.get("GOOGLE_API_KEY"),
 )

 class AudioInfo(BaseModel):
    # Structured-output schema: passed as "response_schema" to generate_content,
    # and describe_audio() returns the `prompt` field of the parsed result.
    # NOTE(review): documented with `#` comments rather than a class docstring,
    # since pydantic may copy a docstring into the generated JSON schema and so
    # change what is sent to the model; confirm before converting.

    # Naive: a single free-text prompt gives little variability, but it works.
    prompt: str

    # ---
    # What you would actually want (see the Stable Audio papers for guidance):
    #   - at train time, join the metadata fields in random order to form the prompt
    #   - shuffle the list[str] fields
    #   - use random separators (| and , with varying spacing)
    #   - apply some % of field dropout
    # ---
    # tags: list[str]
    # genre: str
    # ...

 # Gemini model passed as `model=` to generate_content in describe_audio().
 MODEL_NAME = "gemini-2.5-pro-preview-05-06"

 # Instruction text sent alongside the audio bytes. This is a quick example for the
 # prompt-only case; it would need reworking for the multi-field metadata use case
 # sketched in the AudioInfo comments.
 PROMPT = (
    "Using the provided audio, respond with information that could be used to train a text-to-music model.\n\n"
    "Separate the prompt into comma-separated fragments and be specific."
 )

 def describe_audio(audio_path: str) -> str:
    """Ask Gemini for a text-to-music training prompt describing an audio file.

    The file is decoded with torchaudio, re-encoded as WAV into an in-memory
    buffer, and sent inline together with ``PROMPT`` to ``MODEL_NAME``. The
    reply is requested as JSON matching the ``AudioInfo`` schema.

    Args:
        audio_path: Path of the audio file, handed to ``torchaudio.load``.

    Returns:
        The ``prompt`` field of the parsed ``AudioInfo`` response.

    Raises:
        ValueError: If the response could not be parsed into ``AudioInfo``
            (``response.parsed`` is ``None``).
    """
    # torchaudio + an in-memory buffer is used as the reference path here because...
    waveform, sample_rate = torchaudio.load(audio_path)
    #
    # ...this is where you would probably add slicing / uniform cropping / etc.
    # and loop over everything below once per segment.
    #
    # The context manager releases the buffer as soon as the WAV bytes are copied out.
    with io.BytesIO() as buffer:
        torchaudio.save(buffer, waveform, sample_rate, format="wav")
        wav_bytes = buffer.getvalue()

    response = client.models.generate_content(
        model=MODEL_NAME,
        contents=[
            PROMPT,
            genai.types.Part.from_bytes(data=wav_bytes, mime_type="audio/wav"),
        ],
        config={
            "response_mime_type": "application/json",
            "response_schema": AudioInfo,
        },
    )

    # Fail with a clear message instead of an AttributeError on None when the
    # reply did not yield a parsed AudioInfo object.
    audio_info = response.parsed
    if audio_info is None:
        raise ValueError(
            f"Gemini response for {audio_path!r} could not be parsed into AudioInfo"
        )
    return audio_info.prompt

 # Example: fetch a small sample clip from the Hugging Face Hub and print its caption.
 if __name__ == "__main__":
    from huggingface_hub import hf_hub_download

    sample_clip = hf_hub_download(
        repo_id="nateraw/misc",
        filename="thoughts_small_clip.wav",
        repo_type="dataset",
    )
    caption = describe_audio(sample_clip)
    print(caption)
	import io, os
	from pathlib import Path

	import torchaudio
	from google import genai
	from pydantic import BaseModel

	# When running in Colab, load the key from the notebook secrets first:
	# from google.colab import userdata
	# os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

	# Module-level Gemini client; the API key is read from the GOOGLE_API_KEY environment variable.
	client = genai.Client(
	    api_key=os.environ.get("GOOGLE_API_KEY"),
	)

	class AudioInfo(BaseModel):
	    # Structured-output schema: passed as "response_schema" to generate_content,
	    # and describe_audio() returns the `prompt` field of the parsed result.
	    # NOTE(review): documented with `#` comments rather than a class docstring,
	    # since pydantic may copy a docstring into the generated JSON schema and so
	    # change what is sent to the model; confirm before converting.

	    # Naive: a single free-text prompt gives little variability, but it works.
	    prompt: str

	    # ---
	    # What you would actually want (see the Stable Audio papers for guidance):
	    #   - at train time, join the metadata fields in random order to form the prompt
	    #   - shuffle the list[str] fields
	    #   - use random separators (| and , with varying spacing)
	    #   - apply some % of field dropout
	    # ---
	    # tags: list[str]
	    # genre: str
	    # ...

	# Gemini model passed as `model=` to generate_content in describe_audio().
	MODEL_NAME = "gemini-2.5-pro-preview-05-06"

	# Instruction text sent alongside the audio bytes. This is a quick example for the
	# prompt-only case; it would need reworking for the multi-field metadata use case
	# sketched in the AudioInfo comments.
	PROMPT = (
	    "Using the provided audio, respond with information that could be used to train a text-to-music model.\n\n"
	    "Separate the prompt into comma-separated fragments and be specific."
	)

	def describe_audio(audio_path: str) -> str:
	    """Ask Gemini for a text-to-music training prompt describing an audio file.

	    The file is decoded with torchaudio, re-encoded as WAV into an in-memory
	    buffer, and sent inline together with ``PROMPT`` to ``MODEL_NAME``. The
	    reply is requested as JSON matching the ``AudioInfo`` schema.

	    Args:
	        audio_path: Path of the audio file, handed to ``torchaudio.load``.

	    Returns:
	        The ``prompt`` field of the parsed ``AudioInfo`` response.

	    Raises:
	        ValueError: If the response could not be parsed into ``AudioInfo``
	            (``response.parsed`` is ``None``).
	    """
	    # torchaudio + an in-memory buffer is used as the reference path here because...
	    waveform, sample_rate = torchaudio.load(audio_path)
	    #
	    # ...this is where you would probably add slicing / uniform cropping / etc.
	    # and loop over everything below once per segment.
	    #
	    # The context manager releases the buffer as soon as the WAV bytes are copied out.
	    with io.BytesIO() as buffer:
	        torchaudio.save(buffer, waveform, sample_rate, format="wav")
	        wav_bytes = buffer.getvalue()

	    response = client.models.generate_content(
	        model=MODEL_NAME,
	        contents=[
	            PROMPT,
	            genai.types.Part.from_bytes(data=wav_bytes, mime_type="audio/wav"),
	        ],
	        config={
	            "response_mime_type": "application/json",
	            "response_schema": AudioInfo,
	        },
	    )

	    # Fail with a clear message instead of an AttributeError on None when the
	    # reply did not yield a parsed AudioInfo object.
	    audio_info = response.parsed
	    if audio_info is None:
	        raise ValueError(
	            f"Gemini response for {audio_path!r} could not be parsed into AudioInfo"
	        )
	    return audio_info.prompt

	# Example: fetch a small sample clip from the Hugging Face Hub and print its caption.
	if __name__ == "__main__":
	    from huggingface_hub import hf_hub_download

	    audio_fpath = hf_hub_download(
	        repo_id="nateraw/misc",
	        filename="thoughts_small_clip.wav",
	        repo_type="dataset",
	    )
	    print(describe_audio(audio_fpath))
No results found