Skip to content

Instantly share code, notes, and snippets.

@nateraw
Last active July 10, 2025 18:50
Show Gist options
  • Select an option

  • Save nateraw/f8a8545e68f0b63dc3b09667cf3884a5 to your computer and use it in GitHub Desktop.

Select an option

Save nateraw/f8a8545e68f0b63dc3b09667cf3884a5 to your computer and use it in GitHub Desktop.
Gemini audio captioning
import io, os
from pathlib import Path
import torchaudio
from google import genai
from pydantic import BaseModel
# Running in Colab? Copy the key out of the notebook secrets first:
# from google.colab import userdata
# os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

# Module-level Gemini client, configured from the GOOGLE_API_KEY environment variable.
client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))
class AudioInfo(BaseModel):
    # Schema handed to the API as `response_schema`; the JSON reply is parsed
    # into this type.
    #
    # A single free-form string is the naive version (real training data
    # wants more variability), but it works.
    prompt: str
    # ---
    # What you'd actually want is structured metadata -- see the Stable Audio
    # papers for guidance:
    #   - at train time, randomly join the metadata fields to form the prompt
    #   - shuffle the list[str] fields
    #   - use random separators (| and , with different spacing)
    #   - drop out some percentage of the fields
    # ---
    # tags: list[str]
    # genre: str
    # ...
# Gemini model id used for every captioning request.
# NOTE(review): this is a dated preview id -- confirm it is still being served.
MODEL_NAME = "gemini-2.5-pro-preview-05-06"

# Instruction text sent alongside the audio. This is a quick prompt-only
# example; it would need reworking for a structured-metadata schema.
PROMPT = "".join(
    [
        "Using the provided audio, respond with information that could be used to train a text-to-music model.\n\n",
        "Separate the prompt into comma-separated fragments and be specific.",
    ]
)
def describe_audio(audio_path: str) -> str:
    """Caption an audio file with Gemini and return the generated prompt text.

    The file is decoded with torchaudio, re-encoded as WAV into an in-memory
    buffer, and sent inline together with ``PROMPT`` to ``MODEL_NAME``. The
    request asks for a JSON reply matching the ``AudioInfo`` schema.

    Args:
        audio_path: Path of the audio file to load with ``torchaudio.load``.

    Returns:
        The ``prompt`` field of the parsed ``AudioInfo`` response.

    Raises:
        ValueError: If the reply could not be parsed into ``AudioInfo``
            (``response.parsed`` is ``None``).
    """
    # torchaudio + an in-memory buffer is used as the reference approach here
    # so the waveform can be manipulated before it is sent...
    waveform, sr = torchaudio.load(audio_path)
    #
    # ...here you probably want to add slicing/uniform crop/etc. and loop over the below
    #
    buffer = io.BytesIO()
    torchaudio.save(buffer, waveform, sr, format="wav")

    response = client.models.generate_content(
        model=MODEL_NAME,
        contents=[
            PROMPT,
            genai.types.Part.from_bytes(data=buffer.getvalue(), mime_type="audio/wav"),
        ],
        config={
            "response_mime_type": "application/json",
            "response_schema": AudioInfo,
        },
    )

    # `parsed` is None when the reply could not be turned into the schema;
    # fail with the raw text instead of an AttributeError on NoneType.
    parsed = response.parsed
    if parsed is None:
        raise ValueError(
            f"Could not parse an AudioInfo response for {audio_path!r}; "
            f"raw response text: {response.text!r}"
        )
    return parsed.prompt
# Example: fetch a sample clip with huggingface_hub and print its caption.
if __name__ == "__main__":
    from huggingface_hub import hf_hub_download

    sample_clip = hf_hub_download(
        repo_type="dataset",
        repo_id="nateraw/misc",
        filename="thoughts_small_clip.wav",
    )
    caption = describe_audio(sample_clip)
    print(caption)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment