Last active
July 10, 2025 18:50
-
-
Save nateraw/f8a8545e68f0b63dc3b09667cf3884a5 to your computer and use it in GitHub Desktop.
Gemini audio captioning
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import io, os | |
| from pathlib import Path | |
| import torchaudio | |
| from google import genai | |
| from pydantic import BaseModel | |
# When running in Colab, load the key from the notebook secrets first:
# from google.colab import userdata
# os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

# Module-level Gemini client shared by the functions below. The API key is
# read from the GOOGLE_API_KEY environment variable (None if it is unset).
client = genai.Client(api_key=os.environ.get("GOOGLE_API_KEY"))
class AudioInfo(BaseModel):
    # Schema passed as `response_schema` when calling the model.
    #
    # A single free-form prompt string is the naive approach: you would want
    # more variability in practice, but it works.
    prompt: str

    # ---
    # What you'd actually want (see the Stable Audio papers for guidance) is
    # structured metadata that is turned into a prompt at train time:
    #   - metadata fields joined randomly to become the prompt
    #   - the list[str] fields shuffled
    #   - random separators ("|" and "," with different spacing)
    #   - a percentage of fields dropped out
    # ---
    # tags: list[str]
    # genre: str
    # ...
# Gemini model used for captioning.
MODEL_NAME = "gemini-2.5-pro-preview-05-06"

# Quick example for the prompt-only case; this would need reworking for the
# structured-metadata use case. The two instructions are separated by a blank
# line in the text sent to the model.
PROMPT = "\n\n".join(
    (
        "Using the provided audio, respond with information that could be used to train a text-to-music model.",
        "Separate the prompt into comma-separated fragments and be specific.",
    )
)
def describe_audio(audio_path: str) -> str:
    """Caption an audio file with Gemini and return the generated prompt.

    The file is decoded with torchaudio, re-encoded as WAV in memory, and sent
    to ``MODEL_NAME`` together with ``PROMPT``. The reply is requested as JSON
    matching the ``AudioInfo`` schema.

    Args:
        audio_path: Path to an audio file that ``torchaudio.load`` can read.

    Returns:
        The ``prompt`` field of the parsed ``AudioInfo`` response.

    Raises:
        ValueError: If the model's reply could not be parsed into
            ``AudioInfo`` (``response.parsed`` is ``None``).
    """
    # torchaudio + an in-memory buffer are used as the reference here because...
    waveform, sr = torchaudio.load(audio_path)

    # ...this is where you would probably add slicing / uniform cropping / etc.
    # and loop over the steps below.

    # Re-encode to WAV entirely in memory; the buffer is released as soon as
    # the encoded bytes have been copied out.
    with io.BytesIO() as buffer:
        torchaudio.save(buffer, waveform, sr, format="wav")
        wav_bytes = buffer.getvalue()

    response = client.models.generate_content(
        model=MODEL_NAME,
        contents=[
            PROMPT,
            genai.types.Part.from_bytes(data=wav_bytes, mime_type="audio/wav"),
        ],
        config={
            "response_mime_type": "application/json",
            "response_schema": AudioInfo,
        },
    )

    # `parsed` is optional: fail with a clear message instead of an
    # AttributeError on `None.prompt` when the reply did not fit the schema.
    info = response.parsed
    if info is None:
        raise ValueError(
            f"Model response for {audio_path!r} could not be parsed as AudioInfo"
        )
    return info.prompt
# Example
if __name__ == "__main__":
    # Imported here so huggingface_hub is only needed when running the example.
    from huggingface_hub import hf_hub_download

    # Fetch a sample clip from the Hugging Face Hub dataset repo...
    audio_fpath = hf_hub_download(
        repo_id="nateraw/misc",
        filename="thoughts_small_clip.wav",
        repo_type="dataset",
    )

    # ...then caption it and print the result.
    caption = describe_audio(audio_fpath)
    print(caption)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment