Skip to content

Instantly share code, notes, and snippets.

@sebington
Last active February 1, 2025 01:36
Show Gist options
  • Save sebington/5b3ac4bb03e747f084dace454d017b8d to your computer and use it in GitHub Desktop.
Save sebington/5b3ac4bb03e747f084dace454d017b8d to your computer and use it in GitHub Desktop.
Transcribe audio file at word-level and write output to .srt
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Word-level transcriptions with Faster-Whisper"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "xRe-wO2gWNN4"
},
"outputs": [],
"source": [
"# pip install faster-whisper -q\n",
"from faster_whisper import WhisperModel"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "i8yDvreNvETR"
},
"outputs": [],
"source": [
"# Build the Faster-Whisper model object; the transcription cell below reuses `model`.\n",
"# Settings used here: the \"small.en\" model on CPU with int8 compute\n",
"# (the author's alternative is to run on GPU with FP16).\n",
"model = WhisperModel(\n",
"    \"small.en\",\n",
"    device=\"cpu\",\n",
"    compute_type=\"int8\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "UCZ_UwAiH5sG"
},
"outputs": [],
"source": [
"# Path of the audio file to transcribe. Nothing is loaded here: the string is\n",
"# passed to model.transcribe() in a later cell and is also used as the prefix\n",
"# of the .srt output file name.\n",
"audio = \"bbc_ai_edit.mp3\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Transcribe a file at word level"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0.00s -> 0.48s] Stuart,\n",
"[0.58s -> 0.74s] thank\n",
"[0.74s -> 0.94s] you\n",
"[0.94s -> 1.30s] so\n",
"[1.30s -> 1.70s] much.\n",
"[2.26s -> 2.28s] You\n",
"[2.28s -> 2.38s] know,\n",
"[2.40s -> 2.70s] it's\n",
"[2.70s -> 3.30s] absolutely\n",
"[3.30s -> 3.94s] fascinating\n",
"[3.94s -> 4.38s] to\n",
"[4.38s -> 4.60s] me\n",
"[4.60s -> 4.86s] that\n",
"[4.86s -> 5.62s] you\n",
"[5.62s -> 5.78s] are\n",
"[5.78s -> 5.90s] an\n",
"[5.90s -> 6.22s] expert\n",
"[6.22s -> 6.46s] in\n",
"[6.46s -> 6.80s] artificial\n",
"[6.80s -> 7.46s] intelligence\n",
"[7.46s -> 7.80s] and\n",
"[7.80s -> 8.16s] you\n",
"[8.16s -> 8.32s] have\n",
"[8.32s -> 8.52s] been\n",
"[8.52s -> 8.88s] taking\n",
"[8.88s -> 9.18s] us\n",
"[9.18s -> 9.32s] on\n",
"[9.32s -> 9.56s] this\n",
"[9.56s -> 10.44s] relay\n",
"[10.44s -> 11.10s] race\n",
"[11.10s -> 11.46s] from\n",
"[11.46s -> 11.82s] Bagret\n",
"[11.82s -> 12.04s] to\n",
"[12.04s -> 12.38s] yourself\n",
"[12.38s -> 12.84s] to\n",
"[12.84s -> 13.18s] who\n",
"[13.18s -> 13.38s] knows\n",
"[13.38s -> 13.78s] what.\n",
"[14.26s -> 14.42s] And\n",
"[14.42s -> 14.76s] all\n",
"[14.76s -> 14.96s] I\n",
"[14.96s -> 15.12s] kept\n",
"[15.12s -> 15.44s] thinking\n",
"[15.44s -> 15.72s] about\n",
"[15.72s -> 15.92s] is\n",
"[15.92s -> 16.42s] what\n",
"[16.42s -> 16.60s] does\n",
"[16.60s -> 16.76s] this\n",
"[16.76s -> 16.92s] say\n",
"[16.92s -> 17.10s] about\n",
"[17.10s -> 17.22s] the\n",
"[17.22s -> 17.46s] human\n",
"[17.46s -> 18.00s] condition?\n",
"[18.56s -> 18.64s] You\n",
"[18.64s -> 18.74s] know,\n",
"[18.80s -> 18.90s] what\n",
"[18.90s -> 19.36s] it\n",
"[19.36s -> 19.54s] is\n",
"[19.54s -> 19.70s] to\n",
"[19.70s -> 19.88s] be\n",
"[19.88s -> 20.36s] human.\n",
"[20.84s -> 21.06s] And\n",
"[21.06s -> 21.26s] you\n",
"[21.26s -> 21.82s] laid\n",
"[21.82s -> 22.06s] out\n",
"[22.06s -> 22.30s] very\n",
"[22.30s -> 22.84s] clearly\n",
"[22.84s -> 23.60s] what\n",
"[23.60s -> 23.92s] people\n",
"[23.92s -> 24.28s] think\n",
"[24.28s -> 24.60s] might\n",
"[24.60s -> 24.88s] be\n",
"[24.88s -> 25.02s] the\n",
"[25.02s -> 25.78s] eventuality\n",
"[25.78s -> 25.94s] of\n",
"[25.94s -> 26.24s] having\n",
"[26.24s -> 26.42s] the\n",
"[26.42s -> 26.54s] end\n",
"[26.54s -> 26.68s] of\n",
"[26.68s -> 27.08s] work.\n",
"[27.08s -> 27.92s] I\n",
"[27.92s -> 28.04s] want\n",
"[28.04s -> 28.18s] to\n",
"[28.18s -> 28.28s] know\n",
"[28.28s -> 28.40s] what\n",
"[28.40s -> 28.70s] you\n",
"[28.70s -> 29.10s] think\n",
"[29.10s -> 29.78s] it\n",
"[29.78s -> 30.02s] will\n",
"[30.02s -> 30.22s] be\n",
"[30.22s -> 30.46s] like.\n",
"[30.58s -> 30.58s] You\n",
"[30.58s -> 30.68s] know,\n",
"[30.80s -> 31.02s] there\n",
"[31.02s -> 31.16s] are\n",
"[31.16s -> 31.40s] two\n",
"[31.40s -> 31.86s] scenarios.\n",
"[32.06s -> 32.28s] One,\n",
"[32.80s -> 33.04s] I\n",
"[33.04s -> 33.38s] lose\n",
"[33.38s -> 34.08s] every\n",
"[34.08s -> 34.60s] excuse\n",
"[34.60s -> 35.18s] not\n",
"[35.18s -> 35.40s] to\n",
"[35.40s -> 35.54s] learn\n",
"[35.54s -> 35.68s] the\n",
"[35.68s -> 36.04s] piano\n",
"[36.04s -> 36.24s] and\n",
"[36.24s -> 36.42s] I'm\n",
"[36.42s -> 36.60s] much\n",
"[36.60s -> 36.88s] nicer\n",
"[36.88s -> 37.08s] to\n",
"[37.08s -> 37.24s] my\n",
"[37.24s -> 37.52s] children\n",
"[37.52s -> 37.82s] and\n",
"[37.82s -> 37.98s] I\n",
"[37.98s -> 38.52s] catch\n",
"[38.52s -> 38.78s] up\n",
"[38.78s -> 38.96s] on\n",
"[38.96s -> 39.18s] all\n",
"[39.18s -> 39.28s] of\n",
"[39.28s -> 39.36s] the\n",
"[39.36s -> 39.58s] wonderful\n",
"[39.58s -> 39.82s] things\n",
"[39.82s -> 40.02s] on\n",
"[40.02s -> 40.22s] Radio\n",
"[40.22s -> 40.66s] 4.\n",
"[40.98s -> 41.38s] Or,\n",
"[41.70s -> 41.84s] you\n",
"[41.84s -> 41.98s] know,\n",
"[42.22s -> 42.36s] that\n",
"[42.36s -> 43.10s] terrifying\n",
"[43.10s -> 44.00s] dystopia,\n",
"[44.26s -> 44.34s] the\n",
"[44.34s -> 44.86s] animation\n",
"[44.86s -> 45.18s] Wall\n",
"[45.18s -> 45.48s] -E,\n",
"[45.68s -> 45.76s] where\n",
"[45.76s -> 45.90s] we\n",
"[45.90s -> 46.04s] all\n",
"[46.04s -> 46.16s] sit\n",
"[46.16s -> 46.30s] on\n",
"[46.30s -> 46.42s] our\n",
"[46.42s -> 46.74s] bottoms\n",
"[46.74s -> 46.94s] getting\n",
"[46.94s -> 47.48s] fatter,\n",
"[47.82s -> 48.12s] watching\n",
"[48.12s -> 49.02s] infomercials.\n",
"[49.02s -> 49.08s] I\n",
"[49.08s -> 49.18s] mean,\n",
"[49.24s -> 49.32s] what\n",
"[49.32s -> 49.48s] do\n",
"[49.48s -> 49.66s] you\n",
"[49.66s -> 49.98s] think\n",
"[49.98s -> 50.52s] the\n",
"[50.52s -> 50.82s] human\n",
"[50.82s -> 51.32s] condition\n",
"[51.32s -> 51.76s] leans\n",
"[51.76s -> 52.22s] towards?\n"
]
}
],
"source": [
"# Transcribe `audio` with per-word timestamps, then print one line per word.\n",
"# transcribe() returns a pair; only the first item (the segments) is used here.\n",
"segments, _ = model.transcribe(audio, word_timestamps=True, language=\"en\")\n",
"# Keep the segments in a list so the SRT cell further down can loop over them\n",
"# again (presumably the returned object is a one-shot iterator - TODO confirm).\n",
"segments = list(segments)\n",
"\n",
"for segment_wl in segments:\n",
"    for word in segment_wl.words:\n",
"        # start/end are shown with two decimals; the word text is printed as-is\n",
"        print(f\"[{word.start:.2f}s -> {word.end:.2f}s] {word.word}\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# create SRT subs and write to file\n",
"import math\n",
"\n",
"def convert_seconds_to_hms(seconds):\n",
"    \"\"\"Format a time offset in seconds as an SRT timestamp (HH:MM:SS,mmm).\n",
"\n",
"    The offset is rounded to the nearest whole millisecond before it is\n",
"    split into fields. Flooring the fractional part instead can drop a\n",
"    millisecond to binary float error (2.28 came out as 00:00:02,279).\n",
"\n",
"    Args:\n",
"        seconds: Time offset in seconds (int or float).\n",
"\n",
"    Returns:\n",
"        The timestamp as a zero-padded string, e.g. \"00:00:02,280\".\n",
"    \"\"\"\n",
"    # int() keeps every field integral even if round() returns a float-like value\n",
"    total_ms = int(round(seconds * 1000))\n",
"    whole_seconds, milliseconds = divmod(total_ms, 1000)\n",
"    whole_minutes, secs = divmod(whole_seconds, 60)\n",
"    hours, minutes = divmod(whole_minutes, 60)\n",
"    return f\"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}\"\n",
"\n",
"# Write one numbered SRT cue per word: index line, time range, text, blank line.\n",
"count = 0\n",
"# UTF-8 is requested explicitly so the subtitle file does not depend on the\n",
"# platform's default text encoding when a word contains non-ASCII characters.\n",
"with open(f\"{audio}_word-level.srt\", \"w\", encoding=\"utf-8\") as f:  # enter subtitle file name\n",
"    for segment in segments:\n",
"        # `or []` skips a segment that carries no word list\n",
"        # (presumably possible when word timestamps are unavailable - verify).\n",
"        for word in segment.words or []:\n",
"            count += 1\n",
"            duration = f\"{convert_seconds_to_hms(word.start)} --> {convert_seconds_to_hms(word.end)}\\n\"\n",
"            # lstrip() removes leading whitespace from the word text before it is written\n",
"            text = f\"{word.word.lstrip()}\\n\\n\"\n",
"            f.write(f\"{count}\\n{duration}{text}\")  # write the formatted cue to the file"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment