sebington · February 1, 2025 01:36
diff --git a/faster-whisper_word-level.ipynb b/faster-whisper_word-level.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Word-level transcriptions with Faster-Whisper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "xRe-wO2gWNN4"
   },
   "outputs": [],
   "source": [
    "# pip install faster-whisper -q\n",
    "from faster_whisper import WhisperModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "i8yDvreNvETR"
   },
   "outputs": [],
   "source": [
    "# model initialization (run on GPU with FP16 or on CPU with int8)\n",
    "model = WhisperModel(\"small.en\", device=\"cpu\", compute_type=\"int8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "id": "UCZ_UwAiH5sG"
   },
   "outputs": [],
   "source": [
    "# load an audio file\n",
    "audio = \"bbc_ai_edit.mp3\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Transcribe a file at word level"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.00s -> 0.48s]  Stuart,\n",
      "[0.58s -> 0.74s]  thank\n",
      "[0.74s -> 0.94s]  you\n",
      "[0.94s -> 1.30s]  so\n",
      "[1.30s -> 1.70s]  much.\n",
      "[2.26s -> 2.28s]  You\n",
      "[2.28s -> 2.38s]  know,\n",
      "[2.40s -> 2.70s]  it's\n",
      "[2.70s -> 3.30s]  absolutely\n",
      "[3.30s -> 3.94s]  fascinating\n",
      "[3.94s -> 4.38s]  to\n",
      "[4.38s -> 4.60s]  me\n",
      "[4.60s -> 4.86s]  that\n",
      "[4.86s -> 5.62s]  you\n",
      "[5.62s -> 5.78s]  are\n",
      "[5.78s -> 5.90s]  an\n",
      "[5.90s -> 6.22s]  expert\n",
      "[6.22s -> 6.46s]  in\n",
      "[6.46s -> 6.80s]  artificial\n",
      "[6.80s -> 7.46s]  intelligence\n",
      "[7.46s -> 7.80s]  and\n",
      "[7.80s -> 8.16s]  you\n",
      "[8.16s -> 8.32s]  have\n",
      "[8.32s -> 8.52s]  been\n",
      "[8.52s -> 8.88s]  taking\n",
      "[8.88s -> 9.18s]  us\n",
      "[9.18s -> 9.32s]  on\n",
      "[9.32s -> 9.56s]  this\n",
      "[9.56s -> 10.44s]  relay\n",
      "[10.44s -> 11.10s]  race\n",
      "[11.10s -> 11.46s]  from\n",
      "[11.46s -> 11.82s]  Bagret\n",
      "[11.82s -> 12.04s]  to\n",
      "[12.04s -> 12.38s]  yourself\n",
      "[12.38s -> 12.84s]  to\n",
      "[12.84s -> 13.18s]  who\n",
      "[13.18s -> 13.38s]  knows\n",
      "[13.38s -> 13.78s]  what.\n",
      "[14.26s -> 14.42s]  And\n",
      "[14.42s -> 14.76s]  all\n",
      "[14.76s -> 14.96s]  I\n",
      "[14.96s -> 15.12s]  kept\n",
      "[15.12s -> 15.44s]  thinking\n",
      "[15.44s -> 15.72s]  about\n",
      "[15.72s -> 15.92s]  is\n",
      "[15.92s -> 16.42s]  what\n",
      "[16.42s -> 16.60s]  does\n",
      "[16.60s -> 16.76s]  this\n",
      "[16.76s -> 16.92s]  say\n",
      "[16.92s -> 17.10s]  about\n",
      "[17.10s -> 17.22s]  the\n",
      "[17.22s -> 17.46s]  human\n",
      "[17.46s -> 18.00s]  condition?\n",
      "[18.56s -> 18.64s]  You\n",
      "[18.64s -> 18.74s]  know,\n",
      "[18.80s -> 18.90s]  what\n",
      "[18.90s -> 19.36s]  it\n",
      "[19.36s -> 19.54s]  is\n",
      "[19.54s -> 19.70s]  to\n",
      "[19.70s -> 19.88s]  be\n",
      "[19.88s -> 20.36s]  human.\n",
      "[20.84s -> 21.06s]  And\n",
      "[21.06s -> 21.26s]  you\n",
      "[21.26s -> 21.82s]  laid\n",
      "[21.82s -> 22.06s]  out\n",
      "[22.06s -> 22.30s]  very\n",
      "[22.30s -> 22.84s]  clearly\n",
      "[22.84s -> 23.60s]  what\n",
      "[23.60s -> 23.92s]  people\n",
      "[23.92s -> 24.28s]  think\n",
      "[24.28s -> 24.60s]  might\n",
      "[24.60s -> 24.88s]  be\n",
      "[24.88s -> 25.02s]  the\n",
      "[25.02s -> 25.78s]  eventuality\n",
      "[25.78s -> 25.94s]  of\n",
      "[25.94s -> 26.24s]  having\n",
      "[26.24s -> 26.42s]  the\n",
      "[26.42s -> 26.54s]  end\n",
      "[26.54s -> 26.68s]  of\n",
      "[26.68s -> 27.08s]  work.\n",
      "[27.08s -> 27.92s]  I\n",
      "[27.92s -> 28.04s]  want\n",
      "[28.04s -> 28.18s]  to\n",
      "[28.18s -> 28.28s]  know\n",
      "[28.28s -> 28.40s]  what\n",
      "[28.40s -> 28.70s]  you\n",
      "[28.70s -> 29.10s]  think\n",
      "[29.10s -> 29.78s]  it\n",
      "[29.78s -> 30.02s]  will\n",
      "[30.02s -> 30.22s]  be\n",
      "[30.22s -> 30.46s]  like.\n",
      "[30.58s -> 30.58s]  You\n",
      "[30.58s -> 30.68s]  know,\n",
      "[30.80s -> 31.02s]  there\n",
      "[31.02s -> 31.16s]  are\n",
      "[31.16s -> 31.40s]  two\n",
      "[31.40s -> 31.86s]  scenarios.\n",
      "[32.06s -> 32.28s]  One,\n",
      "[32.80s -> 33.04s]  I\n",
      "[33.04s -> 33.38s]  lose\n",
      "[33.38s -> 34.08s]  every\n",
      "[34.08s -> 34.60s]  excuse\n",
      "[34.60s -> 35.18s]  not\n",
      "[35.18s -> 35.40s]  to\n",
      "[35.40s -> 35.54s]  learn\n",
      "[35.54s -> 35.68s]  the\n",
      "[35.68s -> 36.04s]  piano\n",
      "[36.04s -> 36.24s]  and\n",
      "[36.24s -> 36.42s]  I'm\n",
      "[36.42s -> 36.60s]  much\n",
      "[36.60s -> 36.88s]  nicer\n",
      "[36.88s -> 37.08s]  to\n",
      "[37.08s -> 37.24s]  my\n",
      "[37.24s -> 37.52s]  children\n",
      "[37.52s -> 37.82s]  and\n",
      "[37.82s -> 37.98s]  I\n",
      "[37.98s -> 38.52s]  catch\n",
      "[38.52s -> 38.78s]  up\n",
      "[38.78s -> 38.96s]  on\n",
      "[38.96s -> 39.18s]  all\n",
      "[39.18s -> 39.28s]  of\n",
      "[39.28s -> 39.36s]  the\n",
      "[39.36s -> 39.58s]  wonderful\n",
      "[39.58s -> 39.82s]  things\n",
      "[39.82s -> 40.02s]  on\n",
      "[40.02s -> 40.22s]  Radio\n",
      "[40.22s -> 40.66s]  4.\n",
      "[40.98s -> 41.38s]  Or,\n",
      "[41.70s -> 41.84s]  you\n",
      "[41.84s -> 41.98s]  know,\n",
      "[42.22s -> 42.36s]  that\n",
      "[42.36s -> 43.10s]  terrifying\n",
      "[43.10s -> 44.00s]  dystopia,\n",
      "[44.26s -> 44.34s]  the\n",
      "[44.34s -> 44.86s]  animation\n",
      "[44.86s -> 45.18s]  Wall\n",
      "[45.18s -> 45.48s] -E,\n",
      "[45.68s -> 45.76s]  where\n",
      "[45.76s -> 45.90s]  we\n",
      "[45.90s -> 46.04s]  all\n",
      "[46.04s -> 46.16s]  sit\n",
      "[46.16s -> 46.30s]  on\n",
      "[46.30s -> 46.42s]  our\n",
      "[46.42s -> 46.74s]  bottoms\n",
      "[46.74s -> 46.94s]  getting\n",
      "[46.94s -> 47.48s]  fatter,\n",
      "[47.82s -> 48.12s]  watching\n",
      "[48.12s -> 49.02s]  infomercials.\n",
      "[49.02s -> 49.08s]  I\n",
      "[49.08s -> 49.18s]  mean,\n",
      "[49.24s -> 49.32s]  what\n",
      "[49.32s -> 49.48s]  do\n",
      "[49.48s -> 49.66s]  you\n",
      "[49.66s -> 49.98s]  think\n",
      "[49.98s -> 50.52s]  the\n",
      "[50.52s -> 50.82s]  human\n",
      "[50.82s -> 51.32s]  condition\n",
      "[51.32s -> 51.76s]  leans\n",
      "[51.76s -> 52.22s]  towards?\n"
     ]
    }
   ],
   "source": [
    "# transcribes and displays results (word level)\n",
    "segments, _ = model.transcribe(audio, language=\"en\", word_timestamps=True)\n",
    "segments = list(segments)\n",
    "\n",
    "for segment_wl in segments:\n",
    "    for word in segment_wl.words:\n",
    "        print(\"[%.2fs -> %.2fs] %s\" % (word.start, word.end, word.word))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create SRT subs and write to file\n",
    "import math\n",
    "\n",
    "def convert_seconds_to_hms(seconds):\n",
    "    hours, remainder = divmod(seconds, 3600)\n",
    "    minutes, seconds = divmod(remainder, 60)\n",
    "    milliseconds = math.floor((seconds % 1) * 1000)\n",
    "    output = f\"{int(hours):02}:{int(minutes):02}:{int(seconds):02},{milliseconds:03}\"\n",
    "    return output\n",
    "\n",
    "count = 0\n",
    "with open(f\"{audio}_word-level.srt\", 'w') as f: # enter subtitle file name\n",
    "      for segment in segments:\n",
    "        for word in segment.words:\n",
    "            count +=1\n",
    "            duration = f\"{convert_seconds_to_hms(word.start)} --> {convert_seconds_to_hms(word.end)}\\n\"\n",
    "            text = f\"{word.word.lstrip()}\\n\\n\"\n",
    "            f.write(f\"{count}\\n{duration}{text}\")  # Write formatted string to the file"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Word-level transcriptions with Faster-Whisper"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"id": "xRe-wO2gWNN4"
	},
	"outputs": [],
	"source": [
	"# pip install faster-whisper -q\n",
	"from faster_whisper import WhisperModel"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"id": "i8yDvreNvETR"
	},
	"outputs": [],
	"source": [
	"# model initialization (run on GPU with FP16 or on CPU with int8)\n",
	"model = WhisperModel(\"small.en\", device=\"cpu\", compute_type=\"int8\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"id": "UCZ_UwAiH5sG"
	},
	"outputs": [],
	"source": [
	"# load an audio file\n",
	"audio = \"bbc_ai_edit.mp3\""
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Transcribe a file at word level"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[0.00s -> 0.48s] Stuart,\n",
	"[0.58s -> 0.74s] thank\n",
	"[0.74s -> 0.94s] you\n",
	"[0.94s -> 1.30s] so\n",
	"[1.30s -> 1.70s] much.\n",
	"[2.26s -> 2.28s] You\n",
	"[2.28s -> 2.38s] know,\n",
	"[2.40s -> 2.70s] it's\n",
	"[2.70s -> 3.30s] absolutely\n",
	"[3.30s -> 3.94s] fascinating\n",
	"[3.94s -> 4.38s] to\n",
	"[4.38s -> 4.60s] me\n",
	"[4.60s -> 4.86s] that\n",
	"[4.86s -> 5.62s] you\n",
	"[5.62s -> 5.78s] are\n",
	"[5.78s -> 5.90s] an\n",
	"[5.90s -> 6.22s] expert\n",
	"[6.22s -> 6.46s] in\n",
	"[6.46s -> 6.80s] artificial\n",
	"[6.80s -> 7.46s] intelligence\n",
	"[7.46s -> 7.80s] and\n",
	"[7.80s -> 8.16s] you\n",
	"[8.16s -> 8.32s] have\n",
	"[8.32s -> 8.52s] been\n",
	"[8.52s -> 8.88s] taking\n",
	"[8.88s -> 9.18s] us\n",
	"[9.18s -> 9.32s] on\n",
	"[9.32s -> 9.56s] this\n",
	"[9.56s -> 10.44s] relay\n",
	"[10.44s -> 11.10s] race\n",
	"[11.10s -> 11.46s] from\n",
	"[11.46s -> 11.82s] Bagret\n",
	"[11.82s -> 12.04s] to\n",
	"[12.04s -> 12.38s] yourself\n",
	"[12.38s -> 12.84s] to\n",
	"[12.84s -> 13.18s] who\n",
	"[13.18s -> 13.38s] knows\n",
	"[13.38s -> 13.78s] what.\n",
	"[14.26s -> 14.42s] And\n",
	"[14.42s -> 14.76s] all\n",
	"[14.76s -> 14.96s] I\n",
	"[14.96s -> 15.12s] kept\n",
	"[15.12s -> 15.44s] thinking\n",
	"[15.44s -> 15.72s] about\n",
	"[15.72s -> 15.92s] is\n",
	"[15.92s -> 16.42s] what\n",
	"[16.42s -> 16.60s] does\n",
	"[16.60s -> 16.76s] this\n",
	"[16.76s -> 16.92s] say\n",
	"[16.92s -> 17.10s] about\n",
	"[17.10s -> 17.22s] the\n",
	"[17.22s -> 17.46s] human\n",
	"[17.46s -> 18.00s] condition?\n",
	"[18.56s -> 18.64s] You\n",
	"[18.64s -> 18.74s] know,\n",
	"[18.80s -> 18.90s] what\n",
	"[18.90s -> 19.36s] it\n",
	"[19.36s -> 19.54s] is\n",
	"[19.54s -> 19.70s] to\n",
	"[19.70s -> 19.88s] be\n",
	"[19.88s -> 20.36s] human.\n",
	"[20.84s -> 21.06s] And\n",
	"[21.06s -> 21.26s] you\n",
	"[21.26s -> 21.82s] laid\n",
	"[21.82s -> 22.06s] out\n",
	"[22.06s -> 22.30s] very\n",
	"[22.30s -> 22.84s] clearly\n",
	"[22.84s -> 23.60s] what\n",
	"[23.60s -> 23.92s] people\n",
	"[23.92s -> 24.28s] think\n",
	"[24.28s -> 24.60s] might\n",
	"[24.60s -> 24.88s] be\n",
	"[24.88s -> 25.02s] the\n",
	"[25.02s -> 25.78s] eventuality\n",
	"[25.78s -> 25.94s] of\n",
	"[25.94s -> 26.24s] having\n",
	"[26.24s -> 26.42s] the\n",
	"[26.42s -> 26.54s] end\n",
	"[26.54s -> 26.68s] of\n",
	"[26.68s -> 27.08s] work.\n",
	"[27.08s -> 27.92s] I\n",
	"[27.92s -> 28.04s] want\n",
	"[28.04s -> 28.18s] to\n",
	"[28.18s -> 28.28s] know\n",
	"[28.28s -> 28.40s] what\n",
	"[28.40s -> 28.70s] you\n",
	"[28.70s -> 29.10s] think\n",
	"[29.10s -> 29.78s] it\n",
	"[29.78s -> 30.02s] will\n",
	"[30.02s -> 30.22s] be\n",
	"[30.22s -> 30.46s] like.\n",
	"[30.58s -> 30.58s] You\n",
	"[30.58s -> 30.68s] know,\n",
	"[30.80s -> 31.02s] there\n",
	"[31.02s -> 31.16s] are\n",
	"[31.16s -> 31.40s] two\n",
	"[31.40s -> 31.86s] scenarios.\n",
	"[32.06s -> 32.28s] One,\n",
	"[32.80s -> 33.04s] I\n",
	"[33.04s -> 33.38s] lose\n",
	"[33.38s -> 34.08s] every\n",
	"[34.08s -> 34.60s] excuse\n",
	"[34.60s -> 35.18s] not\n",
	"[35.18s -> 35.40s] to\n",
	"[35.40s -> 35.54s] learn\n",
	"[35.54s -> 35.68s] the\n",
	"[35.68s -> 36.04s] piano\n",
	"[36.04s -> 36.24s] and\n",
	"[36.24s -> 36.42s] I'm\n",
	"[36.42s -> 36.60s] much\n",
	"[36.60s -> 36.88s] nicer\n",
	"[36.88s -> 37.08s] to\n",
	"[37.08s -> 37.24s] my\n",
	"[37.24s -> 37.52s] children\n",
	"[37.52s -> 37.82s] and\n",
	"[37.82s -> 37.98s] I\n",
	"[37.98s -> 38.52s] catch\n",
	"[38.52s -> 38.78s] up\n",
	"[38.78s -> 38.96s] on\n",
	"[38.96s -> 39.18s] all\n",
	"[39.18s -> 39.28s] of\n",
	"[39.28s -> 39.36s] the\n",
	"[39.36s -> 39.58s] wonderful\n",
	"[39.58s -> 39.82s] things\n",
	"[39.82s -> 40.02s] on\n",
	"[40.02s -> 40.22s] Radio\n",
	"[40.22s -> 40.66s] 4.\n",
	"[40.98s -> 41.38s] Or,\n",
	"[41.70s -> 41.84s] you\n",
	"[41.84s -> 41.98s] know,\n",
	"[42.22s -> 42.36s] that\n",
	"[42.36s -> 43.10s] terrifying\n",
	"[43.10s -> 44.00s] dystopia,\n",
	"[44.26s -> 44.34s] the\n",
	"[44.34s -> 44.86s] animation\n",
	"[44.86s -> 45.18s] Wall\n",
	"[45.18s -> 45.48s] -E,\n",
	"[45.68s -> 45.76s] where\n",
	"[45.76s -> 45.90s] we\n",
	"[45.90s -> 46.04s] all\n",
	"[46.04s -> 46.16s] sit\n",
	"[46.16s -> 46.30s] on\n",
	"[46.30s -> 46.42s] our\n",
	"[46.42s -> 46.74s] bottoms\n",
	"[46.74s -> 46.94s] getting\n",
	"[46.94s -> 47.48s] fatter,\n",
	"[47.82s -> 48.12s] watching\n",
	"[48.12s -> 49.02s] infomercials.\n",
	"[49.02s -> 49.08s] I\n",
	"[49.08s -> 49.18s] mean,\n",
	"[49.24s -> 49.32s] what\n",
	"[49.32s -> 49.48s] do\n",
	"[49.48s -> 49.66s] you\n",
	"[49.66s -> 49.98s] think\n",
	"[49.98s -> 50.52s] the\n",
	"[50.52s -> 50.82s] human\n",
	"[50.82s -> 51.32s] condition\n",
	"[51.32s -> 51.76s] leans\n",
	"[51.76s -> 52.22s] towards?\n"
	]
	}
	],
	"source": [
	"# transcribes and displays results (word level)\n",
	"segments, _ = model.transcribe(audio, language=\"en\", word_timestamps=True)\n",
	"segments = list(segments)\n",
	"\n",
	"for segment_wl in segments:\n",
	" for word in segment_wl.words:\n",
	" print(\"[%.2fs -> %.2fs] %s\" % (word.start, word.end, word.word))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"# create SRT subs and write to file\n",
	"import math\n",
	"\n",
	"def convert_seconds_to_hms(seconds):\n",
	" hours, remainder = divmod(seconds, 3600)\n",
	" minutes, seconds = divmod(remainder, 60)\n",
	" milliseconds = math.floor((seconds % 1) * 1000)\n",
	" output = f\"{int(hours):02}:{int(minutes):02}:{int(seconds):02},{milliseconds:03}\"\n",
	" return output\n",
	"\n",
	"count = 0\n",
	"with open(f\"{audio}_word-level.srt\", 'w') as f: # enter subtitle file name\n",
	" for segment in segments:\n",
	" for word in segment.words:\n",
	" count +=1\n",
	" duration = f\"{convert_seconds_to_hms(word.start)} --> {convert_seconds_to_hms(word.end)}\\n\"\n",
	" text = f\"{word.word.lstrip()}\\n\\n\"\n",
	" f.write(f\"{count}\\n{duration}{text}\") # Write formatted string to the file"
	]
	}
	],
	"metadata": {
	"accelerator": "GPU",
	"colab": {
	"provenance": []
	},
	"kernelspec": {
	"display_name": "base",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.12.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}