Skip to content

Instantly share code, notes, and snippets.

@yunginnanet
Last active March 11, 2025 05:54
Show Gist options
  • Save yunginnanet/dc76feb350a97fa5188ccc723ccaf4f3 to your computer and use it in GitHub Desktop.
Save yunginnanet/dc76feb350a97fa5188ccc723ccaf4f3 to your computer and use it in GitHub Desktop.
Bash script that uses whisper.cpp to transcribe MP4 files, counts keyword occurrences in the transcriptions, and stores the counts in an SQLite database.
#!/usr/bin/env bash
#
# Script-wide configuration and shared state.
#
# NOTE: an earlier `declare -f _grep` / `export _grep` pair was removed from
# here. `declare -f` only prints an already-defined function (none exists at
# this point) and `export NAME` exports a *variable*, never a function, so
# neither statement had any effect on the `_grep` helper.

# whisper.cpp model file handed to whisper-cli via -m.
_model="/usr/local/share/whisper.cpp/models/ggml-small.en-q5_1.bin"
# Thread count passed to fd when searching for videos.
_threads=3
# CSV file the per-file counts are written to.
declare -g _csv="/media/unsafe/pr0n/counts.csv"
# SQLite database the CSV is imported into.
declare -g _sql="/media/unsafe/pr0n/counts.db"
# Keyword list for the current run; filled in by merge_sql_tables.
declare -a -g _merged
# "true" when running in count_nones mode; used directly as a command.
declare only_none=false
# Search wrapper: uses ripgrep (`rg -u`) when it is installed and falls back
# to `grep -a` otherwise. All arguments are forwarded untouched.
# Arguments: any grep/rg options and pattern
# Returns:   the exit status of the search tool that ran
function _grep() {
  # `command -v` is a builtin and stays silent when the tool is missing;
  # `command NAME` runs the binary from PATH without re-entering this wrapper.
  # "$@" is forwarded directly so no global array of the caller is overwritten.
  if command -v rg >/dev/null 2>&1; then
    command rg -u "$@"
  else
    command grep -a "$@"
  fi
}
# Check that a file has at least three lines.
# Arguments: $1 - path of the file to inspect
# Outputs:   when the file is too short, a notice plus the file's content
#            (printed verbatim) on stdout
# Returns:   0 when the file has 3 or more lines, 1 otherwise
function lt3() {
  local file="$1" count content
  # An unreadable file counts as zero lines and is therefore reported as short.
  count="$(wc -l <"$file")" || count=0
  if ((count < 3)); then
    content="$(cat -- "$file")"
    # printf with %s keeps backslashes in the name/content literal, whereas
    # `echo -e` would expand them as escape sequences.
    printf "'%s' has less than 3 lines:\n- - - - -\n%s\n- - - - -\n\n" "$file" "$content"
    return 1
  fi
  return 0
}
# Speech-to-text for one audio file using whisper-cli.
# Globals:   _model (read), _res (written: path of the produced .vtt file)
# Arguments: $1 - input audio file
#            $2 - output path without extension (whisper-cli appends .vtt)
# Returns:   0 when a .vtt file was produced and passes the lt3 check,
#            1 on any failure
function _stt() {
  local -a whisper_args=(
    --entropy-thold 2.8 --max-context 64 --no-prints --suppress-nst
    -m "$_model" -ovtt "$1" -of "$2"
  )

  # soxi must be able to read the input before whisper is started.
  if ! soxi "$1"; then
    return 1
  fi

  whisper-cli "${whisper_args[@]}" || return 1

  _res="${2}.vtt"
  # Show the result file; a missing file is treated as a failure.
  if ! ls -lah "${_res}"; then
    return 1
  fi
  if lt3 "${_res}"; then
    return 0
  fi
  return 1
}
# Print the CSV header row: "filename" followed by one column per argument.
# Arguments: column names
# Outputs:   exactly one newline-terminated line on stdout, e.g.
#            "filename,foo,bar"; with no arguments the line is "filename,"
function header() {
  # "$*" joins the arguments with the first character of IFS, so a local
  # IFS of "," builds the row without a loop or a trailing-comma clean-up.
  # printf also keeps option-like names such as "-n" that `echo -n` would
  # swallow, and no extra blank line follows the header.
  local IFS=,
  printf '%s\n' "filename,$*"
}
# Apply the substitutions listed in TRANSCRIBE_SH_REPLACE to stdin.
# Globals:   TRANSCRIBE_SH_REPLACE (read) - comma separated "pattern=replacement"
#            rules; each becomes a sed `s|pattern|replacement|g`, applied in
#            the order given
# Inputs:    text lines on stdin
# Outputs:   each line with leading/trailing spaces and tabs removed and all
#            rules applied, on stdout
function _repl() {
  local -a rules=() script=()
  local rule pattern replacement rest

  # Strip surrounding blanks first, as the previous per-line `read` did.
  script+=(-e $'s/^[ \t]*//' -e $'s/[ \t]*$//')

  # Split on commas without exposing the value to pathname expansion.
  IFS=, read -r -a rules <<<"${TRANSCRIBE_SH_REPLACE:-}"
  for rule in "${rules[@]}"; do
    pattern="${rule%%=*}"
    # An empty pattern would produce `s|||g`, which sed rejects; skip it.
    [[ -n "$pattern" ]] || continue
    replacement=""
    if [[ "$rule" == *=* ]]; then
      # Only the text between the first and second "=" is the replacement.
      rest="${rule#*=}"
      replacement="${rest%%=*}"
    fi
    script+=(-e "s|${pattern}|${replacement}|g")
  done

  # A single sed process handles the whole stream instead of forking awk and
  # sed for every line and rule; lines such as "-n" pass through unharmed.
  sed "${script[@]}"
}
# Reduce a WebVTT transcript on stdin to its unique, sorted text lines.
# Lines containing ">" (the cue timing lines use "-->") and lines containing
# "WEBVTT" are dropped, the rest is passed through _repl and de-duplicated.
function _u() {
  # `sort -u` already removes every duplicate, so the former `uniq` stage in
  # front of it was redundant.
  _grep -v '>' |
    _grep -v 'WEBVTT' |
    _repl |
    sort -u
}
# Manual smoke test for the replacement pipeline: prints the rule set, a small
# sample transcript, and then the sample after it went through _u.
# Globals: TRANSCRIBE_SH_REPLACE (written), _test (written)
function _test_u() {
  TRANSCRIBE_SH_REPLACE="1=one,5=five,hello=world"
  printf '%s\n' "$TRANSCRIBE_SH_REPLACE"

  # Three-line sample: the WEBVTT marker plus two lines the rules apply to.
  _test=$'WEBVTT\n1,2,3,4,5\nhello!'
  printf '%s\n' "$_test"
  printf '%s\n' "$_test" | _u
}
# Count, for every keyword, the number of stdin lines that match it.
# Arguments: keywords (handed to _grep as case-insensitive patterns)
# Inputs:    transcript text on stdin
# Outputs:   one line on stdout of the form ",<n1>,<n2>,..." (leading comma,
#            one number per keyword, 0 when nothing matched)
# Returns:   0 when at least one keyword matched, 1 otherwise
function _counts() {
  local text key count
  local -i rc=1

  # Possessive "'s" suffixes are removed once, up front, instead of once per
  # keyword. Command substitution drops trailing newlines, so no phantom blank
  # line reaches the matcher.
  text="$(sed "s|'s||g")"

  for key in "$@"; do
    key="${key//$'\n'/}"
    # -e keeps a keyword that starts with "-" from being parsed as an option.
    if count="$(printf '%s\n' "$text" | _grep -i -c -e "$key")"; then
      printf ',%s' "$count"
      rc=0
    else
      printf ',0'
    fi
  done
  printf '\n'
  return "$rc"
}
# Print $1 zeros joined by commas (e.g. `_z 3` prints "0,0,0").
# Arguments: $1 - number of zeros
# Outputs:   one line on stdout; nothing at all when the count is 0 or less
# Returns:   0
function _z() {
  local n="$1" zeros=""
  local -i i
  # `i < n` terminates for negative counts too, where the former
  # "loop until equal" test would never stop.
  for ((i = 0; i < n; i++)); do
    zeros+="0,"
  done
  if [[ -n "$zeros" ]]; then
    printf '%s\n' "${zeros%,}"
  fi
  return 0
}
# Filter stdin, dropping rows whose count columns are all zero.
# Arguments: $1 - number of count columns at the end of each row
# Inputs:    CSV rows on stdin ("name,<c1>,...,<cN>")
# Outputs:   the rows that have at least one non-zero count
# Returns:   the status of the underlying inverted match (non-zero when no
#            row is left)
function clean() {
  local zeros
  zeros="$(_z "$1")"
  if [[ -z "$zeros" ]]; then
    # No count columns: keep the previous behaviour, where the empty pattern
    # matches (and therefore removes) every row.
    _grep -v -e ''
    return
  fi
  # Anchor on the preceding comma and the end of the line. A bare "0,0,0"
  # also matches inside rows such as "f,10,0,0" and would discard them.
  _grep -v -e ",${zeros}\$"
}
# Transcribe a single video: extract 16 kHz mono PCM audio with ffmpeg, run
# whisper on it via _stt and remove the intermediate .wav afterwards.
# The .wav and .vtt files are created in the current directory and are named
# after the video's base name.
# Arguments: $1 - path of the .mp4 file
# Returns:   2 when a .vtt for this video already exists,
#            1 when the transcription failed,
#            otherwise the status of the final .wav removal
function transcribe() {
  local src="$1"
  local base target wav
  base="$(basename -- "$src" .mp4)"
  target="${base}.vtt"
  wav="${base}.wav"

  if [[ -e "$target" ]]; then
    echo "already transcribed: '$src'"
    # A left-over .wav from an earlier run is no longer needed.
    rm -v "${wav}" 2>/dev/null
    return 2
  fi

  echo -e "\n+ + + + + transcribing:\t'$src'\n"
  # Reuse an existing .wav instead of converting again.
  if [[ ! -e "$wav" ]]; then
    echo -e "\n- - - - - ffmpeg:\t'$src' -> ${wav}\n"
    ffmpeg -nostdin -hide_banner -v warning -stats -i "$src" -ac 1 -ar 16000 -acodec pcm_s16le "${wav}"
  fi

  echo -e "\n- - - - - whisper:\t'${wav}'\n"
  if ! _stt "${wav}" "${base}"; then
    echo -e "\n!!! failed to transcribe '$src.wav' !!!\n"
    # An explicit status is required: a bare `return` would hand back the
    # status of the echo above and report the failure as success.
    return 1
  fi

  echo -e "\n- - - - - done:\t'$src'\n"
  rm -v "${wav}"
}
# Find every .mp4 whose path matches $1 (case-insensitively) and run this
# script's `transcribe_one` command on each hit through `fd --exec`.
# Globals:   _threads (read)
# Arguments: $1 - regex fragment to look for in the file name (may be empty)
# Returns:   the exit status of fd
function run_transcribe() {
  local suffix='\.mp4$'
  local pattern=".*${1}.*${suffix}"

  echo "transcribing videos...."
  echo "searching for: '$pattern'"
  fd -i --threads="$_threads" --type file "${pattern}" --exec "$0" transcribe_one {}
}
# Count keyword matches in every .vtt file found below the current directory.
# Normal mode: writes a CSV (header plus one row per file with at least one
# match) to $_csv and imports it with run_sql.
# count_nones mode (only_none=true): prints one row per file in which none of
# the keywords matched; no SQL import happens.
# Globals:   _csv (read), _merged (set through merge_sql_tables), only_none (read)
# Arguments: keywords to count, merged with the ones merge_sql_tables adds
# Returns:   the status of run_sql in normal mode, 0 in count_nones mode
function run_count() {
  local path name counts row
  local -i arglen

  if $only_none; then echo "(finding files with no matches)"; fi
  # Keep the previous CSV as a backup; a missing file is not an error.
  # NOTE(review): this also runs in count_nones mode, where the CSV is then
  # left holding only the header row - confirm that is intended.
  mv -vf "${_csv}" "${_csv}.bak" 2>/dev/null
  merge_sql_tables "${@}"
  arglen=${#_merged[@]}
  header "${_merged[@]}" | tee "${_csv}"

  while IFS= read -r path; do
    name="$(basename "$path" .mp4)"
    # Commas in a file name would shift the CSV columns.
    name="${name//,/_}"
    if counts="$(_u <"$path" | _counts "${_merged[@]}")"; then
      # _counts succeeded: at least one keyword matched in this file.
      if ! $only_none; then
        if row="$(printf '%s\n' "${name}${counts}" | clean "$arglen")"; then
          printf '%s\n' "$row" | tee -a "${_csv}"
        fi
      fi
    elif $only_none; then
      # _counts failed: nothing matched. These are exactly the files
      # count_nones is after; previously this branch was unreachable because
      # the row was only inspected after _counts had succeeded.
      printf '%s\n' "${name}${counts}"
    fi
  done < <(fd --type=file '\.vtt$')

  if ! $only_none; then run_sql; fi
}
# List the names of the FLOAT columns found in the schema of the SQLite
# database, one per line.
# Globals:   _sql (read)
# Outputs:   column names on stdout; nothing when there are none
# Returns:   1 when sqlite3 is not available, 0 otherwise
function get_sql_tables() {
  command -v sqlite3 >/dev/null 2>&1 || return 1

  local name
  # Keep the text in front of "FLOAT" on each schema line, drop tabs and
  # double quotes, and let `read` trim the surrounding blanks.
  sqlite3 "${_sql}" '.schema' |
    _grep FLOAT |
    awk -F 'FLOAT' '{print $1}' |
    tr -d '\t"' |
    while read -r name; do
      # Never emit an empty name: it would be picked up as a keyword that
      # matches every line.
      if [[ -n "$name" ]]; then
        printf '%s\n' "$name"
      fi
    done
  return 0
}
# Build the keyword list for a run: the columns already present in the
# database (get_sql_tables) plus the keywords given as arguments, sorted and
# without duplicates or empty entries.
# Globals:   _merged (written)
# Arguments: additional keywords
# Outputs:   the merged keywords on stdout, one per line
# Returns:   1 when sqlite3 is not available (leaving _merged untouched),
#            0 otherwise
function merge_sql_tables() {
  command -v sqlite3 >/dev/null 2>&1 || return 1

  local name
  # `sort -u` alone de-duplicates. The former `uniq -u` in front of it removed
  # *both* copies of adjacent duplicates, so a keyword that was also the last
  # database column vanished from the list. Empty lines are filtered so that
  # "no keywords" yields an empty array rather than one empty keyword.
  readarray -t _merged < <(
    {
      get_sql_tables
      printf '%s\n' "$@"
    } | sed '/^$/d' | sort -u
  )

  for name in "${_merged[@]}"; do
    printf '%s\n' "$name"
  done
  return 0
}
# Import the counts CSV into the SQLite database with csvsql, then compact and
# re-analyze the database.
# Globals: _csv (read), _sql (read)
# Returns: 1 when sqlite3 or csvsql cannot be found, otherwise the status of
#          the final sqlite3 call
function run_sql() {
  local tool
  # Both tools are needed; bail out before touching anything if one is missing.
  for tool in sqlite3 csvsql; do
    which "$tool" >/dev/null || return 1
  done

  local -a import_args=(
    --verbose -d ',' "${_csv}" -S
    --db "sqlite:///${_sql}"
    --overwrite --insert
    --prefix "OR REPLACE"
    --unique-constraint=filename
    --create-if-not-exists -v
  )
  csvsql "${import_args[@]}"

  sqlite3 -echo "${_sql}" "VACUUM; ANALYZE;"
}
# set -x
# Command dispatch: the first argument selects the sub-command, the remaining
# arguments are handed to it. Every known command ends the script itself
# (exit 1 when its handler fails, exit 0 otherwise); anything else falls
# through to the usage line below.
_cmd="$1"
shift 1
case "$_cmd" in
  transcribe)
    # search for matching videos and transcribe each of them
    run_transcribe "$1" || exit 1
    exit 0
    ;;
  transcribe_one)
    # transcribe exactly one file (used by run_transcribe through fd --exec)
    transcribe "$1" || exit 1
    exit 0
    ;;
  count)
    args=("$@")
    run_count "${args[@]}" || exit 1
    exit 0
    ;;
  count_nones)
    # same as count, with the "only files without matches" switch turned on
    only_none=true
    export only_none
    args=("$@")
    run_count "${args[@]}" || exit 1
    exit 0
    ;;
  test_replacer)
    _test_u
    exit 0
    ;;
esac
echo -e "unknown command. \n commands:\tcount, transcribe"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment