Last active
March 11, 2025 05:54
-
-
Save yunginnanet/dc76feb350a97fa5188ccc723ccaf4f3 to your computer and use it in GitHub Desktop.
Bash script that uses whisper.cpp to transcribe mp4 files, counts word occurrences in the transcriptions, and stores the counts in an SQLite database.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Transcribe mp4 files with whisper.cpp (whisper-cli) and count keyword
# occurrences in the resulting .vtt transcripts. Counts are written to a CSV
# file and loaded into an SQLite database.

# Model file handed to whisper-cli via -m.
_model="/usr/local/share/whisper.cpp/models/ggml-small.en-q5_1.bin"
# Thread count handed to fd via --threads.
_threads=3
# NOTE: _grep is deliberately not exported. Parallel workers are started by
# re-invoking this script ("$0" transcribe_one ...), so every worker defines
# its own copy of the helper functions.
# CSV file that receives one row of counts per transcript.
declare -g _csv="/media/unsafe/pr0n/counts.csv"
# SQLite database the CSV is imported into.
declare -g _sql="/media/unsafe/pr0n/counts.db"
# Keyword list: database columns merged with the command-line keywords.
declare -a -g _merged
# "true" when only transcripts without any keyword match should be reported.
declare only_none=false
# Search wrapper: runs ripgrep (`rg -u`) when it is installed and falls back
# to `grep -a` otherwise.
# Arguments: passed through unchanged to the selected tool.
# Input:     whatever the selected tool reads (stdin in this script).
# Returns:   the exit status of the selected tool.
function _grep() {
  # `command -v` is the portable replacement for `which`; `command NAME`
  # runs the external binary even if a function of that name exists.
  if command -v rg >/dev/null 2>&1; then
    command rg -u "$@"
  else
    command grep -a "$@"
  fi
}
# Check that a file has at least three lines.
# Arguments: $1 - path of the file to inspect.
# Outputs:   when the file is too short, a notice followed by the file's
#            content on stdout; when it cannot be read, a notice on stderr.
# Returns:   0 if the file has 3 or more lines, 1 otherwise.
function lt3() {
  local file="$1"
  local count
  if [[ ! -r "$file" ]]; then
    printf "'%s' is missing or unreadable\n" "$file" >&2
    return 1
  fi
  count="$(wc -l <"$file")"
  if ((count < 3)); then
    # %s keeps backslashes in the file name and content literal
    # (echo -e would interpret them as escape sequences).
    printf "'%s' has less than 3 lines:\n- - - - -\n%s\n- - - - -\n\n" \
      "$file" "$(cat "$file")"
    return 1
  fi
  return 0
}
# Run whisper-cli on an audio file and verify that a usable .vtt was written.
# Globals:   _model (read) - model file passed to whisper-cli.
# Arguments: $1 - audio file to transcribe.
#            $2 - output path without extension; "<$2>.vtt" is expected.
# Returns:   0 on success; 1 if soxi rejects the input, whisper-cli fails,
#            the .vtt is missing, or lt3 reports it as too short.
function _stt() {
  local audio="$1"
  local out_base="$2"
  local vtt="${out_base}.vtt"

  # soxi prints the audio file's header information; fails on unreadable input.
  soxi "$audio" || return 1
  whisper-cli --entropy-thold 2.8 --max-context 64 --no-prints --suppress-nst \
    -m "$_model" -ovtt "$audio" -of "$out_base" || return 1
  # Show the result file; also fails when whisper-cli produced nothing.
  ls -lah "$vtt" || return 1
  lt3 "$vtt" || return 1
  return 0
}
# Print the CSV header row: "filename" followed by one column per keyword.
# Arguments: keyword column names.
# Outputs:   exactly one comma-separated line on stdout (no trailing comma,
#            no blank line after it).
function header() {
  local -a columns=("filename" "$@")
  # "${columns[*]}" joins the elements with the first character of IFS.
  local IFS=','
  printf '%s\n' "${columns[*]}"
}
# Apply the user's replacement rules to every line read from stdin.
# Globals:   TRANSCRIBE_SH_REPLACE (read) - comma-separated "pattern=text"
#            rules, e.g. "1=one,5=five". Rules are applied in order; each
#            pattern is used as a sed regular expression.
# Outputs:   the rewritten lines on stdout, with leading and trailing spaces
#            and tabs removed.
# NOTE: '|' is the sed delimiter, so it cannot appear in a rule.
function _repl() {
  local -a rules=()
  local -a sed_args=()
  local rule pattern replacement

  # Trim blanks first (before any rule is applied).
  sed_args+=(-e $'s/^[ \t]*//' -e $'s/[ \t]*$//')

  IFS=',' read -r -a rules <<<"${TRANSCRIBE_SH_REPLACE:-}"
  for rule in "${rules[@]}"; do
    # Field 1 of an '='-separated rule is the pattern, field 2 the text.
    pattern="${rule%%=*}"
    replacement=''
    if [[ "$rule" == *=* ]]; then
      replacement="${rule#*=}"
      replacement="${replacement%%=*}"
    fi
    # An empty pattern would make sed fail; skip empty or malformed rules.
    [[ -n "$pattern" ]] || continue
    sed_args+=(-e "s|${pattern}|${replacement}|g")
  done

  # One sed process for the whole stream instead of several per line.
  sed "${sed_args[@]}"
}
# Reduce a .vtt transcript on stdin to its unique, sorted text lines.
# Drops every line containing '>' (the "-->" cue timing lines) and every line
# containing "WEBVTT", applies the replacement rules via _repl, then
# de-duplicates with sort -u.
function _u() {
  _grep -v '>' | _grep -v 'WEBVTT' | _repl | sort -u
}
# Manual smoke test for the replacement pipeline: prints the sample rules,
# the sample transcript, and the transcript after it has gone through _u.
function _test_u() {
  # local: the sample rules must not overwrite the caller's setting; _u still
  # sees them because it is called from inside this function.
  local TRANSCRIBE_SH_REPLACE="1=one,5=five,hello=world"
  local sample=$'WEBVTT\n1,2,3,4,5\nhello!'

  printf '%s\n' "$TRANSCRIBE_SH_REPLACE"
  printf '%s\n' "$sample"
  printf '%s\n' "$sample" | _u
}
# Count, for each keyword, the input lines that contain it.
# Input:     transcript text on stdin.
# Arguments: the keywords; each is used as a case-insensitive search pattern.
# Outputs:   one line ",<n1>,<n2>,..." on stdout, one count per keyword.
# Returns:   0 if at least one keyword matched, 1 if none did.
function _counts() {
  local text keyword hits
  local status=1

  # Strip possessive "'s" once, up front, so "dog's" also counts for "dog".
  text="$(sed "s|'s||g")"

  for keyword in "$@"; do
    keyword="${keyword//$'\n'/}"
    # -e keeps a keyword that starts with '-' from being read as an option.
    # The search tool exits non-zero when nothing matched.
    if hits="$(printf '%s\n' "$text" | _grep -i -c -e "$keyword")"; then
      printf ',%s' "$hits"
      status=0
    else
      printf ',0'
    fi
  done
  printf '\n'
  return "$status"
}
# Print a comma-separated run of zeros, e.g. `_z 3` prints "0,0,0".
# Arguments: $1 - how many zeros to print.
# Outputs:   the zero list plus a newline on stdout; nothing at all when the
#            count is 0, empty, or not a non-negative integer.
# Returns:   0 always.
function _z() {
  local count="${1:-0}"
  local zeros=''
  local i

  # Reject negatives and junk up front so the loop is always bounded.
  [[ "$count" =~ ^[0-9]+$ ]] || return 0

  # 10# forces base 10 so a count such as "08" is not parsed as octal.
  for ((i = 0; i < 10#$count; i++)); do
    zeros+='0,'
  done
  if [[ -n "$zeros" ]]; then
    printf '%s\n' "${zeros%,}"
  fi
  return 0
}
# Filter out CSV rows whose counts are all zero.
# Input:     rows of the form "<name>,<c1>,...,<cN>" on stdin.
# Arguments: $1 - N, the number of count columns per row.
# Outputs:   the rows that have at least one non-zero count.
# Returns:   the search tool's status (non-zero when every row was dropped);
#            0 when N yields no zero list and the input is passed through.
function clean() {
  local zeros
  zeros="$(_z "$1")"
  if [[ -z "$zeros" ]]; then
    # No count columns: nothing can be all-zero, keep every row.
    cat
    return
  fi
  # Anchor on the leading comma and end of line so that only N complete
  # zero fields match; a bare "0,0,0" would also hit rows such as
  # "clip,10,0,0" or a file name that ends in 0.
  _grep -v -e ",${zeros}\$"
}
# Transcribe one video: extract 16 kHz mono PCM audio with ffmpeg, run _stt
# on it, and remove the intermediate .wav afterwards.
# Arguments: $1 - path of the video file.
# Outputs:   progress messages on stdout. "<basename>.vtt" and the temporary
#            "<basename>.wav" are created in the current directory (basename
#            strips the directory part and a ".mp4" suffix).
# Returns:   0 on success, 1 if ffmpeg or the transcription failed,
#            2 if the .vtt already exists.
function transcribe() {
  local src="$1"
  local base target wav
  base="$(basename "$src" .mp4)"
  target="${base}.vtt"
  wav="${base}.wav"

  if [[ -e "$target" ]]; then
    echo "already transcribed: '$src'"
    # A left-over .wav from an earlier run is no longer needed.
    rm -v "$wav" 2>/dev/null
    return 2
  fi

  printf "\n+ + + + + transcribing:\t'%s'\n\n" "$src"
  # An existing .wav is reused so a retry does not decode the video again.
  if [[ ! -e "$wav" ]]; then
    printf "\n- - - - - ffmpeg:\t'%s' -> %s\n\n" "$src" "$wav"
    if ! ffmpeg -nostdin -hide_banner -v warning -stats -i "$src" \
      -ac 1 -ar 16000 -acodec pcm_s16le "$wav"; then
      printf "\n!!! failed to extract audio from '%s' !!!\n\n" "$src"
      # Do not leave a partial .wav behind for the next run to pick up.
      rm -f -- "$wav"
      return 1
    fi
  fi

  printf "\n- - - - - whisper:\t'%s'\n\n" "$wav"
  if ! _stt "$wav" "$base"; then
    printf "\n!!! failed to transcribe '%s' !!!\n\n" "$wav"
    # Explicit status: a bare `return` would report the printf's success.
    return 1
  fi

  printf "\n- - - - - done:\t'%s'\n\n" "$src"
  rm -v "$wav"
}
# Find every .mp4 whose path matches the given text and transcribe each one
# by re-invoking this script with the "transcribe_one" command.
# Globals:   _threads (read) - thread count passed to fd.
# Arguments: $1 - regular-expression fragment the file path must contain
#                 (empty matches every .mp4).
# Returns:   the exit status of fd.
function run_transcribe() {
  # Full pattern: anything, the fragment, anything, then a ".mp4" suffix.
  local pattern=".*${1}.*"'\.mp4$'

  echo "transcribing videos...."
  echo "searching for: '$pattern'"
  fd -i --threads="$_threads" --type file "$pattern" --exec "$0" transcribe_one {}
}
# Count keyword occurrences in every .vtt file found by fd.
# Normal mode: writes the header and one row per transcript that has at least
# one match to the CSV (and stdout), then imports the CSV via run_sql.
# only_none mode: prints the rows of transcripts in which no keyword matched.
# Globals:   only_none (read), _csv (read), _merged (set by merge_sql_tables).
# Arguments: keywords to count, merged with the database's existing columns.
# Returns:   the status of run_sql in normal mode, 0 in only_none mode.
function run_count() {
  local arglen vtt_path row_name counts status row

  if [[ "$only_none" == true ]]; then
    echo "(finding files with no matches)"
  fi

  # NOTE(review): the CSV is rotated and re-created in only_none mode too,
  # although that mode never appends rows to it - confirm this is intended.
  mv -vf "${_csv}" "${_csv}.bak" 2>/dev/null
  merge_sql_tables "$@"
  arglen=${#_merged[@]}
  header "${_merged[@]}" | tee "${_csv}"

  fd --type=file '\.vtt$' | while IFS= read -r vtt_path; do
    # NOTE(review): the suffix stripped here is ".mp4", so ".vtt" stays part
    # of the name stored in the filename column - confirm before changing,
    # existing database rows are keyed on it.
    row_name="$(basename "$vtt_path" .mp4)"
    # Commas in the name would shift the CSV columns.
    row_name="${row_name//,/_}"

    # _counts exits non-zero when no keyword matched this transcript.
    counts="$(_u <"$vtt_path" | _counts "${_merged[@]}")"
    status=$?

    if [[ "$only_none" == true ]]; then
      if ((status != 0)); then
        printf '%s\n' "${row_name}${counts}"
      fi
    elif ((status == 0)); then
      if row="$(printf '%s\n' "${row_name}${counts}" | clean "$arglen")"; then
        printf '%s\n' "$row" | tee -a "${_csv}"
      fi
    fi
  done

  if [[ "$only_none" != true ]]; then
    run_sql
  fi
}
# List the names of the FLOAT columns found in the database schema, one per
# line (these are the keyword columns of earlier runs).
# Globals:   _sql (read) - path of the SQLite database.
# Outputs:   column names on stdout; nothing when there are none.
# Returns:   0 on success, 1 if sqlite3 is not installed or fails.
function get_sql_tables() {
  local schema column

  command -v sqlite3 >/dev/null 2>&1 || return 1
  # Captured on its own so a sqlite3 failure is actually detected; at the
  # head of a pipeline its exit status would be masked by the last stage.
  schema="$(sqlite3 "${_sql}" '.schema')" || return 1

  # Keep the text before "FLOAT" on each matching line, then drop tabs and
  # double quotes; `read` trims the remaining surrounding blanks.
  printf '%s\n' "$schema" |
    _grep FLOAT |
    awk -F 'FLOAT' '{print $1}' |
    tr -d '\t"' |
    while read -r column; do
      # Skip blanks so an empty schema does not yield an empty column name.
      if [[ -n "$column" ]]; then
        printf '%s\n' "$column"
      fi
    done
  return 0
}
# Build the keyword list: the columns already present in the database plus
# the keywords given as arguments, sorted and without duplicates.
# Globals:   _merged (written) - receives the resulting list.
# Arguments: additional keywords.
# Outputs:   the merged list on stdout, one keyword per line.
# Returns:   0 always. When get_sql_tables yields nothing (for example when
#            sqlite3 is unavailable) the argument keywords are still used.
function merge_sql_tables() {
  local -a names=()
  local name

  # sort -u alone de-duplicates. (`uniq -u` must not be used here: it drops
  # every keyword that occurs twice in a row instead of keeping one copy.)
  while IFS= read -r name; do
    if [[ -n "$name" ]]; then
      names+=("$name")
    fi
  done < <(
    {
      get_sql_tables
      if (($# > 0)); then
        printf '%s\n' "$@"
      fi
    } | sort -u
  )

  # Arrays cannot be exported, so the list is shared as a plain global.
  _merged=("${names[@]}")
  for name in "${_merged[@]}"; do
    printf '%s\n' "$name"
  done
  return 0
}
# Import the CSV into the SQLite database with csvsql, then run
# VACUUM and ANALYZE on the database.
# Globals:   _csv (read), _sql (read).
# Outputs:   a message on stderr when a required tool is missing.
# Returns:   1 if sqlite3 or csvsql is not installed, otherwise the exit
#            status of the final sqlite3 call.
function run_sql() {
  local tool
  for tool in sqlite3 csvsql; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      printf 'run_sql: required tool not found: %s\n' "$tool" >&2
      return 1
    fi
  done

  csvsql --verbose -d ',' "${_csv}" -S --db "sqlite:///${_sql}" --overwrite \
    --insert --prefix "OR REPLACE" --unique-constraint=filename --create-if-not-exists -v
  sqlite3 -echo "${_sql}" "VACUUM; ANALYZE;"
}
# set -x

# Command dispatch: the first argument selects the command, the remaining
# arguments are handed to it.
#   transcribe [pattern]   transcribe every matching .mp4
#   transcribe_one <file>  transcribe a single file (used by fd --exec)
#   count [keyword...]     count keywords and update the CSV / database
#   count_nones [keyword...]  report transcripts without any match
#   test_replacer          print a sample run of the replacement rules
_cmd="${1:-}"
if (($# > 0)); then
  shift 1
fi

case "$_cmd" in
  transcribe)
    run_transcribe "$1" || exit 1
    exit 0
    ;;
  transcribe_one)
    transcribe "$1" || exit 1
    exit 0
    ;;
  count)
    run_count "$@" || exit 1
    exit 0
    ;;
  count_nones)
    only_none=true
    export only_none
    run_count "$@" || exit 1
    exit 0
    ;;
  test_replacer)
    _test_u
    exit 0
    ;;
esac

# Unknown or missing command: report on stderr and fail so that callers and
# scripts can detect the mistake.
echo -e "unknown command. \n commands:\tcount, count_nones, transcribe, test_replacer" >&2
exit 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment