# ZSH writers utilities
repetitivephrases() {
local recurse=0 minw=3 maxw=20 limit=500 incl_csv="" excl_csv="" allfiles=0 mincount=2 single_file="" internal_check=0
local opt OPTIND=1
while getopts "rm:M:n:E:X:Ac:f:h" opt; do
case "$opt" in
r) recurse=1 ;;
m) minw=$OPTARG ;;
M) maxw=$OPTARG ;;
n) limit=$OPTARG ;;
E) incl_csv=$OPTARG ;;
X) excl_csv=$OPTARG ;;
A) allfiles=1 ;;
c) mincount=$OPTARG ;;
f) single_file=$OPTARG ;;
h)
print -- "Usage: repetitivephrases [OPTIONS] [FILE]"
print -- ""
print -- "Find repeated n-grams (phrases) across text/code files"
print -- ""
print -- "OPTIONS:"
print -- " -r Recurse into subdirectories"
print -- " -m MIN Minimum phrase length in words (default: 3)"
print -- " -M MAX Maximum phrase length in words (default: 20)"
print -- " -n LIMIT Limit output to top N results (default: 500)"
print -- " -E ext,... Include only these file extensions (comma-separated)"
print -- " -X ext,... Exclude these file extensions (comma-separated)"
print -- " -A Analyze all files (ignore extension filters)"
print -- " -c MINCOUNT Minimum occurrence count to report (default: 2)"
print -- " -f FILE Check single FILE for repetitions against all other files"
print -- " -h Show this help message"
print -- ""
print -- "USAGE PATTERNS:"
print -- " repetitivephrases # Check all files in current directory"
print -- " repetitivephrases FILE # Check for repetitions within FILE only"
print -- " repetitivephrases -f FILE # Check FILE against all other files"
print -- ""
print -- "Default extensions: txt,md,markdown,rst,tex,html,htm,xml,json,yaml,yml,toml,ini,conf,env,csv,tsv,py,js,ts,jsx,tsx,css,scss,less,java,c,h,hh,cpp,hpp,cc,go,rs,rb,php,pl,lua,sh,bash,zsh,fish,sql,r,mdx"
print -- ""
print -- "Examples:"
print -- " repetitivephrases # Check current directory files"
print -- " repetitivephrases myfile.txt # Check repetitions within myfile.txt"
print -- " repetitivephrases -r -n 100 # Recurse and show top 100"
print -- " repetitivephrases -E txt,md # Only text and markdown files"
print -- " repetitivephrases -f myfile.txt # Check myfile.txt against others"
return 0;;
\?) print -u2 "repetitivephrases: invalid option"; return 1 ;;
esac
done
shift $((OPTIND-1))
# A positional FILE argument (with or without other options) checks that file for internal repetitions
if [[ $# -eq 1 ]]; then
if [[ -f "$1" ]]; then
single_file="$1"
# Special mode: check only within this single file
internal_check=1
else
print -u2 "Error: File '$1' not found"
return 1
fi
elif [[ $OPTIND -eq 1 && $# -eq 0 ]]; then
# No arguments at all, show help
print -- "Usage: repetitivephrases [OPTIONS] [FILE]"
print -- "Find repeated n-grams (phrases) across text/code files"
print -- ""
print -- "Run 'repetitivephrases -h' for detailed help"
return 0
fi
default_exts="txt,md,markdown,rst,tex,html,htm,xml,json,yaml,yml,toml,ini,conf,env,csv,tsv,py,js,ts,jsx,tsx,css,scss,less,java,c,h,hh,cpp,hpp,cc,go,rs,rb,php,pl,lua,sh,bash,zsh,fish,sql,r,mdx"
[[ -z $incl_csv ]] && incl_csv=$default_exts
incl_set=" $(printf '%s' "$incl_csv" | sed 's/,/ /g') "
excl_set=" $(printf '%s' "$excl_csv" | sed 's/,/ /g') "
TAB="$(printf '\t')"
tmp_list=$(mktemp) || return 1
tmp_pairs=$(mktemp) || { rm -f "$tmp_list"; return 1; }
# Build file list
if [[ -n $single_file ]]; then
# Check if single file exists
if [[ ! -f $single_file ]]; then
print -u2 "Error: File '$single_file' not found"
rm -f "$tmp_list" "$tmp_pairs"
return 1
fi
if [[ $internal_check -eq 1 ]]; then
# Internal check mode: only analyze the single file
realpath "$single_file" > "$tmp_list"
else
# Add the single file first
realpath "$single_file" > "$tmp_list"
# Then add all other files in the same directory (or recursively)
if [[ $recurse -eq 1 ]]; then
find . -type f -print | while IFS= read -r f; do
[[ "$(realpath "$f")" != "$(realpath "$single_file")" ]] && printf '%s\n' "$f"
done >> "$tmp_list"
else
find . -type d ! -name . -prune -o -type f -print | sed -n 's|^\./[^/]*$|&|p' | while IFS= read -r f; do
[[ "$(realpath "$f")" != "$(realpath "$single_file")" ]] && printf '%s\n' "$f"
done >> "$tmp_list"
fi
fi
elif [[ $recurse -eq 1 ]]; then
find . -type f -print > "$tmp_list"
else
# top-level only
find . -type d ! -name . -prune -o -type f -print | sed -n 's|^\./[^/]*$|&|p' > "$tmp_list"
fi
: > "$tmp_pairs"
while IFS= read -r fpath; do
[[ -f $fpath ]] || continue
base=${fpath##*/}; ext=""
case "$base" in *.*) ext=${base##*.} ;; esac
ext=$(printf '%s' "$ext" | tr '[:upper:]' '[:lower:]')
# exclude by extension
if [[ -n $excl_csv && -n $ext ]] && printf '%s' "$excl_set" | grep -q " $ext "; then
continue
fi
# include filter unless -A
if [[ $allfiles -ne 1 ]]; then
if [[ -z $ext ]] || ! printf '%s' "$incl_set" | grep -q " $ext "; then
continue
fi
fi
# Tokenize to n-grams and emit "phrase\tfilename"
command awk -v MIN="$minw" -v MAX="$maxw" -v FN="$fpath" '
function push(tok){ t[++tlen]=tok }
function emit( n,i,j,ph){
for(n=MIN;n<=MAX;n++){
if(tlen<n) continue
for(i=1;i<=tlen-n+1;i++){
ph=t[i]
for(j=i+1;j<i+n;j++) ph=ph " " t[j]
printf "%s\t%s\n", ph, FN
}
}
}
{
tlen=0; gsub(/\r$/,"")
gsub(/[^[:alnum:][:space:]\047-]/, " ")
$0=tolower($0)
nf=split($0,words)
for(w=1;w<=nf;w++){
if(length(words[w])>0) push(words[w])
}
emit()
}' "$fpath" >> "$tmp_pairs"
done < "$tmp_list"
# Nothing produced?
if [[ ! -s $tmp_pairs ]]; then
printf 'COUNT%sFILES%sFILE_LIST%sPHRASE\n' "$TAB" "$TAB" "$TAB"
rm -f "$tmp_list" "$tmp_pairs"
return 0
fi
# Aggregate and filter out subphrases
sort -t "$TAB" -k1,1 "$tmp_pairs" | \
command awk -F "$TAB" -v MINC="$mincount" -v TAB="$TAB" -v SINGLE="${single_file:+$(realpath "$single_file")}" -v INTERNAL="$internal_check" '
{
ph=$1; fn=$2
count[ph]++
key=ph SUBSEP fn
if (!(key in seen)) {
seen[key]=1
files[ph]++
# Track if phrase appears in single file
if (SINGLE != "" && fn == SINGLE) {
in_single[ph] = 1
}
# Build comma-separated list of unique files
if (file_list[ph] == "") {
file_list[ph] = fn
} else {
file_list[ph] = file_list[ph] "," fn
}
}
}
END {
# Build array of phrases that meet minimum count
n = 0
for (ph in count) {
if (INTERNAL == "1") {
# Internal check mode: show phrases repeated within the single file
if (count[ph] >= MINC) {
phrases[++n] = ph
phrase_count[ph] = count[ph]
phrase_files[ph] = 1 # Always 1 file in internal mode
phrase_file_list[ph] = file_list[ph]
}
} else if (SINGLE != "") {
# Single file vs others mode: only include phrases that appear in the single file
# and at least one other file
if (in_single[ph] && files[ph] >= 2 && count[ph] >= MINC) {
phrases[++n] = ph
phrase_count[ph] = count[ph]
phrase_files[ph] = files[ph]
phrase_file_list[ph] = file_list[ph]
}
} else {
# Normal mode: just check minimum count
if (count[ph] >= MINC) {
phrases[++n] = ph
phrase_count[ph] = count[ph]
phrase_files[ph] = files[ph]
phrase_file_list[ph] = file_list[ph]
}
}
}
# Mark phrases that are substrings of longer phrases with same/higher count
for (i = 1; i <= n; i++) {
for (j = 1; j <= n; j++) {
if (i != j && !is_substring[phrases[i]]) {
# Check if phrases[i] is a substring of phrases[j]
# and phrases[j] has same or higher count
if (index(phrases[j], phrases[i]) > 0 &&
phrase_count[phrases[j]] >= phrase_count[phrases[i]]) {
is_substring[phrases[i]] = 1
}
}
}
}
# Output non-substring phrases
for (ph in phrase_count) {
if (!is_substring[ph]) {
printf "%d%s%d%s%s%s%s\n", phrase_count[ph], TAB, phrase_files[ph], TAB, phrase_file_list[ph], TAB, ph
}
}
}' | \
sort -t "$TAB" -k1,1nr -k2,2nr -k4,4 | \
awk -F "$TAB" -v LIM="$limit" -v TAB="$TAB" '
BEGIN { print "COUNT" TAB "FILES" TAB "FILE_LIST" TAB "PHRASE" }
{ if (NR<=LIM) print }
'
rm -f "$tmp_list" "$tmp_pairs"
}
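# Example (hypothetical drafts/ directory of Markdown files):
#   cd drafts && repetitivephrases -r -m 4 -c 3 -n 50
# should print a TAB-separated table (COUNT, FILES, FILE_LIST, PHRASE) of
# 4-to-20-word phrases occurring at least 3 times; a shorter phrase contained
# in an equally frequent longer phrase is suppressed.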
paragraph_histogram() {
echo -e "\033[1;36mParagraph Length Histogram (words)\033[0m"
find . -type f \( -name "*.txt" -o -name "*.md" \) -print0 \
| xargs -0 awk 'BEGIN{RS="\n\n+"} NF{print NF}' \
| sort -nr \
| uniq -c \
| sort -nr \
| awk '{bar=""; for(i=0;i<$1/2;i++) bar=bar"█"; printf "\033[1;33m%4d\033[0m | %4d %s\n", $2, $1, bar}'
}
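# Example: run in a manuscript directory. Each row reads
#   <paragraph length in words> | <number of paragraphs> <bar>
# Note: the multi-character RS="\n\n+" paragraph split is a gawk extension;
# the stock awk on older macOS honors only the first character of RS.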
sentence_histogram() {
find . -type f \( -name "*.txt" -o -name "*.md" \) -print0 | \
xargs -0 cat | \
awk '
BEGIN { RS="[.!?]"; FS=" "; total=0; count=0; min=0; max=0 }
NF {
len=NF
total+=len; count++
if(min==0 || len<min) min=len
if(len>max) max=len
bins[len]++
}
END {
if (count == 0) { print "No sentences found."; exit }
mean = total/count
# median via cumulative counts
cumsum=0; median_idx=int((count+1)/2)
for(i=1;i<=max;i++) if(bins[i]) { cumsum+=bins[i]; if(!median_found && cumsum>=median_idx){ median=i; median_found=1 } }
# variance
varsum=0
for(i=1;i<=max;i++) if(bins[i]) varsum+=bins[i]*(i-mean)^2
variance=varsum/count
# histogram in bins of 5; lengths over 50 collapse into a single ">50" bin
printf "\033[1;36mSentence Length Histogram (words)\033[0m\n"
over=0
for(i=5;i<=max+4;i+=5){
bin_count=0
for(j=i-4;j<=i;j++) if(bins[j]) bin_count+=bins[j]
if(i>50){ over+=bin_count; continue }
if(bin_count>0){
pct=(bin_count/count)*100
barlen=int(pct/2)
bar=""
for(k=0;k<barlen;k++) bar=bar"█"
printf "\033[1;33m%-6s\033[0m %5d (%5.2f%%) %s\n", sprintf("%2d-%2d",i-4,i), bin_count, pct, bar
}
}
if(over>0){
pct=(over/count)*100
barlen=int(pct/2)
bar=""
for(k=0;k<barlen;k++) bar=bar"█"
printf "\033[1;33m%-6s\033[0m %5d (%5.2f%%) %s\n", ">50", over, pct, bar
}
# summary
printf "\n\033[1;32mSummary Stats:\033[0m\n"
printf " Sentences: %d\n Mean: %.2f\n Median: %d\n Variance: %.2f\n Min: %d\n Max: %d\n", count, mean, median, variance, min, max
}
'
}
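# Example output row: "26-30    412 ( 8.31%) ████", followed by summary stats.
# Higher variance generally indicates more varied sentence rhythm. As above,
# the regex RS requires gawk or a recent one-true-awk.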
wordfreq() {
local recurse=0 incl_csv="" excl_csv="" allfiles=0
local top=50 minlen=2 maxkb=1024 dropnum=0
local opt OPTIND=1
while getopts "rE:X:An:L:s:Nh" opt; do
case "$opt" in
r) recurse=1 ;;
E) incl_csv=$OPTARG ;;
X) excl_csv=$OPTARG ;;
A) allfiles=1 ;;
n) top=$OPTARG ;;
L) minlen=$OPTARG ;;
s) maxkb=$OPTARG ;;
N) dropnum=1 ;;
h)
printf '%s\n' "Usage: wordfreq [-r] [-E ext,...] [-X ext,...] [-A] [-n N] [-L MINLEN] [-s KB] [-N]"
return 0 ;;
\?) printf '%s\n' "wordfreq: invalid option" >&2; return 1 ;;
esac
done
shift $((OPTIND-1))
default_exts="txt,md,markdown,tex"
[[ -z $incl_csv ]] && incl_csv=$default_exts
incl_set=" $(printf '%s' "$incl_csv" | sed 's/,/ /g') "
excl_set=" $(printf '%s' "$excl_csv" | sed 's/,/ /g') "
maxbytes=$(( maxkb * 1024 ))
tmp_list=$(mktemp) || return 1
# Build file list
if [[ $recurse -eq 1 ]]; then
find . -type f -print > "$tmp_list"
else
# top-level only
find . -type d ! -name . -prune -o -type f -print | sed -n 's|^\./[^/]*$|&|p' > "$tmp_list"
fi
# Stream files, count in awk
LC_ALL_BACKUP="$LC_ALL"; export LC_ALL=C
while IFS= read -r f; do
[[ -f $f ]] || continue
base=${f##*/}; ext=""
case "$base" in *.*) ext=${base##*.} ;; esac
ext=$(printf '%s' "$ext" | tr '[:upper:]' '[:lower:]')
# extension filters
if [[ -n $excl_csv && -n $ext ]] && printf '%s' "$excl_set" | grep -q " $ext "; then continue; fi
if [[ $allfiles -ne 1 ]]; then
if [[ -z $ext ]] || ! printf '%s' "$incl_set" | grep -q " $ext "; then continue; fi
fi
# size filter
sz=$(wc -c < "$f" 2>/dev/null | awk '{print $1+0}')
[[ -n $sz && $sz -le $maxbytes ]] || continue
# texty heuristic: if file(1) exists, skip non-text; else rely on tokenization to strip noise
if command -v file >/dev/null 2>&1; then
if ! file "$f" | grep -qiE 'text|utf-8|ascii|unicode'; then
continue
fi
fi
# Feed file into awk counter
awk -v MINLEN="$minlen" -v DROPNUM="$dropnum" '
function emit(tok) {
if (length(tok) < MINLEN) return
if (DROPNUM && tok ~ /^[0-9]+([[:punct:]]*[0-9]+)*$/) return
count[tok]++
}
{
line=tolower($0)
gsub(/[^[:alnum:]'\'' ]+/, " ", line)
gsub(/[[:space:]]+/, " ", line)
sub(/^ /, "", line); sub(/ $/, "", line)
n=split(line,a," ")
for (i=1;i<=n;i++) if (a[i]!="") emit(a[i])
}
END {
for (w in count) printf "%s\t%d\n", w, count[w]
}
' "$f"
done < "$tmp_list" | \
awk -F '\t' '{
w=$1; c=$2+0; total[w]+=c
}
END { for (w in total) printf "%d\t%s\n", total[w], w }' | \
sort -t $'\t' -k1,1nr | head -n "$top"
export LC_ALL="$LC_ALL_BACKUP"
rm -f "$tmp_list" "$tmp_counts"
}
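# Example: top 25 words of at least 5 letters from Markdown files, numbers dropped:
#   wordfreq -E md -n 25 -L 5 -N
# Output is "<count><TAB><word>", most frequent first; files larger than the
# -s KB cap (default 1024) are skipped.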
avg_words_per_sentence() {
find . -type f \( -name "*.txt" -o -name "*.md" \) -print0 | \
xargs -0 cat | \
awk '
BEGIN {
RS="\n\n+"; FS="[.!?]"; para=0
printf "\033[1;36m\nAverage Words per Sentence by Paragraph\033[0m\n\n"
}
{
para++
total_s=0
total_w=0
for(i=1;i<=NF;i++){
n=split($i, words, " ")
# skip empty fields (e.g. the empty string after a trailing period)
if(n>0){ total_s++; total_w+=n }
}
if(total_s>0){
avg=total_w/total_s
barlen=int(avg/2) # scale bar length for visual
bar=""
for(j=0;j<barlen;j++) bar=bar"█"
color="\033[0;32m" # green default
if(avg>40) color="\033[0;31m" # red if very long
printf "Paragraph %3d: %5.2f words/sentence %s%s\033[0m\n", para, avg, color, bar
}
}
'
}
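# Example output line (the bar turns red when a paragraph averages >40 words/sentence):
#   Paragraph  12: 43.50 words/sentence █████████████████████
# Uses the same gawk-only RS="\n\n+" paragraph splitting noted above.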
wordbar() {
local recurse=0 incl_csv="" excl_csv="" allfiles=0 topn=20 width=50 show_total=0
local opt OPTIND=1
while getopts "rE:X:An:w:th" opt; do
case "$opt" in
r) recurse=1 ;; E) incl_csv=$OPTARG ;; X) excl_csv=$OPTARG ;; A) allfiles=1 ;;
n) topn=$OPTARG ;; w) width=$OPTARG ;; t) show_total=1 ;;
h) printf -- '%s\n' 'Usage: wordbar [-r] [-E ext,...] [-X ext,...] [-A] [-n N] [-w W] [-t]'; return 0 ;;
\?) printf -- '%s\n' "wordbar: invalid option" >&2; return 1 ;;
esac
done
shift $((OPTIND-1))
default_exts="txt,md,markdown,rst,tex"
[[ -z $incl_csv ]] && incl_csv=$default_exts
incl_set=" $(printf '%s' "$incl_csv" | sed 's/,/ /g') "
excl_set=" $(printf '%s' "$excl_csv" | sed 's/,/ /g') "
TAB="$(printf '\t')"
tmp_list=$(mktemp) || return 1
tmp_counts=$(mktemp) || { rm -f "$tmp_list"; return 1; }
if [[ $recurse -eq 1 ]]; then
find . -type f -print > "$tmp_list"
else
find . -type d ! -name . -prune -o -type f -print | sed -n 's|^\./[^/]*$|&|p' > "$tmp_list"
fi
LC_ALL_BACKUP="$LC_ALL"; export LC_ALL=C
total=0
while IFS= read -r fpath; do
[[ -f $fpath ]] || continue
base=${fpath##*/}; ext=""
case "$base" in *.*) ext=${base##*.} ;; esac
ext=$(printf '%s' "$ext" | tr '[:upper:]' '[:lower:]')
if [[ -n $excl_csv && -n $ext ]] && printf '%s' "$excl_set" | grep -q " $ext "; then continue; fi
if [[ $allfiles -ne 1 ]]; then
if [[ -z $ext ]] || ! printf '%s' "$incl_set" | grep -q " $ext "; then continue; fi
fi
count=$(wc -w < "$fpath" 2>/dev/null | awk '{print $1+0}')
[[ -n $count ]] || continue
printf '%s\t%s\n' "$count" "$fpath" >> "$tmp_counts"
total=$(( total + count ))
done < "$tmp_list"
if [[ ! -s $tmp_counts ]]; then
printf 'No files matched.\n' >&2
export LC_ALL="$LC_ALL_BACKUP"
rm -f "$tmp_list" "$tmp_counts"
return 1
fi
printf '%s\n' "WORDS BAR FILE"
printf '%s\n' "----------------------------------------------"
sort -t "$TAB" -k1,1nr "$tmp_counts" | head -n "$topn" \
| awk -F '\t' -v W="$width" '
NR==1 { max=$1+0 }
{
words=$1+0; file=$2
len = (max>0)? int((words/max)*W) : 0
bar = ""; for (i=0;i<len;i++) bar = bar "█"
printf "%8d %-*s %s\n", words, W, bar, file
}'
if [[ $show_total -eq 1 ]]; then
printf '%s\n' "----------------------------------"
printf "TOTAL: %d words\n" "$total"
fi
export LC_ALL="$LC_ALL_BACKUP"
rm -f "$tmp_list" "$tmp_counts"
}
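# Example: the ten longest files by word count, 40-column bars, with a total:
#   wordbar -r -n 10 -w 40 -t
# Bars are scaled relative to the largest file in the listing.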
unique_ratio() {
awk '{for(i=1;i<=NF;i++){ w=tolower($i); gsub(/[^a-z]/,"",w); if(w!="") a[w]++}}
END{uniq=0; total=0; for(w in a){uniq++; total+=a[w]}
if(total==0){ print "No words found."; exit }
printf "Unique word ratio: %.2f%%\n", (uniq/total)*100}' "$@"
}
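# Example: unique_ratio chapter*.md (hypothetical files)
# prints e.g. "Unique word ratio: 23.41%". Expect lower ratios on longer
# texts, since function words repeat while new words arrive ever more slowly.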
sentence_starters() {
python3 - "$@" <<'EOF'
import sys, re, glob, os
from collections import Counter
files = sys.argv[1:] or ["*.txt", "*.md"]
starters = Counter()
for pattern in files:
for fname in glob.glob(pattern):
if not os.path.isfile(fname):
continue
with open(fname, encoding="utf-8") as f:
text = f.read()
# Split into sentences
sentences = re.split(r'[.!?]+', text)
for s in sentences:
words = s.strip().split()
if words:
first_word = re.sub(r'[^A-Za-z]', '', words[0]).lower()
if first_word:
starters[first_word] += 1
for word, count in starters.most_common(20):
print(f"{count:5d} {word}")
EOF
}
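# Example: sentence_starters chapter1.txt (hypothetical file)
# prints the 20 most common sentence-opening words, e.g. "  142 the".
# With no arguments it scans *.txt and *.md in the current directory.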
dialogue_ratio() {
python3 - "$@" <<'EOF'
import sys, re, glob, os
files = sys.argv[1:] or ["*.txt", "*.md"]
dialogue_count = 0
sentence_count = 0
for pattern in files:
for fname in glob.glob(pattern):
if not os.path.isfile(fname):
continue
with open(fname, encoding="utf-8") as f:
text = f.read()
sentences = re.split(r'[.!?]+', text)
sentence_count += len([s for s in sentences if s.strip()])
dialogue_count += len(re.findall(r'"(.*?)"', text))
if sentence_count == 0:
print("No sentences found. Make sure you pass valid text files as arguments.")
else:
ratio = (dialogue_count / sentence_count) * 100
print(f"Dialogue sentences: {dialogue_count} of {sentence_count} ({ratio:.2f}%)")
EOF
}
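# Example: dialogue_ratio novel.txt (hypothetical file)
#   Dialogue sentences: 312 of 1480 (21.08%)
# Note: only straight double quotes (") are counted; curly quotes are not matched.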
readability() {
python3 - "$@" <<'PYTHON'
import sys, re
words = 0
sentences = 0
syllables = 0
for filename in sys.argv[1:]:
try:
with open(filename, 'r', encoding='utf-8') as f:
text = f.read()
except Exception as e:
print(f"Error reading {filename}: {e}")
continue
# split sentences
sents = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
sentences += len(sents)
# split words
ws = re.findall(r'\b\w+\b', text)
words += len(ws)
# syllables: count vowel groups in each word
for w in ws:
syllables += len(re.findall(r'[aeiouy]+', w.lower())) or 1
if sentences > 0 and words > 0:
asl = words / sentences
asw = syllables / words
fre = 206.835 - 1.015 * asl - 84.6 * asw
print(f"Flesch Reading Ease: {fre:.2f}")
else:
print("No sentences found. Make sure you pass valid text files as arguments.")
PYTHON
}
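# Example: readability chapter*.txt
# The formula 206.835 - 1.015*(words/sentences) - 84.6*(syllables/words) is
# the Flesch Reading Ease score: roughly 60-70 reads as plain English, and
# lower scores are harder. Syllables are approximated as vowel-group counts.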