ZSH writer's utilities
repetitivephrases() {
  recurse=0; minw=3; maxw=20; limit=500; incl_csv=""; excl_csv=""; allfiles=0; mincount=2; single_file=""; internal_check=0
  OPTIND=1
  while getopts "rm:M:n:E:X:Ac:f:h" opt; do
    case "$opt" in
      r) recurse=1 ;;
      m) minw=$OPTARG ;;
      M) maxw=$OPTARG ;;
      n) limit=$OPTARG ;;
      E) incl_csv=$OPTARG ;;
      X) excl_csv=$OPTARG ;;
      A) allfiles=1 ;;
      c) mincount=$OPTARG ;;
      f) single_file=$OPTARG ;;
      h)
        print -- "Usage: repetitivephrases [OPTIONS] [FILE]"
        print -- ""
        print -- "Find repeated n-grams (phrases) across text/code files"
        print -- ""
        print -- "OPTIONS:"
        print -- "  -r           Recurse into subdirectories"
        print -- "  -m MIN       Minimum phrase length in words (default: 3)"
        print -- "  -M MAX       Maximum phrase length in words (default: 20)"
        print -- "  -n LIMIT     Limit output to top N results (default: 500)"
        print -- "  -E ext,...   Include only these file extensions (comma-separated)"
        print -- "  -X ext,...   Exclude these file extensions (comma-separated)"
        print -- "  -A           Analyze all files (ignore extension filters)"
        print -- "  -c MINCOUNT  Minimum occurrence count to report (default: 2)"
        print -- "  -f FILE      Check single FILE for repetitions against all other files"
        print -- "  -h           Show this help message"
        print -- ""
        print -- "USAGE PATTERNS:"
        print -- "  repetitivephrases            # Check all files in current directory"
        print -- "  repetitivephrases FILE       # Check for repetitions within FILE only"
        print -- "  repetitivephrases -f FILE    # Check FILE against all other files"
        print -- ""
        print -- "Default extensions: txt,md,markdown,rst,tex,html,htm,xml,json,yaml,yml,toml,ini,conf,env,csv,tsv,py,js,ts,jsx,tsx,css,scss,less,java,c,h,hh,cpp,hpp,cc,go,rs,rb,php,pl,lua,sh,bash,zsh,fish,sql,r,mdx"
        print -- ""
        print -- "Examples:"
        print -- "  repetitivephrases                # Check current directory files"
        print -- "  repetitivephrases myfile.txt     # Check repetitions within myfile.txt"
        print -- "  repetitivephrases -r -n 100      # Recurse and show top 100"
        print -- "  repetitivephrases -E txt,md      # Only text and markdown files"
        print -- "  repetitivephrases -f myfile.txt  # Check myfile.txt against others"
        return 0 ;;
      \?) print -u2 "repetitivephrases: invalid option"; return 1 ;;
    esac
  done
  shift $((OPTIND-1))
  # If no options but a filename argument is provided, check that file for internal repetitions
  if [[ $OPTIND -eq 1 && $# -eq 1 ]]; then
    if [[ -f "$1" ]]; then
      single_file="$1"
      # Special mode: check only within this single file
      internal_check=1
    else
      print -u2 "Error: File '$1' not found"
      return 1
    fi
  elif [[ $OPTIND -eq 1 && $# -eq 0 ]]; then
    # No arguments at all, show help
    print -- "Usage: repetitivephrases [OPTIONS] [FILE]"
    print -- "Find repeated n-grams (phrases) across text/code files"
    print -- ""
    print -- "Run 'repetitivephrases -h' for detailed help"
    return 0
  fi
  default_exts="txt,md,markdown,rst,tex,html,htm,xml,json,yaml,yml,toml,ini,conf,env,csv,tsv,py,js,ts,jsx,tsx,css,scss,less,java,c,h,hh,cpp,hpp,cc,go,rs,rb,php,pl,lua,sh,bash,zsh,fish,sql,r,mdx"
  [[ -z $incl_csv ]] && incl_csv=$default_exts
  incl_set=" $(printf '%s' "$incl_csv" | sed 's/,/ /g') "
  excl_set=" $(printf '%s' "$excl_csv" | sed 's/,/ /g') "
  TAB="$(printf '\t')"
  tmp_list=$(mktemp) || return 1
  tmp_pairs=$(mktemp) || { rm -f "$tmp_list"; return 1; }
  # Build file list
  if [[ -n $single_file ]]; then
    # Check if single file exists
    if [[ ! -f $single_file ]]; then
      print -u2 "Error: File '$single_file' not found"
      rm -f "$tmp_list" "$tmp_pairs"
      return 1
    fi
    # Canonicalize once so later path comparisons (including the awk SINGLE match) are reliable
    single_file=$(realpath "$single_file")
    if [[ $internal_check -eq 1 ]]; then
      # Internal check mode: only analyze the single file
      print -r -- "$single_file" > "$tmp_list"
    else
      # Add the single file first
      print -r -- "$single_file" > "$tmp_list"
      # Then add all other files in the same directory (or recursively)
      if [[ $recurse -eq 1 ]]; then
        find . -type f -print | while IFS= read -r f; do
          [[ "$(realpath "$f")" != "$single_file" ]] && print -r -- "$f"
        done >> "$tmp_list"
      else
        find . -type d ! -name . -prune -o -type f -print | sed -n 's|^\./[^/]*$|&|p' | while IFS= read -r f; do
          [[ "$(realpath "$f")" != "$single_file" ]] && print -r -- "$f"
        done >> "$tmp_list"
      fi
    fi
  elif [[ $recurse -eq 1 ]]; then
    find . -type f -print > "$tmp_list"
  else
    # top-level only
    find . -type d ! -name . -prune -o -type f -print | sed -n 's|^\./[^/]*$|&|p' > "$tmp_list"
  fi
: > "$tmp_pairs" | |
while IFS= read -r fpath; do | |
[[ -f $fpath ]] || continue | |
base=${fpath##*/}; ext="" | |
case "$base" in *.*) ext=${base##*.} ;; esac | |
ext=$(printf '%s' "$ext" | tr '[:upper:]' '[:lower:]') | |
# exclude by extension | |
if [[ -n $excl_csv && -n $ext ]] && printf '%s' "$excl_set" | grep -q " $ext "; then | |
continue | |
fi | |
# include filter unless -A | |
if [[ $allfiles -ne 1 ]]; then | |
if [[ -z $ext ]] || ! printf '%s' "$incl_set" | grep -q " $ext "; then | |
continue | |
fi | |
fi | |
# Tokenize to n-grams and emit "phrase\tfilename" | |
command awk -v MIN="$minw" -v MAX="$maxw" -v FN="$fpath" ' | |
function push(tok){ t[++tlen]=tok } | |
function emit( n,i,j,ph){ | |
for(n=MIN;n<=MAX;n++){ | |
if(tlen<n) continue | |
for(i=1;i<=tlen-n+1;i++){ | |
ph=t[i] | |
for(j=i+1;j<i+n;j++) ph=ph " " t[j] | |
printf "%s\t%s\n", ph, FN | |
} | |
} | |
} | |
{ | |
tlen=0; gsub(/\r$/,"") | |
gsub(/[^[:alnum:][:space:]\047-]/, " ") | |
$0=tolower($0) | |
nf=split($0,words) | |
for(w=1;w<=nf;w++){ | |
if(length(words[w])>0) push(words[w]) | |
} | |
emit() | |
}' "$fpath" >> "$tmp_pairs" | |
done < "$tmp_list" | |
  # Nothing produced?
  if [[ ! -s $tmp_pairs ]]; then
    printf 'COUNT%sFILES%sFILE_LIST%sPHRASE\n' "$TAB" "$TAB" "$TAB"
    rm -f "$tmp_list" "$tmp_pairs"
    return 0
  fi
  # Aggregate and filter out subphrases
  sort -t "$TAB" -k1,1 "$tmp_pairs" | \
  command awk -F "$TAB" -v MINC="$mincount" -v TAB="$TAB" -v SINGLE="$single_file" -v INTERNAL="$internal_check" '
    {
      ph=$1; fn=$2
      count[ph]++
      key=ph SUBSEP fn
      if (!(key in seen)) {
        seen[key]=1
        files[ph]++
        # Track if phrase appears in single file
        if (SINGLE != "" && fn == SINGLE) {
          in_single[ph] = 1
        }
        # Build comma-separated list of unique files
        if (file_list[ph] == "") {
          file_list[ph] = fn
        } else {
          file_list[ph] = file_list[ph] "," fn
        }
      }
    }
    END {
      # Build array of phrases that meet minimum count
      n = 0
      for (ph in count) {
        if (INTERNAL == "1") {
          # Internal check mode: show phrases repeated within the single file
          if (count[ph] >= MINC) {
            phrases[++n] = ph
            phrase_count[ph] = count[ph]
            phrase_files[ph] = 1  # Always 1 file in internal mode
            phrase_file_list[ph] = file_list[ph]
          }
        } else if (SINGLE != "") {
          # Single-file-vs-others mode: only include phrases that appear in the
          # single file and in at least one other file
          if (in_single[ph] && files[ph] >= 2 && count[ph] >= MINC) {
            phrases[++n] = ph
            phrase_count[ph] = count[ph]
            phrase_files[ph] = files[ph]
            phrase_file_list[ph] = file_list[ph]
          }
        } else {
          # Normal mode: just check minimum count
          if (count[ph] >= MINC) {
            phrases[++n] = ph
            phrase_count[ph] = count[ph]
            phrase_files[ph] = files[ph]
            phrase_file_list[ph] = file_list[ph]
          }
        }
      }
      # Mark phrases that are substrings of longer phrases with same/higher count
      for (i = 1; i <= n; i++) {
        for (j = 1; j <= n; j++) {
          if (i != j && !is_substring[phrases[i]]) {
            # Check if phrases[i] is a substring of phrases[j]
            # and phrases[j] has same or higher count
            if (index(phrases[j], phrases[i]) > 0 &&
                phrase_count[phrases[j]] >= phrase_count[phrases[i]]) {
              is_substring[phrases[i]] = 1
            }
          }
        }
      }
      # Output non-substring phrases
      for (ph in phrase_count) {
        if (!is_substring[ph]) {
          printf "%d%s%d%s%s%s%s\n", phrase_count[ph], TAB, phrase_files[ph], TAB, phrase_file_list[ph], TAB, ph
        }
      }
    }' | \
  sort -t "$TAB" -k1,1nr -k2,2nr -k4,4 | \
  awk -F "$TAB" -v LIM="$limit" -v TAB="$TAB" '
    BEGIN { print "COUNT" TAB "FILES" TAB "FILE_LIST" TAB "PHRASE" }
    { if (NR<=LIM) print }
  '
  rm -f "$tmp_list" "$tmp_pairs"
}
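# Example usage (a sketch; "draft.md" is a hypothetical file in the current directory):
#   repetitivephrases -r -m 4 -c 3 -n 50   # recurse; 4+ word phrases seen 3+ times, top 50
#   repetitivephrases draft.md             # repetitions within draft.md only
#   repetitivephrases -f draft.md -X json  # draft.md vs. sibling files, skipping .json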
paragraph_histogram() {
  echo -e "\033[1;36mParagraph Length Histogram (words)\033[0m"
  # Note: a regex record separator (RS="\n\n+") needs gawk or mawk;
  # strictly POSIX awk only honors the first character of RS
  find . -type f \( -name "*.txt" -o -name "*.md" \) -print0 \
    | xargs -0 awk 'BEGIN{RS="\n\n+"} NF{print NF}' \
    | sort -nr \
    | uniq -c \
    | sort -nr \
    | awk '{bar=""; for(i=0;i<$1/2;i++) bar=bar"█"; printf "\033[1;33m%4d\033[0m | %4d %s\n", $2, $1, bar}'
}
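# Example usage: run from the directory holding your .txt/.md drafts; each row
# shows a paragraph length in words, how many paragraphs have it, and a bar.
#   paragraph_histogram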
sentence_histogram() {
  # Note: the regex record separator below (RS="[.!?]") needs gawk or mawk
  find . -type f \( -name "*.txt" -o -name "*.md" \) -print0 | \
  xargs -0 cat | \
  awk '
    BEGIN { RS="[.!?]"; FS=" "; total=0; count=0; min=0; max=0 }
    NF {
      len=NF
      total+=len; count++
      if(min==0 || len<min) min=len
      if(len>max) max=len
      bins[len]++
    }
    END {
      if(count==0) exit
      mean = total/count
      # median via cumulative counts
      cumsum=0; median_idx=int((count+1)/2)
      for(i=1;i<=max;i++) if(bins[i]) { cumsum+=bins[i]; if(!median_found && cumsum>=median_idx){ median=i; median_found=1 } }
      # variance
      varsum=0
      for(i=1;i<=max;i++) if(bins[i]) varsum+=bins[i]*(i-mean)^2
      variance=varsum/count
      # histogram in bins of 5; lengths above 50 words are lumped into one ">50" bin
      printf "\033[1;36mSentence Length Histogram (words)\033[0m\n"
      for(i=5;i-4<=max && i<=50;i+=5){
        bin_count=0
        for(j=i-4;j<=i;j++) if(bins[j]) bin_count+=bins[j]
        if(bin_count>0){
          pct=(bin_count/count)*100
          barlen=int(pct/2)
          bar=""
          for(k=0;k<barlen;k++) bar=bar"█"
          printf "\033[1;33m%-6s\033[0m %5d (%5.2f%%) %s\n", sprintf("%2d-%2d",i-4,i), bin_count, pct, bar
        }
      }
      bin_count=0
      for(j=51;j<=max;j++) if(bins[j]) bin_count+=bins[j]
      if(bin_count>0){
        pct=(bin_count/count)*100
        barlen=int(pct/2)
        bar=""
        for(k=0;k<barlen;k++) bar=bar"█"
        printf "\033[1;33m%-6s\033[0m %5d (%5.2f%%) %s\n", ">50", bin_count, pct, bar
      }
      # summary
      printf "\n\033[1;32mSummary Stats:\033[0m\n"
      printf "  Sentences: %d\n  Mean: %.2f\n  Median: %d\n  Variance: %.2f\n  Min: %d\n  Max: %d\n", count, mean, median, variance, min, max
    }
  '
}
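# Example usage (no arguments; scans .txt/.md in the current directory):
#   sentence_histogram   # sentences above 50 words land in the ">50" bin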
wordfreq() {
  recurse=0; incl_csv=""; excl_csv=""; allfiles=0
  top=50; minlen=2; maxkb=1024; dropnum=0
  OPTIND=1
  while getopts "rE:X:An:L:s:Nh" opt; do
    case "$opt" in
      r) recurse=1 ;;
      E) incl_csv=$OPTARG ;;
      X) excl_csv=$OPTARG ;;
      A) allfiles=1 ;;
      n) top=$OPTARG ;;
      L) minlen=$OPTARG ;;
      s) maxkb=$OPTARG ;;
      N) dropnum=1 ;;
      h)
        printf '%s\n' "Usage: wordfreq [-r] [-E ext,...] [-X ext,...] [-A] [-n N] [-L MINLEN] [-s KB] [-N]"
        return 0 ;;
      \?) printf '%s\n' "wordfreq: invalid option" >&2; return 1 ;;
    esac
  done
  shift $((OPTIND-1))
  default_exts="txt,md,markdown,tex"
  [[ -z $incl_csv ]] && incl_csv=$default_exts
  incl_set=" $(printf '%s' "$incl_csv" | sed 's/,/ /g') "
  excl_set=" $(printf '%s' "$excl_csv" | sed 's/,/ /g') "
  maxbytes=$(( maxkb * 1024 ))
  tmp_list=$(mktemp) || return 1
  # Build file list
  if [[ $recurse -eq 1 ]]; then
    find . -type f -print > "$tmp_list"
  else
    # top-level only
    find . -type d ! -name . -prune -o -type f -print | sed -n 's|^\./[^/]*$|&|p' > "$tmp_list"
  fi
  # Stream files, count in awk
  LC_ALL_BACKUP="$LC_ALL"; export LC_ALL=C
  while IFS= read -r f; do
    [[ -f $f ]] || continue
    base=${f##*/}; ext=""
    case "$base" in *.*) ext=${base##*.} ;; esac
    ext=$(printf '%s' "$ext" | tr '[:upper:]' '[:lower:]')
    # extension filters
    if [[ -n $excl_csv && -n $ext ]] && printf '%s' "$excl_set" | grep -q " $ext "; then continue; fi
    if [[ $allfiles -ne 1 ]]; then
      if [[ -z $ext ]] || ! printf '%s' "$incl_set" | grep -q " $ext "; then continue; fi
    fi
    # size filter
    sz=$(wc -c < "$f" 2>/dev/null | awk '{print $1+0}')
    [[ -n $sz && $sz -le $maxbytes ]] || continue
    # texty heuristic: if file(1) exists, skip non-text; else rely on tokenization to strip noise
    if command -v file >/dev/null 2>&1; then
      if ! file "$f" | grep -qiE 'text|utf-8|ascii|unicode'; then
        continue
      fi
    fi
    # Feed file into awk counter
    awk -v MINLEN="$minlen" -v DROPNUM="$dropnum" '
      function emit(tok) {
        if (length(tok) < MINLEN) return
        if (DROPNUM && tok ~ /^[0-9]+([[:punct:]]*[0-9]+)*$/) return
        count[tok]++
      }
      {
        line=tolower($0)
        gsub(/[^[:alnum:]'\'' ]+/, " ", line)
        gsub(/[[:space:]]+/, " ", line)
        sub(/^ /, "", line); sub(/ $/, "", line)
        n=split(line,a," ")
        for (i=1;i<=n;i++) if (a[i]!="") emit(a[i])
      }
      END {
        for (w in count) printf "%s\t%d\n", w, count[w]
      }
    ' "$f"
  done < "$tmp_list" | \
  awk -F '\t' '{
    w=$1; c=$2+0; total[w]+=c
  }
  END { for (w in total) printf "%d\t%s\n", total[w], w }' | \
  sort -t $'\t' -k1,1nr | head -n "$top"
  export LC_ALL="$LC_ALL_BACKUP"
  rm -f "$tmp_list"
}
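# Example usage (a sketch; options are combinable):
#   wordfreq -n 25             # top 25 words across txt/md/markdown/tex files
#   wordfreq -r -E md -L 4 -N  # recurse, .md only, 4+ letter words, drop numbers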
avg_words_per_sentence() {
  find . -type f \( -name "*.txt" -o -name "*.md" \) -print0 | \
  xargs -0 cat | \
  awk '
    BEGIN {
      RS="\n\n+"; FS="[.!?]"; para=0
      printf "\033[1;36m\nAverage Words per Sentence by Paragraph\033[0m\n\n"
    }
    {
      para++
      total_s=0
      total_w=0
      for(i=1;i<=NF;i++){
        n=split($i, words, " ")
        if(n==0) continue   # skip the empty fragment after the final punctuation mark
        total_s++
        total_w+=n
      }
      if(total_s>0){
        avg=total_w/total_s
        barlen=int(avg/2)   # scale bar length for visual
        bar=""
        for(j=0;j<barlen;j++) bar=bar"█"
        color="\033[0;32m"             # green default
        if(avg>40) color="\033[0;31m"  # red if very long
        printf "Paragraph %3d: %5.2f words/sentence %s%s\033[0m\n", para, avg, color, bar
      }
    }
  '
}
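# Example usage: one row per paragraph; averages over 40 words/sentence turn red.
#   avg_words_per_sentence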
wordbar() {
  recurse=0; incl_csv=""; excl_csv=""; allfiles=0; topn=20; width=50; show_total=0
  OPTIND=1
  while getopts "rE:X:An:w:th" opt; do
    case "$opt" in
      r) recurse=1 ;; E) incl_csv=$OPTARG ;; X) excl_csv=$OPTARG ;; A) allfiles=1 ;;
      n) topn=$OPTARG ;; w) width=$OPTARG ;; t) show_total=1 ;;
      h) printf -- '%s\n' 'Usage: wordbar [-r] [-E ext,...] [-X ext,...] [-A] [-n N] [-w W] [-t]'; return 0 ;;
      \?) printf -- '%s\n' "wordbar: invalid option" >&2; return 1 ;;
    esac
  done
  shift $((OPTIND-1))
  default_exts="txt,md,markdown,rst,tex"
  [[ -z $incl_csv ]] && incl_csv=$default_exts
  incl_set=" $(printf '%s' "$incl_csv" | sed 's/,/ /g') "
  excl_set=" $(printf '%s' "$excl_csv" | sed 's/,/ /g') "
  TAB="$(printf '\t')"
  tmp_list=$(mktemp) || return 1
  tmp_counts=$(mktemp) || { rm -f "$tmp_list"; return 1; }
  if [[ $recurse -eq 1 ]]; then
    find . -type f -print > "$tmp_list"
  else
    find . -type d ! -name . -prune -o -type f -print | sed -n 's|^\./[^/]*$|&|p' > "$tmp_list"
  fi
  LC_ALL_BACKUP="$LC_ALL"; export LC_ALL=C
  total=0
  while IFS= read -r fpath; do
    [[ -f $fpath ]] || continue
    base=${fpath##*/}; ext=""
    case "$base" in *.*) ext=${base##*.} ;; esac
    ext=$(printf '%s' "$ext" | tr '[:upper:]' '[:lower:]')
    if [[ -n $excl_csv && -n $ext ]] && printf '%s' "$excl_set" | grep -q " $ext "; then continue; fi
    if [[ $allfiles -ne 1 ]]; then
      if [[ -z $ext ]] || ! printf '%s' "$incl_set" | grep -q " $ext "; then continue; fi
    fi
    count=$(wc -w < "$fpath" 2>/dev/null | awk '{print $1+0}')
    [[ -n $count ]] || continue
    printf '%s\t%s\n' "$count" "$fpath" >> "$tmp_counts"
    total=$(( total + count ))
  done < "$tmp_list"
  if [[ ! -s $tmp_counts ]]; then
    printf 'No files matched.\n' >&2
    export LC_ALL="$LC_ALL_BACKUP"
    rm -f "$tmp_list" "$tmp_counts"
    return 1
  fi
  printf '%s\n' "WORDS BAR FILE"
  printf '%s\n' "----------------------------------------------"
  sort -t "$TAB" -k1,1nr "$tmp_counts" | head -n "$topn" \
    | awk -F '\t' -v W="$width" '
      NR==1 { max=$1+0 }
      {
        words=$1+0; file=$2
        len = (max>0) ? int((words/max)*W) : 0
        bar = ""; for (i=0;i<len;i++) bar = bar "█"
        printf "%8d %-*s %s\n", words, W, bar, file
      }'
  if [[ $show_total -eq 1 ]]; then
    printf '%s\n' "----------------------------------------------"
    printf "TOTAL: %d words\n" "$total"
  fi
  export LC_ALL="$LC_ALL_BACKUP"
  rm -f "$tmp_list" "$tmp_counts"
}
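# Example usage (a sketch):
#   wordbar -r -n 10 -w 40 -t   # recurse, top 10 files, 40-char bars, grand total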
unique_ratio() {
  awk '{for(i=1;i<=NF;i++){ w=tolower($i); gsub(/[^a-z]/,"",w); if(w!="") a[w]++}}
    END{uniq=0; total=0; for(w in a){uniq++; total+=a[w]}
        if(total>0) printf "Unique word ratio: %.2f%%\n", (uniq/total)*100
        else print "No words found."}' "$@"
}
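# Example usage: pass files explicitly (reads stdin when given none):
#   unique_ratio chapter1.txt chapter2.txt   # hypothetical filenames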
sentence_starters() {
  python3 - "$@" <<'EOF'
import sys, re, glob, os
from collections import Counter

files = sys.argv[1:] or ["*.txt", "*.md"]
starters = Counter()
for pattern in files:
    for fname in glob.glob(pattern):
        if not os.path.isfile(fname):
            continue
        with open(fname, encoding="utf-8") as f:
            text = f.read()
        # Split into sentences
        sentences = re.split(r'[.!?]+', text)
        for s in sentences:
            words = s.strip().split()
            if words:
                first_word = re.sub(r'[^A-Za-z]', '', words[0]).lower()
                if first_word:
                    starters[first_word] += 1
for word, count in starters.most_common(20):
    print(f"{count:5d} {word}")
EOF
}
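# Example usage: arguments are filenames or quoted glob patterns; with none it
# scans *.txt and *.md in the current directory.
#   sentence_starters 'chapters/*.md'   # hypothetical path, quoted so Python expands it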
dialogue_ratio() {
  python3 - "$@" <<'EOF'
import sys, re, glob, os

files = sys.argv[1:] or ["*.txt", "*.md"]
dialogue_count = 0
sentence_count = 0
for pattern in files:
    for fname in glob.glob(pattern):
        if not os.path.isfile(fname):
            continue
        with open(fname, encoding="utf-8") as f:
            text = f.read()
        sentences = re.split(r'[.!?]+', text)
        sentence_count += len([s for s in sentences if s.strip()])
        dialogue_count += len(re.findall(r'"(.*?)"', text))
if sentence_count == 0:
    print("No sentences found. Make sure you pass valid text files as arguments.")
else:
    ratio = (dialogue_count / sentence_count) * 100
    print(f"Dialogue sentences: {dialogue_count} of {sentence_count} ({ratio:.2f}%)")
EOF
}
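# Example usage (quoted spans counted against the sentence total):
#   dialogue_ratio novel.txt   # hypothetical file; defaults to *.txt and *.md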
readability() {
  python3 - "$@" <<'PYTHON'
import sys, re

words = 0
sentences = 0
syllables = 0
for filename in sys.argv[1:]:
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            text = f.read()
    except Exception as e:
        print(f"Error reading {filename}: {e}")
        continue
    # split sentences
    sents = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
    sentences += len(sents)
    # split words
    ws = re.findall(r'\b\w+\b', text)
    words += len(ws)
    # syllables: count vowel groups in each word
    for w in ws:
        syllables += len(re.findall(r'[aeiouy]+', w.lower())) or 1
if sentences > 0 and words > 0:
    asl = words / sentences    # average sentence length
    asw = syllables / words    # average syllables per word
    # Flesch Reading Ease: 206.835 - 1.015*ASL - 84.6*ASW (higher = easier)
    fk = 206.835 - 1.015 * asl - 84.6 * asw
    print(f"Flesch Reading Ease: {fk:.2f}")
else:
    print("No sentences found. Make sure you pass valid text files as arguments.")
PYTHON
}
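# Example usage: higher scores read easier (roughly 60-70 for plain English):
#   readability draft.md notes.txt   # hypothetical filenames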