# ZSH writers utilities
repetitivephrases() {
local recurse=0 minw=3 maxw=20 limit=500 incl_csv="" excl_csv="" allfiles=0 mincount=2 single_file="" internal_check=0
local opt OPTIND=1
while getopts "rm:M:n:E:X:Ac:f:h" opt; do
case "$opt" in
r) recurse=1 ;;
m) minw=$OPTARG ;;
M) maxw=$OPTARG ;;
n) limit=$OPTARG ;;
E) incl_csv=$OPTARG ;;
X) excl_csv=$OPTARG ;;
A) allfiles=1 ;;
c) mincount=$OPTARG ;;
f) single_file=$OPTARG ;;
h)
print -- "Usage: repetitivephrases [OPTIONS] [FILE]"
print -- ""
print -- "Find repeated n-grams (phrases) across text/code files"
print -- ""
print -- "OPTIONS:"
print -- " -r Recurse into subdirectories"
print -- " -m MIN Minimum phrase length in words (default: 3)"
print -- " -M MAX Maximum phrase length in words (default: 20)"
print -- " -n LIMIT Limit output to top N results (default: 500)"
print -- " -E ext,... Include only these file extensions (comma-separated)"
print -- " -X ext,... Exclude these file extensions (comma-separated)"
print -- " -A Analyze all files (ignore extension filters)"
print -- " -c MINCOUNT Minimum occurrence count to report (default: 2)"
print -- " -f FILE Check single FILE for repetitions against all other files"
print -- " -h Show this help message"
print -- ""
print -- "USAGE PATTERNS:"
print -- " repetitivephrases # Check all files in current directory"
print -- " repetitivephrases FILE # Check for repetitions within FILE only"
print -- " repetitivephrases -f FILE # Check FILE against all other files"
print -- ""
print -- "Default extensions: txt,md,markdown,rst,tex,html,htm,xml,json,yaml,yml,toml,ini,conf,env,csv,tsv,py,js,ts,jsx,tsx,css,scss,less,java,c,h,hh,cpp,hpp,cc,go,rs,rb,php,pl,lua,sh,bash,zsh,fish,sql,r,mdx"
print -- ""
print -- "Examples:"
print -- " repetitivephrases # Check current directory files"
print -- " repetitivephrases myfile.txt # Check repetitions within myfile.txt"
print -- " repetitivephrases -r -n 100 # Recurse and show top 100"
print -- " repetitivephrases -E txt,md # Only text and markdown files"
print -- " repetitivephrases -f myfile.txt # Check myfile.txt against others"
return 0;;
\?) print -u2 "repetitivephrases: invalid option"; return 1 ;;
esac
done
shift $((OPTIND-1))
# A positional FILE argument (with or without other options) checks that file for internal repetitions
if [[ $# -eq 1 ]]; then
if [[ -f "$1" ]]; then
single_file="$1"
# Special mode: check only within this single file
internal_check=1
else
print -u2 "Error: File '$1' not found"
return 1
fi
elif [[ $OPTIND -eq 1 && $# -eq 0 ]]; then
# No arguments at all, show help
print -- "Usage: repetitivephrases [OPTIONS] [FILE]"
print -- "Find repeated n-grams (phrases) across text/code files"
print -- ""
print -- "Run 'repetitivephrases -h' for detailed help"
return 0
fi
default_exts="txt,md,markdown,rst,tex,html,htm,xml,json,yaml,yml,toml,ini,conf,env,csv,tsv,py,js,ts,jsx,tsx,css,scss,less,java,c,h,hh,cpp,hpp,cc,go,rs,rb,php,pl,lua,sh,bash,zsh,fish,sql,r,mdx"
[[ -z $incl_csv ]] && incl_csv=$default_exts
incl_set=" $(printf '%s' "$incl_csv" | sed 's/,/ /g') "
excl_set=" $(printf '%s' "$excl_csv" | sed 's/,/ /g') "
TAB="$(printf '\t')"
tmp_list=$(mktemp) || return 1
tmp_pairs=$(mktemp) || { rm -f "$tmp_list"; return 1; }
# Build file list
if [[ -n $single_file ]]; then
# Check if single file exists
if [[ ! -f $single_file ]]; then
print -u2 "Error: File '$single_file' not found"
rm -f "$tmp_list" "$tmp_pairs"
return 1
fi
if [[ $internal_check -eq 1 ]]; then
# Internal check mode: only analyze the single file
realpath "$single_file" > "$tmp_list"
else
# Add the single file first
realpath "$single_file" > "$tmp_list"
# Then add all other files in the same directory (or recursively)
if [[ $recurse -eq 1 ]]; then
find . -type f -print | while IFS= read -r f; do
[[ "$(realpath "$f")" != "$(realpath "$single_file")" ]] && printf '%s\n' "$f"
done >> "$tmp_list"
else
find . -type d ! -name . -prune -o -type f -print | sed -n 's|^\./[^/]*$|&|p' | while IFS= read -r f; do
[[ "$(realpath "$f")" != "$(realpath "$single_file")" ]] && printf '%s\n' "$f"
done >> "$tmp_list"
fi
fi
elif [[ $recurse -eq 1 ]]; then
find . -type f -print > "$tmp_list"
else
# top-level only
find . -type d ! -name . -prune -o -type f -print | sed -n 's|^\./[^/]*$|&|p' > "$tmp_list"
fi
: > "$tmp_pairs"
while IFS= read -r fpath; do
[[ -f $fpath ]] || continue
base=${fpath##*/}; ext=""
case "$base" in *.*) ext=${base##*.} ;; esac
ext=$(printf '%s' "$ext" | tr '[:upper:]' '[:lower:]')
# exclude by extension
if [[ -n $excl_csv && -n $ext ]] && printf '%s' "$excl_set" | grep -q " $ext "; then
continue
fi
# include filter unless -A
if [[ $allfiles -ne 1 ]]; then
if [[ -z $ext ]] || ! printf '%s' "$incl_set" | grep -q " $ext "; then
continue
fi
fi
# Tokenize to n-grams and emit "phrase\tfilename"
command awk -v MIN="$minw" -v MAX="$maxw" -v FN="$fpath" '
function push(tok){ t[++tlen]=tok }
function emit( n,i,j,ph){
for(n=MIN;n<=MAX;n++){
if(tlen<n) continue
for(i=1;i<=tlen-n+1;i++){
ph=t[i]
for(j=i+1;j<i+n;j++) ph=ph " " t[j]
printf "%s\t%s\n", ph, FN
}
}
}
{
tlen=0; gsub(/\r$/,"")
gsub(/[^[:alnum:][:space:]\047-]/, " ")
$0=tolower($0)
nf=split($0,words)
for(w=1;w<=nf;w++){
if(length(words[w])>0) push(words[w])
}
emit()
}' "$fpath" >> "$tmp_pairs"
done < "$tmp_list"
# Nothing produced?
if [[ ! -s $tmp_pairs ]]; then
printf 'COUNT%sFILES%sFILE_LIST%sPHRASE\n' "$TAB" "$TAB" "$TAB"
rm -f "$tmp_list" "$tmp_pairs"
return 0
fi
# Aggregate and filter out subphrases
sort -t "$TAB" -k1,1 "$tmp_pairs" | \
command awk -F "$TAB" -v MINC="$mincount" -v TAB="$TAB" -v SINGLE="${single_file:+$(realpath "$single_file")}" -v INTERNAL="$internal_check" '
{
ph=$1; fn=$2
count[ph]++
key=ph SUBSEP fn
if (!(key in seen)) {
seen[key]=1
files[ph]++
# Track if phrase appears in single file
if (SINGLE != "" && fn == SINGLE) {
in_single[ph] = 1
}
# Build comma-separated list of unique files
if (file_list[ph] == "") {
file_list[ph] = fn
} else {
file_list[ph] = file_list[ph] "," fn
}
}
}
END {
# Build array of phrases that meet minimum count
n = 0
for (ph in count) {
if (INTERNAL == "1") {
# Internal check mode: show phrases repeated within the single file
if (count[ph] >= MINC) {
phrases[++n] = ph
phrase_count[ph] = count[ph]
phrase_files[ph] = 1 # Always 1 file in internal mode
phrase_file_list[ph] = file_list[ph]
}
} else if (SINGLE != "") {
# Single file vs others mode: only include phrases that appear in the single file
# and at least one other file
if (in_single[ph] && files[ph] >= 2 && count[ph] >= MINC) {
phrases[++n] = ph
phrase_count[ph] = count[ph]
phrase_files[ph] = files[ph]
phrase_file_list[ph] = file_list[ph]
}
} else {
# Normal mode: just check minimum count
if (count[ph] >= MINC) {
phrases[++n] = ph
phrase_count[ph] = count[ph]
phrase_files[ph] = files[ph]
phrase_file_list[ph] = file_list[ph]
}
}
}
# Mark phrases that are substrings of longer phrases with same/higher count
for (i = 1; i <= n; i++) {
for (j = 1; j <= n; j++) {
if (i != j && !is_substring[phrases[i]]) {
# Check if phrases[i] is a substring of phrases[j]
# and phrases[j] has same or higher count
if (index(phrases[j], phrases[i]) > 0 &&
phrase_count[phrases[j]] >= phrase_count[phrases[i]]) {
is_substring[phrases[i]] = 1
}
}
}
}
# Output non-substring phrases
for (ph in phrase_count) {
if (!is_substring[ph]) {
printf "%d%s%d%s%s%s%s\n", phrase_count[ph], TAB, phrase_files[ph], TAB, phrase_file_list[ph], TAB, ph
}
}
}' | \
sort -t "$TAB" -k1,1nr -k2,2nr -k4,4 | \
awk -F "$TAB" -v LIM="$limit" -v TAB="$TAB" '
BEGIN { print "COUNT" TAB "FILES" TAB "FILE_LIST" TAB "PHRASE" }
{ if (NR<=LIM) print }
'
rm -f "$tmp_list" "$tmp_pairs"
}
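# Example (hypothetical drafts/ directory of Markdown files):
#   cd drafts && repetitivephrases -r -m 4 -c 3 -n 50
# should print a TAB-separated table (COUNT, FILES, FILE_LIST, PHRASE) of
# 4-to-20-word phrases occurring at least 3 times; a shorter phrase contained
# in an equally frequent longer phrase is suppressed.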
paragraph_histogram() {
echo -e "\033[1;36mParagraph Length Histogram (words)\033[0m"
find . -type f \( -name "*.txt" -o -name "*.md" \) -print0 \
| xargs -0 awk 'BEGIN{RS="\n\n+"} NF{print NF}' \
| sort -nr \
| uniq -c \
| sort -nr \
| awk '{bar=""; for(i=0;i<$1/2;i++) bar=bar"█"; printf "\033[1;33m%4d\033[0m | %4d %s\n", $2, $1, bar}'
}
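# Example: run in a manuscript directory. Each row reads
#   <paragraph length in words> | <number of paragraphs> <bar>
# Note: the multi-character RS="\n\n+" paragraph split is a gawk extension;
# the stock awk on older macOS honors only the first character of RS.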
sentence_histogram() {
find . -type f \( -name "*.txt" -o -name "*.md" \) -print0 | \
xargs -0 cat | \
awk '
BEGIN { RS="[.!?]"; FS=" "; total=0; count=0; min=0; max=0 }
NF {
len=NF
total+=len; count++
if(min==0 || len<min) min=len
if(len>max) max=len
bins[len]++
}
END {
if (count == 0) { print "No sentences found."; exit }
mean = total/count
# median via cumulative counts
cumsum=0; median_idx=int((count+1)/2)
for(i=1;i<=max;i++) if(bins[i]) { cumsum+=bins[i]; if(!median_found && cumsum>=median_idx){ median=i; median_found=1 } }
# variance
varsum=0
for(i=1;i<=max;i++) if(bins[i]) varsum+=bins[i]*(i-mean)^2
variance=varsum/count
# histogram in bins of 5; lengths over 50 collapse into a single ">50" bin
printf "\033[1;36mSentence Length Histogram (words)\033[0m\n"
over=0
for(i=5;i<=max+4;i+=5){
bin_count=0
for(j=i-4;j<=i;j++) if(bins[j]) bin_count+=bins[j]
if(i>50){ over+=bin_count; continue }
if(bin_count>0){
pct=(bin_count/count)*100
barlen=int(pct/2)
bar=""
for(k=0;k<barlen;k++) bar=bar"█"
printf "\033[1;33m%-6s\033[0m %5d (%5.2f%%) %s\n", sprintf("%2d-%2d",i-4,i), bin_count, pct, bar
}
}
if(over>0){
pct=(over/count)*100
barlen=int(pct/2)
bar=""
for(k=0;k<barlen;k++) bar=bar"█"
printf "\033[1;33m%-6s\033[0m %5d (%5.2f%%) %s\n", ">50", over, pct, bar
}
# summary
printf "\n\033[1;32mSummary Stats:\033[0m\n"
printf " Sentences: %d\n Mean: %.2f\n Median: %d\n Variance: %.2f\n Min: %d\n Max: %d\n", count, mean, median, variance, min, max
}
'
}
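# Example output row: "26-30    412 ( 8.31%) ████", followed by summary stats.
# Higher variance generally indicates more varied sentence rhythm. As above,
# the regex RS requires gawk or a recent one-true-awk.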
wordfreq() {
local recurse=0 incl_csv="" excl_csv="" allfiles=0
local top=50 minlen=2 maxkb=1024 dropnum=0
local opt OPTIND=1
while getopts "rE:X:An:L:s:Nh" opt; do
case "$opt" in
r) recurse=1 ;;
E) incl_csv=$OPTARG ;;
X) excl_csv=$OPTARG ;;
A) allfiles=1 ;;
n) top=$OPTARG ;;
L) minlen=$OPTARG ;;
s) maxkb=$OPTARG ;;
N) dropnum=1 ;;
h)
printf '%s\n' "Usage: wordfreq [-r] [-E ext,...] [-X ext,...] [-A] [-n N] [-L MINLEN] [-s KB] [-N]"
return 0 ;;
\?) printf '%s\n' "wordfreq: invalid option" >&2; return 1 ;;
esac
done
shift $((OPTIND-1))
default_exts="txt,md,markdown,tex"
[[ -z $incl_csv ]] && incl_csv=$default_exts
incl_set=" $(printf '%s' "$incl_csv" | sed 's/,/ /g') "
excl_set=" $(printf '%s' "$excl_csv" | sed 's/,/ /g') "
maxbytes=$(( maxkb * 1024 ))
tmp_list=$(mktemp) || return 1
# Build file list
if [[ $recurse -eq 1 ]]; then
find . -type f -print > "$tmp_list"
else
# top-level only
find . -type d ! -name . -prune -o -type f -print | sed -n 's|^\./[^/]*$|&|p' > "$tmp_list"
fi
# Stream files, count in awk
LC_ALL_BACKUP="$LC_ALL"; export LC_ALL=C
while IFS= read -r f; do
[[ -f $f ]] || continue
base=${f##*/}; ext=""
case "$base" in *.*) ext=${base##*.} ;; esac
ext=$(printf '%s' "$ext" | tr '[:upper:]' '[:lower:]')
# extension filters
if [[ -n $excl_csv && -n $ext ]] && printf '%s' "$excl_set" | grep -q " $ext "; then continue; fi
if [[ $allfiles -ne 1 ]]; then
if [[ -z $ext ]] || ! printf '%s' "$incl_set" | grep -q " $ext "; then continue; fi
fi
# size filter
sz=$(wc -c < "$f" 2>/dev/null | awk '{print $1+0}')
[[ -n $sz && $sz -le $maxbytes ]] || continue
# texty heuristic: if file(1) exists, skip non-text; else rely on tokenization to strip noise
if command -v file >/dev/null 2>&1; then
if ! file "$f" | grep -qiE 'text|utf-8|ascii|unicode'; then
continue
fi
fi
# Feed file into awk counter
awk -v MINLEN="$minlen" -v DROPNUM="$dropnum" '
function emit(tok) {
if (length(tok) < MINLEN) return
if (DROPNUM && tok ~ /^[0-9]+([[:punct:]]*[0-9]+)*$/) return
count[tok]++
}
{
line=tolower($0)
gsub(/[^[:alnum:]'\'' ]+/, " ", line)
gsub(/[[:space:]]+/, " ", line)
sub(/^ /, "", line); sub(/ $/, "", line)
n=split(line,a," ")
for (i=1;i<=n;i++) if (a[i]!="") emit(a[i])
}
END {
for (w in count) printf "%s\t%d\n", w, count[w]
}
' "$f"
done < "$tmp_list" | \
awk -F '\t' '{
w=$1; c=$2+0; total[w]+=c
}
END { for (w in total) printf "%d\t%s\n", total[w], w }' | \
sort -t $'\t' -k1,1nr | head -n "$top"
export LC_ALL="$LC_ALL_BACKUP"
rm -f "$tmp_list" "$tmp_counts"
}
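# Example: top 25 words of at least 5 letters from Markdown files, numbers dropped:
#   wordfreq -E md -n 25 -L 5 -N
# Output is "<count><TAB><word>", most frequent first; files larger than the
# -s KB cap (default 1024) are skipped.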
avg_words_per_sentence() {
find . -type f \( -name "*.txt" -o -name "*.md" \) -print0 | \
xargs -0 cat | \
awk '
BEGIN {
RS="\n\n+"; FS="[.!?]"; para=0
printf "\033[1;36m\nAverage Words per Sentence by Paragraph\033[0m\n\n"
}
{
para++
total_s=0
total_w=0
for(i=1;i<=NF;i++){
n=split($i, words, " ")
# skip empty fields (e.g. the empty string after a trailing period)
if(n>0){ total_s++; total_w+=n }
}
if(total_s>0){
avg=total_w/total_s
barlen=int(avg/2) # scale bar length for visual
bar=""
for(j=0;j<barlen;j++) bar=bar"█"
color="\033[0;32m" # green default
if(avg>40) color="\033[0;31m" # red if very long
printf "Paragraph %3d: %5.2f words/sentence %s%s\033[0m\n", para, avg, color, bar
}
}
'
}
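# Example output line (the bar turns red when a paragraph averages >40 words/sentence):
#   Paragraph  12: 43.50 words/sentence █████████████████████
# Uses the same gawk-only RS="\n\n+" paragraph splitting noted above.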
wordbar() {
local recurse=0 incl_csv="" excl_csv="" allfiles=0 topn=20 width=50 show_total=0
local opt OPTIND=1
while getopts "rE:X:An:w:th" opt; do
case "$opt" in
r) recurse=1 ;; E) incl_csv=$OPTARG ;; X) excl_csv=$OPTARG ;; A) allfiles=1 ;;
n) topn=$OPTARG ;; w) width=$OPTARG ;; t) show_total=1 ;;
h) printf -- '%s\n' 'Usage: wordbar [-r] [-E ext,...] [-X ext,...] [-A] [-n N] [-w W] [-t]'; return 0 ;;
\?) printf -- '%s\n' "wordbar: invalid option" >&2; return 1 ;;
esac
done
shift $((OPTIND-1))
default_exts="txt,md,markdown,rst,tex"
[[ -z $incl_csv ]] && incl_csv=$default_exts
incl_set=" $(printf '%s' "$incl_csv" | sed 's/,/ /g') "
excl_set=" $(printf '%s' "$excl_csv" | sed 's/,/ /g') "
TAB="$(printf '\t')"
tmp_list=$(mktemp) || return 1
tmp_counts=$(mktemp) || { rm -f "$tmp_list"; return 1; }
if [[ $recurse -eq 1 ]]; then
find . -type f -print > "$tmp_list"
else
find . -type d ! -name . -prune -o -type f -print | sed -n 's|^\./[^/]*$|&|p' > "$tmp_list"
fi
LC_ALL_BACKUP="$LC_ALL"; export LC_ALL=C
total=0
while IFS= read -r fpath; do
[[ -f $fpath ]] || continue
base=${fpath##*/}; ext=""
case "$base" in *.*) ext=${base##*.} ;; esac
ext=$(printf '%s' "$ext" | tr '[:upper:]' '[:lower:]')
if [[ -n $excl_csv && -n $ext ]] && printf '%s' "$excl_set" | grep -q " $ext "; then continue; fi
if [[ $allfiles -ne 1 ]]; then
if [[ -z $ext ]] || ! printf '%s' "$incl_set" | grep -q " $ext "; then continue; fi
fi
count=$(wc -w < "$fpath" 2>/dev/null | awk '{print $1+0}')
[[ -n $count ]] || continue
printf '%s\t%s\n' "$count" "$fpath" >> "$tmp_counts"
total=$(( total + count ))
done < "$tmp_list"
if [[ ! -s $tmp_counts ]]; then
printf 'No files matched.\n' >&2
export LC_ALL="$LC_ALL_BACKUP"
rm -f "$tmp_list" "$tmp_counts"
return 1
fi
printf '%s\n' "WORDS BAR FILE"
printf '%s\n' "----------------------------------------------"
sort -t "$TAB" -k1,1nr "$tmp_counts" | head -n "$topn" \
| awk -F '\t' -v W="$width" '
NR==1 { max=$1+0 }
{
words=$1+0; file=$2
len = (max>0)? int((words/max)*W) : 0
bar = ""; for (i=0;i<len;i++) bar = bar "█"
printf "%8d %-*s %s\n", words, W, bar, file
}'
if [[ $show_total -eq 1 ]]; then
printf '%s\n' "----------------------------------"
printf "TOTAL: %d words\n" "$total"
fi
export LC_ALL="$LC_ALL_BACKUP"
rm -f "$tmp_list" "$tmp_counts"
}
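# Example: the ten longest files by word count, 40-column bars, with a total:
#   wordbar -r -n 10 -w 40 -t
# Bars are scaled relative to the largest file in the listing.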
unique_ratio() {
awk '{for(i=1;i<=NF;i++){ w=tolower($i); gsub(/[^a-z]/,"",w); if(w!="") a[w]++}}
END{uniq=0; total=0; for(w in a){uniq++; total+=a[w]}
if(total==0){ print "No words found."; exit }
printf "Unique word ratio: %.2f%%\n", (uniq/total)*100}' "$@"
}
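# Example: unique_ratio chapter*.md (hypothetical files)
# prints e.g. "Unique word ratio: 23.41%". Expect lower ratios on longer
# texts, since function words repeat while new words arrive ever more slowly.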
sentence_starters() {
python3 - "$@" <<'EOF'
import sys, re, glob, os
from collections import Counter
files = sys.argv[1:] or ["*.txt", "*.md"]
starters = Counter()
for pattern in files:
for fname in glob.glob(pattern):
if not os.path.isfile(fname):
continue
with open(fname, encoding="utf-8") as f:
text = f.read()
# Split into sentences
sentences = re.split(r'[.!?]+', text)
for s in sentences:
words = s.strip().split()
if words:
first_word = re.sub(r'[^A-Za-z]', '', words[0]).lower()
if first_word:
starters[first_word] += 1
for word, count in starters.most_common(20):
print(f"{count:5d} {word}")
EOF
}
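# Example: sentence_starters chapter1.txt (hypothetical file)
# prints the 20 most common sentence-opening words, e.g. "  142 the".
# With no arguments it scans *.txt and *.md in the current directory.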
dialogue_ratio() {
python3 - "$@" <<'EOF'
import sys, re, glob, os
files = sys.argv[1:] or ["*.txt", "*.md"]
dialogue_count = 0
sentence_count = 0
for pattern in files:
for fname in glob.glob(pattern):
if not os.path.isfile(fname):
continue
with open(fname, encoding="utf-8") as f:
text = f.read()
sentences = re.split(r'[.!?]+', text)
sentence_count += len([s for s in sentences if s.strip()])
dialogue_count += len(re.findall(r'"(.*?)"', text))
if sentence_count == 0:
print("No sentences found. Make sure you pass valid text files as arguments.")
else:
ratio = (dialogue_count / sentence_count) * 100
print(f"Dialogue sentences: {dialogue_count} of {sentence_count} ({ratio:.2f}%)")
EOF
}
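# Example: dialogue_ratio novel.txt (hypothetical file)
#   Dialogue sentences: 312 of 1480 (21.08%)
# Note: only straight double quotes (") are counted; curly quotes are not matched.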
readability() {
python3 - "$@" <<'PYTHON'
import sys, re
words = 0
sentences = 0
syllables = 0
for filename in sys.argv[1:]:
try:
with open(filename, 'r', encoding='utf-8') as f:
text = f.read()
except Exception as e:
print(f"Error reading {filename}: {e}")
continue
# split sentences
sents = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
sentences += len(sents)
# split words
ws = re.findall(r'\b\w+\b', text)
words += len(ws)
# syllables: count vowel groups in each word
for w in ws:
syllables += len(re.findall(r'[aeiouy]+', w.lower())) or 1
if sentences > 0 and words > 0:
asl = words / sentences
asw = syllables / words
fre = 206.835 - 1.015 * asl - 84.6 * asw
print(f"Flesch Reading Ease: {fre:.2f}")
else:
print("No sentences found. Make sure you pass valid text files as arguments.")
PYTHON
}
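# Example: readability chapter*.txt
# The formula 206.835 - 1.015*(words/sentences) - 84.6*(syllables/words) is
# the Flesch Reading Ease score: roughly 60-70 reads as plain English, and
# lower scores are harder. Syllables are approximated as vowel-group counts.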