Last active
October 31, 2025 09:21
-
-
Save neu5ron/86bac61083c135638452c92db8b3ae6d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# ==================================== SHEBANG CHECK ====================================
# Stop immediately when the script is started by a shell other than bash
# (for example `sh script.sh`). The guard itself sticks to POSIX `[ ... ]`
# syntax so that such a shell can still parse and execute it.
[ -n "$BASH_VERSION" ] || {
  echo "ERROR: This script requires bash. Run with: bash $0 $*" >&2
  exit 1
}
# Abort on the first failing command; a failure in any stage of a pipeline
# makes the whole pipeline fail.
set -e -o pipefail
# ==================================== Redaction patterns ====================================
# Python `re` patterns. They are handed to the embedded Python redactor through
# the environment and compiled there with re.compile().
ORG_TERMS_RE='org_terms\.[A-Za-z0-9_]+'
IPV4_RE='\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b'
IPV6_RE='\b(?:[0-9a-fA-F]{0,4}:){2,7}[0-9a-fA-F]{0,4}\b'
DOMAIN_RE='\b[a-zA-Z0-9.-]+\.(com|org|dev|local|mil|gov|net|io|tech|co)\b'
URL_RE='\bhttps?://[a-zA-Z0-9.-]+\.(com|org|dev|local|mil|gov|net|io|tech|co)[^\s]*\b'
USER_RE='(?i)\busername[ =:]+\S+'
PASS_RE='(?i)\bpassword[ =:]+\S+'
# A file whose content matches this pattern is skipped by the redactor.
# Alternatives: /.../ , r"..." , r'...' , re.compile(...).
# The '\'' idiom is used so the single quotes of the r'...' alternative reach
# Python intact; an ANSI-C $'...' literal is closed early by a \\' sequence.
# NOTE(review): with (?s) the `/.+/` alternative matches any file that contains
# two slashes with text in between (paths, URLs), so such files are skipped
# entirely — confirm this is intended.
SKIP_REGEX_RE='(?s)/.+/|r".+?"|r'\''.+?'\''|re\.compile\(.+?\)'
# Optional extra pattern supplied with --custom-regex (empty = none).
CUSTOM_RE=''
# ==================================== Defaults ====================================
# Initial settings; command-line parsing may overwrite them.
OUTPUT_STYLE=xml          # repomix output style
OUTPUT_PREFIX=repo_pack   # base name of the generated files
ALL_STYLES=1              # default: run all styles
# ==================================== USAGE ====================================
#######################################
# Print the help text to stdout and terminate the current shell.
# Arguments: $1 - exit status to terminate with (optional, default 0)
# Outputs:   help text on stdout
#
# The here-document is unquoted so that $0 expands. Inside it a literal
# backslash must therefore be written as `\\` (a lone backslash before a
# newline would join the lines), while double quotes need no escaping.
#######################################
print_usage() {
  local status="${1:-0}"
  cat <<EOF
Usage: $0 --repo-dir DIR [--copy-to DIR] [--output-prefix NAME] [--style xml|markdown|json|plain] [--clean] [--dry-run] [--custom-regex PATTERN] [--config-file PATH] [--all-styles] [--help] [--repomix [REPOMIX_ARGS]]
NOTE: --repomix MUST BE THE LAST OPTION.
Options:
  --repo-dir DIR        Source directory (required)
  --copy-to DIR         Copy repo to this dir before modifying (safe mode)
  --output-prefix NAME  Base name for output files (default: repo_pack)
  --style STYLE         Output style: xml (default), markdown, json, or plain
  --clean               Remove previous outputs before run
  --dry-run             Show what would be redacted/ignored, no changes
  --custom-regex PAT    Redact additional patterns (regex)
  --config-file PATH    Use existing repomix config JSON
  --all-styles          Run for all styles (default)
  --help                Show this help
  --repomix ARGS...     Pass all remaining arguments to repomix (must be last)
Examples:
  # Basic: XML output (default)
  $0 --repo-dir ./logstash_pipeline
  # Custom output name + Markdown
  $0 --repo-dir ./logstash_pipeline --output-prefix myproject_pack --style markdown
  # Use custom Repomix config
  $0 --repo-dir ./logstash_pipeline --config-file ./my-repomix-config.json
  # Clean run
  $0 --repo-dir ./logstash_pipeline --clean
  # Dry run
  $0 --repo-dir ./logstash_pipeline --dry-run
  # Redact API keys
  $0 --repo-dir ./logstash_pipeline --custom-regex 'API_KEY_[A-Z0-9]+'
  # Redact secrets + AWS keys
  $0 --repo-dir ./logstash_pipeline --custom-regex 'secret_\\d{8}|AKIA[0-9A-Z]{16}'
  # Exclude files/folders from Repomix
  $0 --repo-dir ./logstash_pipeline \\
    --repomix --ignore "**/*.log,docs/**,secrets/**"
  # Limit Repomix to specific files
  $0 --repo-dir ./logstash_pipeline \\
    --repomix --include "src/**/*.ts,*.md" --ignore "*.test.js,docs/**"
  # Limit token count + remove comments
  $0 --repo-dir ./logstash_pipeline \\
    --repomix --max-tokens 50000 --remove-comments
  # Full clean + custom redaction + Repomix flags
  $0 --repo-dir ./logstash_pipeline --clean \\
    --custom-regex 'token_[a-f0-9]{32}' \\
    --repomix --no-security-check --verbose
  # Redact JWT + internal hostnames
  $0 --repo-dir ./logstash_pipeline \\
    --custom-regex 'ey[A-Za-z0-9-_]+\\.[A-Za-z0-9-_]+\\.[A-Za-z0-9-_]+|internal-app-\\d+\\.corp\\.local'
  # Include git diffs/logs + truncate base64
  $0 --repo-dir ./logstash_pipeline \\
    --repomix --include-diffs --include-logs --truncate-base64
  # Compress + no summary + markdown
  $0 --repo-dir ./logstash_pipeline \\
    --repomix --compress --no-file-summary --style markdown
  # Token count tree + top 20 files
  $0 --repo-dir ./logstash_pipeline \\
    --repomix --token-count-tree --top-files-len 20
  # Dry run + custom + Repomix preview
  $0 --repo-dir ./logstash_pipeline --dry-run \\
    --custom-regex 'MY_SECRET_.*' \\
    --repomix --verbose --ignore "**/*.log,tmp/**"
EOF
  exit "$status"
}
# ==================================== CLI PARSING ====================================
# Report a usage error: message and help text go to stderr, exit status is 1.
# print_usage terminates the shell it runs in, so it is called in a subshell
# to keep control of the exit status here.
_parse_args_fail() {
  printf '%s\n' "$1" >&2
  ( print_usage ) >&2 || true
  exit 1
}

# Fail with a usage error unless option $1 is followed by a value.
# $2 is the number of arguments still to be parsed (including the option).
_parse_args_need_value() {
  if (( $2 < 2 )); then
    _parse_args_fail "Error: $1 requires a value"
  fi
}

#######################################
# Parse the command line into the script's global settings.
# Globals written: REPO_DIR, COPY_TO_DIR, OUTPUT_PREFIX, OUTPUT_STYLE,
#                  ALL_STYLES, CLEAN_FLAG, DRY_RUN, CUSTOM_RE, CONFIG_FILE,
#                  REPOMIX_ARGS
# Arguments: the script's command-line arguments
# Exits:     0 after --help; 1 on a usage error or when --repo-dir is not a
#            directory
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --repo-dir)
        _parse_args_need_value "$1" "$#"
        REPO_DIR="$2"; shift 2 ;;
      --copy-to)
        _parse_args_need_value "$1" "$#"
        COPY_TO_DIR="$2"; shift 2 ;;
      --output-prefix)
        _parse_args_need_value "$1" "$#"
        OUTPUT_PREFIX="$2"; shift 2 ;;
      --style)
        _parse_args_need_value "$1" "$#"
        case "${2,,}" in
          xml|markdown|json|plain)
            OUTPUT_STYLE="${2,,}"
            # An explicit style selects a single-style run; otherwise the
            # all-styles default would silently override the choice.
            ALL_STYLES=0 ;;
          *) _parse_args_fail "Invalid style: $2" ;;
        esac
        shift 2 ;;
      --clean) CLEAN_FLAG="--clean"; shift ;;
      --dry-run) DRY_RUN=1; shift ;;
      --custom-regex)
        _parse_args_need_value "$1" "$#"
        CUSTOM_RE="$2"; shift 2 ;;
      --config-file)
        _parse_args_need_value "$1" "$#"
        CONFIG_FILE="$2"; shift 2 ;;
      --all-styles) ALL_STYLES=1; shift ;;
      --help) print_usage ;;
      --repomix)
        # Everything after --repomix belongs to repomix.
        shift
        REPOMIX_ARGS="$*"
        break ;;
      -*)
        _parse_args_fail "Error: Unknown option $1 or misplaced --repomix" ;;
      *)
        _parse_args_fail "Positional args not allowed." ;;
    esac
  done
  if [[ -z "$REPO_DIR" ]]; then
    _parse_args_fail "Missing --repo-dir"
  fi
  if [[ ! -d "$REPO_DIR" ]]; then
    echo "Not a directory: $REPO_DIR" >&2
    exit 1
  fi
}
# ==================================== Copy Directory (Safe Mode) ====================================
#######################################
# When --copy-to was given, copy the repository to that directory once and
# switch all further work to the copy.
# Globals read:    COPY_TO_DIR, REPO_DIR
# Globals written: REPO_DIR, ABS_REPO_DIR, REPO_COPY_DONE
# Exits: 1 when a path cannot be resolved or the target overlaps the source
#
# The function is invoked once per output style. After the first call
# REPO_DIR already points at the copy, so repeating the `rm -rf` + `cp` would
# delete the working copy and then fail; REPO_COPY_DONE makes later calls
# no-ops.
#######################################
copy_repo() {
  [[ -n "$COPY_TO_DIR" ]] || return 0
  [[ -z "${REPO_COPY_DONE:-}" ]] || return 0

  local src_abs dst_parent dst_abs
  if ! src_abs="$(cd "$REPO_DIR" && pwd -P)"; then
    echo "Cannot resolve source directory: $REPO_DIR" >&2
    exit 1
  fi
  if [[ -d "$COPY_TO_DIR" ]]; then
    if ! dst_abs="$(cd "$COPY_TO_DIR" && pwd -P)"; then
      echo "Cannot resolve --copy-to directory: $COPY_TO_DIR" >&2
      exit 1
    fi
  else
    if ! dst_parent="$(cd "$(dirname "$COPY_TO_DIR")" && pwd -P)"; then
      echo "Parent directory of --copy-to target does not exist: $COPY_TO_DIR" >&2
      exit 1
    fi
    dst_abs="${dst_parent%/}/$(basename "$COPY_TO_DIR")"
  fi

  # The target is wiped before copying, so it must not be the source itself,
  # contain the source, or live inside the source.
  if [[ "$src_abs/" == "$dst_abs/"* || "$dst_abs/" == "$src_abs/"* ]]; then
    echo "Refusing to copy: --copy-to ($COPY_TO_DIR) overlaps the source directory ($REPO_DIR)" >&2
    exit 1
  fi

  echo "[*] Copying repo to safe working directory: $COPY_TO_DIR"
  rm -rf -- "$COPY_TO_DIR"
  cp -a -- "$REPO_DIR" "$COPY_TO_DIR"
  REPO_DIR="$COPY_TO_DIR"
  ABS_REPO_DIR="$(cd "$REPO_DIR" && pwd -P)"
  REPO_COPY_DONE=1
  echo "[*] Working on copy: $ABS_REPO_DIR"
}
# ==================================== Setup Paths ====================================
#######################################
# Derive every output/config path from the current settings and create the
# output and config directories below the current working directory.
# Globals read:    OUTPUT_STYLE, OUTPUT_PREFIX, REPO_DIR
# Globals written: OUTPUT_ROOT, REPOMIX_CONFIG_ROOT, RED_STR_FILE, EXT,
#                  OUTPUT_FILE, MD_OUT, TXT_OUT, PDF_OUT, ABS_REPO_DIR,
#                  ABS_OUTPUT_ROOT, ABS_CONFIG_ROOT, ABS_CONFIG_JSON,
#                  ABS_MD_OUT, ABS_TXT_OUT, ABS_PDF_OUT
#######################################
setup_paths() {
  OUTPUT_ROOT="./repomix_output"
  REPOMIX_CONFIG_ROOT="./repomix_configs"
  RED_STR_FILE="${OUTPUT_ROOT}/redacted_strings.txt"

  # File extension per style; xml also serves as the fallback.
  EXT="xml"
  case "$OUTPUT_STYLE" in
    markdown) EXT="md" ;;
    json) EXT="json" ;;
    plain) EXT="txt" ;;
  esac

  OUTPUT_FILE="${OUTPUT_PREFIX}.${EXT}"
  MD_OUT="${OUTPUT_ROOT}/${OUTPUT_FILE}"
  TXT_OUT="${OUTPUT_ROOT}/${OUTPUT_PREFIX}.txt"
  PDF_OUT="${OUTPUT_ROOT}/${OUTPUT_PREFIX}.pdf"

  # Resolve the repository first, then create and resolve the work directories.
  ABS_REPO_DIR="$(cd "$REPO_DIR" && pwd -P)"
  mkdir -p "$OUTPUT_ROOT" "$REPOMIX_CONFIG_ROOT"
  ABS_OUTPUT_ROOT="$(cd "$OUTPUT_ROOT" && pwd -P)"
  ABS_CONFIG_ROOT="$(cd "$REPOMIX_CONFIG_ROOT" && pwd -P)"
  ABS_CONFIG_JSON="${ABS_CONFIG_ROOT}/repomix.config.json"

  # Absolute twins of the output files: last path component only.
  ABS_MD_OUT="${ABS_OUTPUT_ROOT}/${MD_OUT##*/}"
  ABS_TXT_OUT="${ABS_OUTPUT_ROOT}/${TXT_OUT##*/}"
  ABS_PDF_OUT="${ABS_OUTPUT_ROOT}/${PDF_OUT##*/}"
}
# ==================================== Clean ====================================
#######################################
# Remove earlier outputs when --clean was given and start a fresh log of
# redacted strings. In dry-run mode nothing is removed or truncated.
# Globals read: CLEAN_FLAG, DRY_RUN, ABS_OUTPUT_ROOT, ABS_CONFIG_JSON,
#               RED_STR_FILE
#######################################
clean_outputs() {
  if [[ -n "$CLEAN_FLAG" ]]; then
    echo "[*] --clean: removing old outputs"
    if [[ -z "$DRY_RUN" ]]; then
      # ${...:?} aborts when the directory variable is empty; otherwise the
      # glob would expand to /* and rm would run against the filesystem root.
      # rm failures (e.g. sub-directories, which -f alone cannot remove) are
      # tolerated on purpose.
      rm -f -- "${ABS_OUTPUT_ROOT:?output directory is not set}"/* "$ABS_CONFIG_JSON" || true
    else
      echo "[DRY RUN] Would remove old outputs"
    fi
  fi
  if [[ -z "$DRY_RUN" ]]; then
    # Truncate (or create) the redaction log.
    : > "$RED_STR_FILE"
  fi
}
# ==================================== Redaction ====================================
#######################################
# Redact sensitive strings in the repository's .conf/.txt/.json files using an
# embedded Python 3 program; every matched string is appended to RED_STR_FILE.
# Globals read: ABS_REPO_DIR, ORG_TERMS_RE, IPV4_RE, IPV6_RE, DOMAIN_RE,
#               URL_RE, USER_RE, PASS_RE, SKIP_REGEX_RE, CUSTOM_RE,
#               RED_STR_FILE, DRY_RUN
# Outputs: one line per changed file plus a summary on stdout; warnings about
#          unreadable/unwritable files on stderr
#######################################
run_redaction() {
  env REPO_DIR="$ABS_REPO_DIR" \
    ORG_TERMS_RE="$ORG_TERMS_RE" IPV4_RE="$IPV4_RE" IPV6_RE="$IPV6_RE" \
    DOMAIN_RE="$DOMAIN_RE" URL_RE="$URL_RE" USER_RE="$USER_RE" PASS_RE="$PASS_RE" \
    SKIP_REGEX_RE="$SKIP_REGEX_RE" CUSTOM_RE="$CUSTOM_RE" RED_STR_FILE="$RED_STR_FILE" \
    DRY_RUN="$DRY_RUN" \
    python3 - <<"PY"
import hashlib
import os
import re
import sys

repo = os.environ["REPO_DIR"]
red_file = os.environ["RED_STR_FILE"]
dry_run = os.environ.get("DRY_RUN") == "1"


def env_re(name):
    """Compile the pattern stored in environment variable `name`."""
    return re.compile(os.environ[name])


skip_re = env_re("SKIP_REGEX_RE")

# (pattern, replacement) pairs, applied to each line in this order.
# URL comes before DOMAIN: once the host name has been replaced the URL
# pattern can no longer match, and the URL path would stay in the file.
rules = [
    (env_re("ORG_TERMS_RE"), "_REDACTED_ORG_"),
    (env_re("IPV4_RE"), "_REDACTED_IPV4_"),
    (env_re("IPV6_RE"), "_REDACTED_IPV6_"),
    (env_re("URL_RE"), "_REDACTED_URL_"),
    (env_re("DOMAIN_RE"), "_REDACTED_DOMAIN_"),
]
# CUSTOM_RE is always present in the environment and empty by default. An
# empty pattern matches at every position, so it must not be applied at all.
custom_src = os.environ.get("CUSTOM_RE", "")
if custom_src:
    rules.append((re.compile(custom_src), "_REDACTED_CUSTOM_"))
# Credentials are only redacted in files whose name ends with "input.conf".
cred_rules = [
    (env_re("USER_RE"), "_REDACTED_USER_"),
    (env_re("PASS_RE"), "_REDACTED_PASS_"),
]


def md5_file(path):
    """Return the hex MD5 digest of the file's bytes."""
    digest = hashlib.md5()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()


def redact(pattern, token, text):
    """Replace every non-empty match of `pattern` in `text` with `token`."""
    return pattern.sub(lambda m: token if m.group(0) else "", text)


edited = 0
for root, _, files in os.walk(repo):
    for fn in files:
        path = os.path.join(root, fn)
        rel = os.path.relpath(path, repo)
        is_input_conf = fn.endswith("input.conf")
        exts = (".conf",) if is_input_conf else (".conf", ".txt", ".json")
        if not fn.endswith(exts):
            continue
        try:
            before = md5_file(path)
            # newline="" keeps the original line endings untouched.
            with open(path, "r", encoding="utf-8", errors="ignore", newline="") as fh:
                lines = fh.readlines()
        except OSError as exc:
            print(f"[WARN] Skipping unreadable file {rel}: {exc}", file=sys.stderr)
            continue
        # Files that appear to contain regex literals are left alone.
        if skip_re.search("".join(lines)):
            continue

        active = rules + cred_rules if is_input_conf else rules
        matches = []
        new_lines = []
        for idx, line in enumerate(lines, 1):
            # Matches are collected on the original line, before substitution.
            for pattern, _token in active:
                matches.extend(
                    (idx, m.group(0)) for m in pattern.finditer(line) if m.group(0)
                )
            for pattern, token in active:
                line = redact(pattern, token, line)
            new_lines.append(line)
        if not matches:
            continue

        with open(red_file, "a", encoding="utf-8") as out:
            for line_num, match in sorted(matches):
                out.write(f"{rel}:{line_num}: {match}\n")

        new_text = "".join(new_lines)
        try:
            if dry_run:
                changed = hashlib.md5(new_text.encode("utf-8")).hexdigest() != before
            else:
                with open(path, "w", encoding="utf-8", newline="") as fh:
                    fh.write(new_text)
                changed = md5_file(path) != before
        except OSError as exc:
            print(f"[WARN] Could not update {rel}: {exc}", file=sys.stderr)
            continue
        if changed:
            label = "[DRY RUN] Would redact" if dry_run else "[REDACTED]"
            print(f"{label} {rel}: {len(matches)} unique")
            edited += 1

prefix = "[DRY RUN] " if dry_run else ""
print(f"{prefix}Edited: {edited} files")
PY
  if [[ -s "$RED_STR_FILE" ]]; then
    echo "[*] Redacted strings saved: $RED_STR_FILE"
  fi
}
# ==================================== Repomix Config ====================================
# Print $1 with backslashes and double quotes escaped for use inside a JSON
# string literal.
_repomix_json_escape() {
  printf '%s' "$1" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g'
}

#######################################
# Select the repomix configuration file: use the one passed with
# --config-file, or generate a default one (not written in dry-run mode).
# Globals read:    CONFIG_FILE, ABS_CONFIG_ROOT, DRY_RUN, OUTPUT_STYLE,
#                  OUTPUT_FILE
# Globals written: ABS_CONFIG_JSON
# Exits: 1 when the file given with --config-file does not exist
#######################################
write_repomix_config() {
  if [[ -n "$CONFIG_FILE" ]]; then
    # Check the file first so that a missing directory also produces the
    # friendly message instead of a bare `cd` failure.
    if [[ ! -f "$CONFIG_FILE" ]]; then
      echo "Config file not found: $CONFIG_FILE" >&2
      exit 1
    fi
    ABS_CONFIG_JSON="$(cd "$(dirname "$CONFIG_FILE")" && pwd -P)/$(basename "$CONFIG_FILE")"
    echo "[*] Using provided config: $ABS_CONFIG_JSON"
    return 0
  fi

  ABS_CONFIG_JSON="$ABS_CONFIG_ROOT/repomix.config.json"
  if [[ -n "$DRY_RUN" ]]; then
    echo "[DRY RUN] Would write config with style='$OUTPUT_STYLE' and filePath='$OUTPUT_FILE'"
    return 0
  fi

  echo "[*] Writing config: $ABS_CONFIG_JSON"
  mkdir -p "$ABS_CONFIG_ROOT"
  # The values are interpolated directly (JSON-escaped) rather than patched in
  # afterwards with sed, where characters such as & | \ in the file name would
  # be interpreted by sed and corrupt the result.
  local json_file_path json_style
  json_file_path="$(_repomix_json_escape "$OUTPUT_FILE")"
  json_style="$(_repomix_json_escape "$OUTPUT_STYLE")"
  cat > "$ABS_CONFIG_JSON" <<EOF
{
  "\$schema": "https://repomix.com/schemas/latest/schema.json",
  "output": {
    "filePath": "$json_file_path",
    "style": "$json_style",
    "fileSummary": true,
    "directoryStructure": true,
    "files": true,
    "showLineNumbers": true
  },
  "include": ["**/*"],
  "ignore": {
    "useGitignore": false,
    "useDefaultPatterns": true,
    "customPatterns": [
      "**/*.jks","**/*.p12","**/*.pfx","**/*.crt","**/*.key",
      "**/*.der","**/*.pem","**/*.cer","**/*.so","**/*.dll",
      "**/*.bin","**/*.exe","**/*.dylib","**/*.class","**/*.jar"
    ]
  },
  "security": { "enableSecurityCheck": true }
}
EOF
}
# ==================================== Run Repomix + Ignored Files ====================================
#######################################
# Run repomix on the repository. In dry-run mode its verbose output is only
# scanned for "ignored:" lines, which are saved and summarised; otherwise the
# pack file is written to MD_OUT.
# Globals read:    DRY_RUN, ABS_OUTPUT_ROOT, ABS_CONFIG_JSON, OUTPUT_STYLE,
#                  REPOMIX_ARGS, ABS_REPO_DIR, MD_OUT
# Globals written: REPOMIX_IGNORED_LOG, COUNT
# Exits: 1 when repomix is not installed
#######################################
run_repomix() {
  if ! command -v repomix >/dev/null; then
    echo "repomix missing" >&2
    exit 1
  fi

  # Split the pass-through options on whitespace WITHOUT pathname expansion;
  # an unquoted $REPOMIX_ARGS would let the shell expand patterns such as
  # **/*.log before repomix sees them. read returns non-zero at end of input,
  # hence the `|| true`.
  local -a extra_args=()
  IFS=$' \t\n' read -r -d '' -a extra_args <<< "$REPOMIX_ARGS" || true

  REPOMIX_IGNORED_LOG="$ABS_OUTPUT_ROOT/repomix-ignored.txt"
  if [[ -n "$DRY_RUN" ]]; then
    echo "[DRY RUN] Previewing Repomix (no output written) – showing ignored files…"
    echo "[INFO] Ignored file list will be saved to: $REPOMIX_IGNORED_LOG"
    # One sed call filters and strips the "ignored:" prefix. A grep stage
    # would exit 1 when nothing is ignored, and with pipefail + errexit that
    # would abort the script before the "no files" message below.
    repomix --config "$ABS_CONFIG_JSON" \
      --style "$OUTPUT_STYLE" \
      --verbose \
      "${extra_args[@]}" \
      "$ABS_REPO_DIR" 2>&1 \
      | sed -n -E 's/^[[:space:]]+ignored:[[:space:]]*//p' > "$REPOMIX_IGNORED_LOG"
    if [[ -s "$REPOMIX_IGNORED_LOG" ]]; then
      # Arithmetic expansion strips the padding some wc implementations add.
      COUNT=$(( $(wc -l < "$REPOMIX_IGNORED_LOG") ))
      echo "[DRY RUN] Repomix would ignore $COUNT file(s):"
      head -n 10 "$REPOMIX_IGNORED_LOG"
      if (( COUNT > 10 )); then
        echo "… ($((COUNT - 10)) more)"
      fi
    else
      echo "[DRY RUN] Repomix would ignore **no** files"
    fi
  else
    echo "[*] Running Repomix (normal mode)…"
    repomix --config "$ABS_CONFIG_JSON" \
      --output "$MD_OUT" \
      --style "$OUTPUT_STYLE" \
      "${extra_args[@]}" \
      "$ABS_REPO_DIR"
    # Collect any default-named outputs left in the working directory.
    local f
    for f in repomix-output.*; do
      if [[ -f "$f" ]]; then
        mv "$f" "$ABS_OUTPUT_ROOT/"
      fi
    done
  fi
  echo "[INFO] Repomix ignored file list: $REPOMIX_IGNORED_LOG"
}
# ==================================== Convert to TXT/PDF ====================================
#######################################
# Convert the markdown pack to PDF with pandoc. The conversion is optional:
# it only runs for the markdown style, and a missing input file, a missing
# pandoc, or a failing pandoc run only produce a warning.
# Globals read: DRY_RUN, OUTPUT_STYLE, ABS_MD_OUT
#######################################
convert_outputs() {
  if [[ -n "$DRY_RUN" ]]; then
    echo "[DRY RUN] Would convert to PDF if style=markdown and pandoc available"
    return 0
  fi
  if [[ "$OUTPUT_STYLE" != "markdown" ]]; then
    echo "[INFO] Skipping PDF conversion (only runs for markdown style)"
    return 0
  fi

  local can_convert=1
  if [[ ! -f "$ABS_MD_OUT" ]]; then
    echo "[WARNING] Markdown output file missing for PDF conversion"
    can_convert=0
  fi
  if ! command -v pandoc >/dev/null; then
    echo "[WARNING] pandoc not found — skipping PDF conversion"
    can_convert=0
  fi
  if (( ! can_convert )); then
    return 0
  fi

  echo "[*] Converting markdown to PDF..."
  local pdf_out="${ABS_MD_OUT%.*}.pdf"
  # Testing pandoc in an `if` keeps a failed conversion from tripping errexit
  # and aborting the remaining styles.
  if pandoc --from=markdown "$ABS_MD_OUT" -o "$pdf_out"; then
    echo "[*] PDF created: $pdf_out"
  else
    echo "[WARNING] pandoc failed to convert $ABS_MD_OUT — skipping PDF conversion"
  fi
}
# ==================================== Run Once ====================================
#######################################
# Execute the full pipeline for the currently selected OUTPUT_STYLE:
# copy, path setup, optional cleanup, redaction, config, repomix, conversion.
# Arguments: $1 - "true" to leave outputs of earlier style runs untouched;
#                 any other value runs the cleanup step
# Globals read: DRY_RUN, ABS_REPO_DIR, ABS_OUTPUT_ROOT
#######################################
run_once() {
  local keep_previous_outputs="$1"

  copy_repo
  setup_paths

  [[ -z "$DRY_RUN" ]] || echo "[DRY RUN] No files will be modified or written."
  printf '%s\n' \
    "[*] Redaction in: $ABS_REPO_DIR" \
    "[*] Output directory: $ABS_OUTPUT_ROOT"

  if [[ "$keep_previous_outputs" == "true" ]]; then
    echo "[INFO] Skipping cleanup for subsequent style run"
  else
    clean_outputs
  fi

  # The remaining stages, in their fixed order.
  local stage
  for stage in run_redaction write_repomix_config run_repomix convert_outputs; do
    "$stage"
  done

  echo "[*] Outputs:"
  if [[ -n "$DRY_RUN" ]]; then
    echo "[DRY RUN] Outputs would be in: $ABS_OUTPUT_ROOT"
  else
    ls -lh "$ABS_OUTPUT_ROOT"
  fi
}
# ==================================== Main ====================================
#######################################
# Entry point: parse the command line, then run the pipeline either once or
# once per output style. Cleanup is only requested for the first style so
# that later styles keep the outputs of earlier ones.
# Arguments: the script's command-line arguments
# Globals read:    ALL_STYLES, ABS_OUTPUT_ROOT
# Globals written: OUTPUT_STYLE (one value per iteration in all-styles mode)
#######################################
main() {
  parse_args "$@"
  if ! [[ "$ALL_STYLES" -eq 1 ]]; then
    run_once "false"
  else
    local rule="========================================================================"
    local skip_clean="false" # clean only on first style
    for style in xml markdown json plain; do
      echo "$rule"
      echo "[*] Running with --style $style"
      echo "$rule"
      OUTPUT_STYLE="$style"
      run_once "$skip_clean"
      skip_clean="true" # skip cleaning on subsequent styles
    done
  fi
  echo "🎉 All done. Outputs available under: $ABS_OUTPUT_ROOT"
}
main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment