Skip to content

Instantly share code, notes, and snippets.

@neu5ron
Last active October 31, 2025 09:21
Show Gist options
  • Select an option

  • Save neu5ron/86bac61083c135638452c92db8b3ae6d to your computer and use it in GitHub Desktop.

Select an option

Save neu5ron/86bac61083c135638452c92db8b3ae6d to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
# ==================================== SHEBANG CHECK ====================================
# The rest of this file relies on bash-only syntax, so stop early with a hint
# when another shell (e.g. `sh script.sh`) is interpreting it. This guard is
# written in plain POSIX syntax so that such a shell can still evaluate it.
case "${BASH_VERSION:-}" in
  "")
    echo "ERROR: This script requires bash. Run with: bash $0 $*" >&2
    exit 1
    ;;
esac
# Abort on the first failing command ...
set -e
# ... and treat a pipeline as failed when any of its stages fails.
set -o pipefail
# ==================================== Redaction patterns ====================================
# All patterns below use Python `re` syntax; they are handed to the embedded
# Python redaction step through the environment.
ORG_TERMS_RE='org_terms\.[A-Za-z0-9_]+'
IPV4_RE='\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b'
IPV6_RE='\b(?:[0-9a-fA-F]{0,4}:){2,7}[0-9a-fA-F]{0,4}\b'
DOMAIN_RE='\b[a-zA-Z0-9.-]+\.(com|org|dev|local|mil|gov|net|io|tech|co)\b'
URL_RE='\bhttps?://[a-zA-Z0-9.-]+\.(com|org|dev|local|mil|gov|net|io|tech|co)[^\s]*\b'
USER_RE='(?i)\busername[ =:]+\S+'
PASS_RE='(?i)\bpassword[ =:]+\S+'
# Files whose content matches this pattern are skipped by the redaction step.
# Alternatives: /.../ literals, r"..." and r'...' raw strings, re.compile(...).
# The previous $'...' spelling closed the quoted string early at the \\' and
# silently merged the last two alternatives into one meaningless pattern; a
# double-quoted string keeps all four alternatives intact.
# NOTE(review): with (?s) the first alternative `/.+/` matches any file that
# contains two slashes with at least one character between them, so most files
# holding paths or URLs are skipped entirely - confirm this is intended.
SKIP_REGEX_RE="(?s)/.+/|r\".+?\"|r'.+?'|re\\.compile\\(.+?\\)"
CUSTOM_RE=''
# ==================================== Defaults ====================================
OUTPUT_STYLE="xml"
OUTPUT_PREFIX="repo_pack"
ALL_STYLES=1 # default: run all styles
# ==================================== USAGE ====================================
#######################################
# Print the help text to stdout and terminate the script.
# Arguments: $1 - optional exit status (default 0), so error paths can report
#                 failure while `--help` keeps exiting successfully.
# Outputs:   usage synopsis, option list and examples on stdout
# Returns:   never returns; calls exit
#######################################
print_usage() {
  # The heredoc delimiter is unquoted so that $0 expands. In such a heredoc a
  # backslash only escapes $, `, \ and newline, so double quotes are written
  # bare (a \" would be printed literally) and a trailing backslash joins the
  # next line onto the current one.
  cat <<EOF
Usage: $0 --repo-dir DIR [--copy-to DIR] [--output-prefix NAME] [--style xml|markdown|json|plain] [--clean] [--dry-run] [--custom-regex PATTERN] [--config-file PATH] [--all-styles] [--help] [--repomix REPOMIX_ARGS...]
NOTE: --repomix MUST BE THE LAST OPTION.
Options:
--repo-dir DIR Source directory (required)
--copy-to DIR Copy repo to this dir before modifying (safe mode)
--output-prefix NAME Base name for output files (default: repo_pack)
--style STYLE Output style: xml (default), markdown, json, or plain
--clean Remove previous outputs before run
--dry-run Show what would be redacted/ignored, no changes
--custom-regex PAT Redact additional patterns (regex)
--config-file PATH Use existing repomix config JSON
--all-styles Run for all styles (default)
--repomix ARGS... Pass every remaining argument to repomix (must be last)
--help Show this help
Examples:
# Basic: XML output (default)
$0 --repo-dir ./logstash_pipeline
# Custom output name + Markdown
$0 --repo-dir ./logstash_pipeline --output-prefix myproject_pack --style markdown
# Use custom Repomix config
$0 --repo-dir ./logstash_pipeline --config-file ./my-repomix-config.json
# Clean run
$0 --repo-dir ./logstash_pipeline --clean
# Dry run
$0 --repo-dir ./logstash_pipeline --dry-run
# Redact API keys
$0 --repo-dir ./logstash_pipeline --custom-regex 'API_KEY_[A-Z0-9]+'
# Redact secrets + AWS keys
$0 --repo-dir ./logstash_pipeline --custom-regex 'secret_\\d{8}|AKIA[0-9A-Z]{16}'
# Exclude files/folders from Repomix
$0 --repo-dir ./logstash_pipeline \
--repomix --ignore "**/*.log,docs/**,secrets/**"
# Limit Repomix to specific files
$0 --repo-dir ./logstash_pipeline \
--repomix --include "src/**/*.ts,*.md" --ignore "*.test.js,docs/**"
# Limit token count + remove comments
$0 --repo-dir ./logstash_pipeline \
--repomix --max-tokens 50000 --remove-comments
# Full clean + custom redaction + Repomix flags
$0 --repo-dir ./logstash_pipeline --clean \
--custom-regex 'token_[a-f0-9]{32}' \
--repomix --no-security-check --verbose
# Redact JWT + internal hostnames
$0 --repo-dir ./logstash_pipeline \
--custom-regex 'ey[A-Za-z0-9-_]+\\.[A-Za-z0-9-_]+\\.[A-Za-z0-9-_]+|internal-app-\\d+\\.corp\\.local'
# Include git diffs/logs + truncate base64
$0 --repo-dir ./logstash_pipeline \
--repomix --include-diffs --include-logs --truncate-base64
# Compress + no summary + markdown
$0 --repo-dir ./logstash_pipeline \
--repomix --compress --no-file-summary --style markdown
# Token count tree + top 20 files
$0 --repo-dir ./logstash_pipeline \
--repomix --token-count-tree --top-files-len 20
# Dry run + custom + Repomix preview
$0 --repo-dir ./logstash_pipeline --dry-run \
--custom-regex 'MY_SECRET_.*' \
--repomix --verbose --ignore "**/*.log,tmp/**"
EOF
  exit "${1:-0}"
}
# ==================================== CLI PARSING ====================================
# Private helper: report a command-line error on stderr, show the usage text
# there as well, and terminate with a failure status.
_parse_args_fail() {
  echo "$1" >&2
  # print_usage terminates the shell it runs in; the subshell confines that so
  # the failure status below is what the caller of the script sees.
  (print_usage) >&2
  exit 1
}

#######################################
# Parse the command line into the script's global settings.
# Globals written: REPO_DIR, COPY_TO_DIR, OUTPUT_PREFIX, OUTPUT_STYLE,
#                  ALL_STYLES, CLEAN_FLAG, DRY_RUN, CUSTOM_RE, CONFIG_FILE,
#                  REPOMIX_ARGS
# Arguments: the script's command-line arguments
# Returns:   0 on success; exits 1 on any usage error, exits 0 after --help
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    # Options that take a value must actually be followed by one; otherwise
    # `shift 2` below would fail without any explanation.
    case "$1" in
      --repo-dir | --copy-to | --output-prefix | --style | --custom-regex | --config-file)
        if [[ $# -lt 2 ]]; then
          _parse_args_fail "Error: option $1 requires a value"
        fi
        ;;
    esac

    case "$1" in
      --repo-dir)
        REPO_DIR="$2"
        shift 2
        ;;
      --copy-to)
        COPY_TO_DIR="$2"
        shift 2
        ;;
      --output-prefix)
        OUTPUT_PREFIX="$2"
        shift 2
        ;;
      --style)
        # Style names are matched case-insensitively.
        case "${2,,}" in
          xml | markdown | json | plain)
            OUTPUT_STYLE="${2,,}"
            # An explicit style turns off the run-every-style default;
            # otherwise the requested style would be ignored. A later
            # --all-styles switches it back on.
            ALL_STYLES=0
            ;;
          *)
            _parse_args_fail "Invalid style: $2"
            ;;
        esac
        shift 2
        ;;
      --clean)
        CLEAN_FLAG="--clean"
        shift
        ;;
      --dry-run)
        DRY_RUN=1
        shift
        ;;
      --custom-regex)
        CUSTOM_RE="$2"
        shift 2
        ;;
      --config-file)
        CONFIG_FILE="$2"
        shift 2
        ;;
      --all-styles)
        ALL_STYLES=1
        shift
        ;;
      --help)
        print_usage
        ;;
      --repomix)
        # Everything after --repomix is kept, space-joined, for repomix.
        shift
        REPOMIX_ARGS="$*"
        break
        ;;
      -*)
        _parse_args_fail "Error: Unknown option $1 or misplaced --repomix"
        ;;
      *)
        _parse_args_fail "Positional args not allowed."
        ;;
    esac
  done

  if [[ -z "${REPO_DIR:-}" ]]; then
    _parse_args_fail "Missing --repo-dir"
  fi
  if [[ ! -d "$REPO_DIR" ]]; then
    echo "Not a directory: $REPO_DIR" >&2
    exit 1
  fi
}
# ==================================== Copy Directory (Safe Mode) ====================================
#######################################
# Safe mode: copy the source repository to COPY_TO_DIR and continue working on
# the copy. Does nothing when COPY_TO_DIR is empty.
# Globals read:    COPY_TO_DIR, REPO_DIR
# Globals written: REPO_DIR, ABS_REPO_DIR, REPO_COPY_DONE
# Returns: 0 on success; exits 1 when the copy target would destroy the source
#######################################
copy_repo() {
  if [[ -z "${COPY_TO_DIR:-}" ]]; then
    return 0
  fi

  # This function is invoked once per output style. After the first call
  # REPO_DIR already points at the copy, and copying again would first delete
  # that copy and then try to copy it from itself.
  if [[ -n "${REPO_COPY_DONE:-}" ]]; then
    return 0
  fi

  local src_abs dst_abs
  src_abs="$(cd "$REPO_DIR" && pwd -P)"
  if [[ -d "$COPY_TO_DIR" ]]; then
    dst_abs="$(cd "$COPY_TO_DIR" && pwd -P)"
    # The target is wiped below; refuse when it is the source itself or one of
    # its parent directories, since that would delete the original.
    if [[ "$src_abs" == "$dst_abs" || "$src_abs" == "${dst_abs%/}/"* ]]; then
      echo "Refusing to copy: --copy-to ($COPY_TO_DIR) is the source directory or contains it" >&2
      exit 1
    fi
  fi

  echo "[*] Copying repo to safe working directory: $COPY_TO_DIR"
  rm -rf -- "${COPY_TO_DIR:?}"
  cp -a -- "$REPO_DIR" "$COPY_TO_DIR"
  REPO_DIR="$COPY_TO_DIR"
  REPO_COPY_DONE=1
  ABS_REPO_DIR="$(cd "$REPO_DIR" && pwd -P)"
  echo "[*] Working on copy: $ABS_REPO_DIR"
}
# ==================================== Setup Paths ====================================
#######################################
# Derive every relative and absolute path used by the later stages and create
# the output and config directories under the current working directory.
# Globals read:    OUTPUT_STYLE, OUTPUT_PREFIX, REPO_DIR
# Globals written: OUTPUT_ROOT, REPOMIX_CONFIG_ROOT, RED_STR_FILE, EXT,
#                  OUTPUT_FILE, MD_OUT, TXT_OUT, PDF_OUT, ABS_REPO_DIR,
#                  ABS_OUTPUT_ROOT, ABS_CONFIG_ROOT, ABS_CONFIG_JSON,
#                  ABS_MD_OUT, ABS_TXT_OUT, ABS_PDF_OUT
#######################################
setup_paths() {
  OUTPUT_ROOT="./repomix_output"
  REPOMIX_CONFIG_ROOT="./repomix_configs"
  RED_STR_FILE="$OUTPUT_ROOT/redacted_strings.txt"

  # File extension per style; any unrecognised style falls back to xml.
  if [[ "$OUTPUT_STYLE" == "markdown" ]]; then
    EXT="md"
  elif [[ "$OUTPUT_STYLE" == "json" ]]; then
    EXT="json"
  elif [[ "$OUTPUT_STYLE" == "plain" ]]; then
    EXT="txt"
  else
    EXT="xml"
  fi

  OUTPUT_FILE="${OUTPUT_PREFIX}.${EXT}"
  MD_OUT="$OUTPUT_ROOT/$OUTPUT_FILE"
  TXT_OUT="$OUTPUT_ROOT/${OUTPUT_PREFIX}.txt"
  PDF_OUT="$OUTPUT_ROOT/${OUTPUT_PREFIX}.pdf"

  # Resolve the repository first, then create and resolve our own directories.
  ABS_REPO_DIR="$(cd "$REPO_DIR" && pwd -P)"
  mkdir -p "$OUTPUT_ROOT" "$REPOMIX_CONFIG_ROOT"
  ABS_OUTPUT_ROOT="$(cd "$OUTPUT_ROOT" && pwd -P)"
  ABS_CONFIG_ROOT="$(cd "$REPOMIX_CONFIG_ROOT" && pwd -P)"
  ABS_CONFIG_JSON="$ABS_CONFIG_ROOT/repomix.config.json"

  # Only the final path component of each output is placed under the
  # absolute output root.
  ABS_MD_OUT="$ABS_OUTPUT_ROOT/${MD_OUT##*/}"
  ABS_TXT_OUT="$ABS_OUTPUT_ROOT/${TXT_OUT##*/}"
  ABS_PDF_OUT="$ABS_OUTPUT_ROOT/${PDF_OUT##*/}"
}
# ==================================== Clean ====================================
#######################################
# Optionally delete earlier outputs, then start a fresh redaction log.
# Globals read: CLEAN_FLAG, DRY_RUN, ABS_OUTPUT_ROOT, ABS_CONFIG_JSON,
#               RED_STR_FILE
# Outputs: progress messages on stdout
#######################################
clean_outputs() {
  if [[ -n "$CLEAN_FLAG" ]]; then
    echo "[*] --clean: removing old outputs"
    if [[ -z "$DRY_RUN" ]]; then
      # ${...:?} aborts instead of expanding to `rm -f /*` should the output
      # root ever be empty or unset. Failures (for example a sub-directory in
      # the output root, which plain `rm -f` cannot remove) are deliberately
      # ignored: cleaning is best effort.
      rm -f -- "${ABS_OUTPUT_ROOT:?output directory is not set}"/* "$ABS_CONFIG_JSON" || true
    else
      echo "[DRY RUN] Would remove old outputs"
    fi
  fi

  # Outside dry-run mode, truncate (or create) the log of redacted strings.
  if [[ -z "$DRY_RUN" ]]; then
    : >"$RED_STR_FILE"
  fi
}
# ==================================== Redaction ====================================
#######################################
# Redact sensitive strings in place in every *.conf, *.txt and *.json file
# below the repository, using an embedded Python program, and append each
# matched string to the redaction log.
# Globals read: ABS_REPO_DIR, ORG_TERMS_RE, IPV4_RE, IPV6_RE, DOMAIN_RE,
#               URL_RE, USER_RE, PASS_RE, SKIP_REGEX_RE, CUSTOM_RE,
#               RED_STR_FILE, DRY_RUN
# Outputs: one line per changed file plus a summary on stdout
# Returns: status of the Python program
#######################################
run_redaction() {
  # Settings travel to Python through the environment; the heredoc delimiter
  # is quoted so the shell leaves the Python source untouched.
  env REPO_DIR="$ABS_REPO_DIR" \
    ORG_TERMS_RE="$ORG_TERMS_RE" IPV4_RE="$IPV4_RE" IPV6_RE="$IPV6_RE" \
    DOMAIN_RE="$DOMAIN_RE" URL_RE="$URL_RE" USER_RE="$USER_RE" PASS_RE="$PASS_RE" \
    SKIP_REGEX_RE="$SKIP_REGEX_RE" CUSTOM_RE="${CUSTOM_RE:-}" RED_STR_FILE="$RED_STR_FILE" \
    DRY_RUN="${DRY_RUN:-}" \
    python3 - <<"PY"
import os
import re
import sys

repo = os.environ["REPO_DIR"]
red_file = os.environ["RED_STR_FILE"]
dry_run = os.environ.get("DRY_RUN") == "1"
skip_re = re.compile(os.environ["SKIP_REGEX_RE"])

# (pattern, replacement) pairs, applied in this order to every line. The URL
# rule runs before the domain rule: the other way round the host name is
# replaced first, the URL pattern can no longer match, and the URL's path and
# query string are left in the file.
rules = [
    (re.compile(os.environ["ORG_TERMS_RE"]), "_REDACTED_ORG_"),
    (re.compile(os.environ["IPV4_RE"]), "_REDACTED_IPV4_"),
    (re.compile(os.environ["IPV6_RE"]), "_REDACTED_IPV6_"),
    (re.compile(os.environ["URL_RE"]), "_REDACTED_URL_"),
    (re.compile(os.environ["DOMAIN_RE"]), "_REDACTED_DOMAIN_"),
]
# CUSTOM_RE is always present in the environment and empty by default. An
# empty pattern matches at every position, which would insert the placeholder
# between all characters of every line, so the rule exists only when a
# pattern was actually supplied.
custom_pattern = os.environ.get("CUSTOM_RE", "")
if custom_pattern:
    rules.append((re.compile(custom_pattern), "_REDACTED_CUSTOM_"))

# Credential rules are applied only to files whose name ends in "input.conf".
credential_rules = [
    (re.compile(os.environ["USER_RE"]), "_REDACTED_USER_"),
    (re.compile(os.environ["PASS_RE"]), "_REDACTED_PASS_"),
]

TARGET_EXTS = (".conf", ".txt", ".json")


def redact_line(line, active_rules):
    """Return (redacted_line, matched_strings) for a single line.

    Matches are collected against the unmodified line; substitutions are then
    applied rule by rule. Zero-length matches are neither reported nor
    replaced.
    """
    found = []
    for rx, _ in active_rules:
        found.extend(m.group(0) for m in rx.finditer(line) if m.group(0))
    redacted = line
    for rx, repl in active_rules:
        redacted = rx.sub(lambda m, repl=repl: repl if m.group(0) else "", redacted)
    return redacted, found


edited = 0
for root, _, files in os.walk(repo):
    for fn in files:
        if not fn.endswith(TARGET_EXTS):
            continue
        p = os.path.join(root, fn)
        rel = os.path.relpath(p, repo)
        active_rules = rules + credential_rules if fn.endswith("input.conf") else rules

        # surrogateescape round-trips bytes that are not valid UTF-8 and
        # newline="" keeps the original line endings, so a rewrite changes
        # nothing except the redacted text.
        try:
            with open(p, "r", encoding="utf-8", errors="surrogateescape", newline="") as f:
                lines = f.readlines()
        except OSError as exc:
            print(f"[WARN] Cannot read {rel}: {exc}", file=sys.stderr)
            continue
        if skip_re.search("".join(lines)):
            continue

        matches = []
        new_lines = []
        for idx, line in enumerate(lines, 1):
            redacted, found = redact_line(line, active_rules)
            new_lines.append(redacted)
            matches.extend((idx, text) for text in found)
        if not matches:
            continue

        # The log is written in dry-run mode too: it is the record of what
        # would be redacted.
        with open(red_file, "a", encoding="utf-8", errors="backslashreplace") as out:
            for line_num, match in sorted(matches):
                out.write(f"{rel}:{line_num}: {match}\n")

        if new_lines == lines:
            continue
        if dry_run:
            print(f"[DRY RUN] Would redact {rel}: {len(matches)} unique")
        else:
            try:
                with open(p, "w", encoding="utf-8", errors="surrogateescape", newline="") as f:
                    f.write("".join(new_lines))
            except OSError as exc:
                print(f"[WARN] Cannot write {rel}: {exc}", file=sys.stderr)
                continue
            print(f"[REDACTED] {rel}: {len(matches)} unique")
        edited += 1

prefix = "[DRY RUN] " if dry_run else ""
print(f"{prefix}Edited: {edited} files")
PY
  if [[ -s "$RED_STR_FILE" ]]; then
    echo "[*] Redacted strings saved: $RED_STR_FILE"
  fi
}
# ==================================== Repomix Config ====================================
#######################################
# Select the repomix config file: either the one supplied by the user or a
# freshly generated one (generation is skipped in dry-run mode).
# Globals read:    CONFIG_FILE, ABS_CONFIG_ROOT, DRY_RUN, OUTPUT_STYLE,
#                  OUTPUT_FILE
# Globals written: ABS_CONFIG_JSON
# Returns: 0 on success; exits 1 when the supplied config file does not exist
#######################################
write_repomix_config() {
  if [[ -n "$CONFIG_FILE" ]]; then
    # Test the path as given before resolving it, so a missing directory
    # produces the same clear error as a missing file.
    if [[ ! -f "$CONFIG_FILE" ]]; then
      echo "Config file not found: $CONFIG_FILE" >&2
      exit 1
    fi
    ABS_CONFIG_JSON="$(cd "$(dirname "$CONFIG_FILE")" && pwd -P)/$(basename "$CONFIG_FILE")"
    echo "[*] Using provided config: $ABS_CONFIG_JSON"
    return 0
  fi

  ABS_CONFIG_JSON="$ABS_CONFIG_ROOT/repomix.config.json"
  if [[ -n "$DRY_RUN" ]]; then
    echo "[DRY RUN] Would write config with style='$OUTPUT_STYLE' and filePath='$OUTPUT_FILE'"
    return 0
  fi

  # Escape backslashes and double quotes so the file name is a valid JSON
  # string. The value is interpolated directly into the heredoc; a sed
  # placeholder substitution would misbehave on names containing |, & or \.
  local json_file_path=${OUTPUT_FILE//\\/\\\\}
  json_file_path=${json_file_path//\"/\\\"}

  echo "[*] Writing config: $ABS_CONFIG_JSON"
  mkdir -p "$ABS_CONFIG_ROOT"
  cat >"$ABS_CONFIG_JSON" <<EOF
{
  "\$schema": "https://repomix.com/schemas/latest/schema.json",
  "output": {
    "filePath": "$json_file_path",
    "style": "$OUTPUT_STYLE",
    "fileSummary": true,
    "directoryStructure": true,
    "files": true,
    "showLineNumbers": true
  },
  "include": ["**/*"],
  "ignore": {
    "useGitignore": false,
    "useDefaultPatterns": true,
    "customPatterns": [
      "**/*.jks","**/*.p12","**/*.pfx","**/*.crt","**/*.key",
      "**/*.der","**/*.pem","**/*.cer","**/*.so","**/*.dll",
      "**/*.bin","**/*.exe","**/*.dylib","**/*.class","**/*.jar"
    ]
  },
  "security": { "enableSecurityCheck": true }
}
EOF
}
# ==================================== Run Repomix + Ignored Files ====================================
#######################################
# Run repomix on the repository. In dry-run mode only the list of files that
# repomix reports as ignored is collected; otherwise the packed output is
# written.
# Globals read:    ABS_OUTPUT_ROOT, ABS_CONFIG_JSON, ABS_REPO_DIR, MD_OUT,
#                  OUTPUT_STYLE, REPOMIX_ARGS, DRY_RUN
# Globals written: REPOMIX_IGNORED_LOG
# Returns: 0 on success; exits 1 when repomix is not installed
#######################################
run_repomix() {
  if ! command -v repomix >/dev/null; then
    echo "repomix missing" >&2
    exit 1
  fi
  REPOMIX_IGNORED_LOG="$ABS_OUTPUT_ROOT/repomix-ignored.txt"

  # REPOMIX_ARGS is a single space-joined string. Split it on whitespace into
  # an array WITHOUT pathname expansion; an unquoted $REPOMIX_ARGS would
  # expand patterns such as *.log against the current directory before
  # repomix ever saw them. `read -d ''` reports end-of-input as a failure,
  # hence the `|| true`.
  local -a extra_args=()
  if [[ -n "${REPOMIX_ARGS:-}" ]]; then
    read -r -d '' -a extra_args <<<"$REPOMIX_ARGS" || true
  fi

  if [[ -n "$DRY_RUN" ]]; then
    echo "[DRY RUN] Previewing Repomix (no output written) – showing ignored files…"
    echo "[INFO] Ignored file list will be saved to: $REPOMIX_IGNORED_LOG"

    # The generated config is not written in dry-run mode, so the file may
    # not exist yet; pass --config only when it does.
    # NOTE(review): without --config, repomix presumably falls back to its
    # own config discovery/defaults - confirm that is acceptable for a preview.
    local -a config_args=()
    if [[ -f "$ABS_CONFIG_JSON" ]]; then
      config_args=(--config "$ABS_CONFIG_JSON")
    fi

    # grep exits 1 when nothing matches. Under `set -eo pipefail` that would
    # abort the script before the "no files" branch below, so that one status
    # is tolerated; a failing repomix still fails the pipeline.
    repomix "${config_args[@]}" \
      --style "$OUTPUT_STYLE" \
      --verbose \
      "${extra_args[@]}" \
      "$ABS_REPO_DIR" 2>&1 \
      | { grep -E '^[[:space:]]+ignored:' || [[ $? -eq 1 ]]; } \
      | sed -E 's/^[[:space:]]+ignored:[[:space:]]*//' >"$REPOMIX_IGNORED_LOG"

    if [[ -s "$REPOMIX_IGNORED_LOG" ]]; then
      local count
      # Arithmetic expansion strips the padding some `wc` builds emit.
      count=$(($(wc -l <"$REPOMIX_IGNORED_LOG")))
      echo "[DRY RUN] Repomix would ignore $count file(s):"
      head -n 10 "$REPOMIX_IGNORED_LOG"
      if ((count > 10)); then
        echo "… ($((count - 10)) more)"
      fi
    else
      echo "[DRY RUN] Repomix would ignore **no** files"
    fi
  else
    echo "[*] Running Repomix (normal mode)…"
    repomix --config "$ABS_CONFIG_JSON" \
      --output "$MD_OUT" \
      --style "$OUTPUT_STYLE" \
      "${extra_args[@]}" \
      "$ABS_REPO_DIR"
    # Move any repomix-output.* file found in the working directory into the
    # output root as well.
    local f
    for f in repomix-output.*; do
      if [[ -f "$f" ]]; then
        mv -- "$f" "$ABS_OUTPUT_ROOT/"
      fi
    done
  fi
  echo "[INFO] Repomix ignored file list: $REPOMIX_IGNORED_LOG"
}
# ==================================== Convert to TXT/PDF ====================================
#######################################
# Turn the markdown output into a PDF with pandoc. Only the markdown style is
# converted, and nothing is converted in dry-run mode.
# Globals read: DRY_RUN, OUTPUT_STYLE, ABS_MD_OUT
# Outputs: progress and warning messages on stdout
#######################################
convert_outputs() {
  if [[ -n "$DRY_RUN" ]]; then
    echo "[DRY RUN] Would convert to PDF if style=markdown and pandoc available"
    return
  fi

  if [[ "$OUTPUT_STYLE" != "markdown" ]]; then
    echo "[INFO] Skipping PDF conversion (only runs for markdown style)"
    return
  fi

  # Both prerequisites are needed: the markdown file and the pandoc tool.
  local md_present=0
  local pandoc_present=0
  if [[ -f "$ABS_MD_OUT" ]]; then
    md_present=1
  fi
  if command -v pandoc >/dev/null; then
    pandoc_present=1
  fi

  if ((md_present && pandoc_present)); then
    echo "[*] Converting markdown to PDF..."
    # Same path as the markdown file, with the extension swapped for .pdf.
    local pdf_target="${ABS_MD_OUT%.*}.pdf"
    pandoc --from=markdown "$ABS_MD_OUT" -o "$pdf_target"
    echo "[*] PDF created: $pdf_target"
    return
  fi

  # Say which prerequisite(s) were missing.
  if ((!md_present)); then
    echo "[WARNING] Markdown output file missing for PDF conversion"
  fi
  if ((!pandoc_present)); then
    echo "[WARNING] pandoc not found — skipping PDF conversion"
  fi
}
# ==================================== Run Once ====================================
#######################################
# Execute the full pipeline once for the currently selected output style:
# copy, path setup, optional cleanup, redaction, config, repomix, conversion.
# Globals read: DRY_RUN, ABS_REPO_DIR, ABS_OUTPUT_ROOT
# Arguments: $1 - "true" to keep the outputs of an earlier style run (the
#                 cleanup stage is skipped); any other value runs the cleanup
# Outputs: progress messages and, outside dry-run mode, an output listing
#######################################
run_once() {
  local keep_previous="$1"

  copy_repo
  setup_paths

  [[ -z "$DRY_RUN" ]] || echo "[DRY RUN] No files will be modified or written."
  echo "[*] Redaction in: $ABS_REPO_DIR"
  echo "[*] Output directory: $ABS_OUTPUT_ROOT"

  case "$keep_previous" in
    true)
      echo "[INFO] Skipping cleanup for subsequent style run"
      ;;
    *)
      clean_outputs
      ;;
  esac

  # The remaining stages always run, in this fixed order.
  local stage
  for stage in run_redaction write_repomix_config run_repomix convert_outputs; do
    "$stage"
  done

  echo "[*] Outputs:"
  if [[ -n "$DRY_RUN" ]]; then
    echo "[DRY RUN] Outputs would be in: $ABS_OUTPUT_ROOT"
  else
    ls -lh "$ABS_OUTPUT_ROOT"
  fi
}
# ==================================== Main ====================================
#######################################
# Program flow: parse the command line, then run the pipeline either once or
# once per supported output style.
# Globals read:    ALL_STYLES, ABS_OUTPUT_ROOT
# Globals written: OUTPUT_STYLE (in all-styles mode)
# Arguments: the script's command-line arguments
#######################################
main() {
  parse_args "$@"

  if [[ "$ALL_STYLES" -ne 1 ]]; then
    run_once "false"
  else
    local banner="========================================================================"
    # Only the first style run may clean; the following runs keep what the
    # earlier ones produced.
    local skip_clean="false"
    for style in xml markdown json plain; do
      echo "$banner"
      echo "[*] Running with --style $style"
      echo "$banner"
      OUTPUT_STYLE="$style"
      run_once "$skip_clean"
      skip_clean="true"
    done
  fi

  echo "🎉 All done. Outputs available under: $ABS_OUTPUT_ROOT"
}
# Script entry point: hand every command-line argument to main unchanged.
main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment