A simple script that extracts a CSV of all hosts present in a CommonCrawl crawl and the IP(s) that served those responses.
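The pipeline inside the loop below works by pairing header lines from the WARC records: after grep keeps only the WARC-IP-Address and WARC-Target-URI lines, the sed expression joins each IP line with the URI line that follows it into a single "ip,url" row. A minimal sketch of that pairing on a hand-written two-line sample (the IP and URL here are invented for illustration; GNU sed is assumed, as in the script itself):

# Two CRLF-terminated header lines, as they would appear in a WARC record,
# piped through the same sed pairing used by the script below.
printf 'WARC-IP-Address: 192.0.2.1\r\nWARC-Target-URI: https://example.com/robots.txt\r\n' \
  | sed -n 'N;s/WARC-IP-Address: \(.*\)\r\nWARC-Target-URI: \(.*\)/\1,\2/p;'
# prints: 192.0.2.1,https://example.com/robots.txt
# (the captured URL keeps its trailing carriage return, so the CSV rows come out CRLF-terminated)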
#!/bin/bash

# Directory this script lives in (not used further below).
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

CRAWL="CC-MAIN-2025-33"
TMP_DIR=$(mktemp --directory "/tmp/${CRAWL}-XXXX")
OUTPUT_FILE="/tmp/${CRAWL}-hosts-and-ip.csv.zst"
rm -f "$OUTPUT_FILE"

# Fetch the list of robots.txt WARC segment paths for this crawl.
ROBOTSTXT_PATHS="${TMP_DIR}/robotstxt.paths.gz"
wget -c -t 0 --retry-on-http-error=503 --waitretry=1 -O "${ROBOTSTXT_PATHS}" "https://data.commoncrawl.org/crawl-data/${CRAWL}/robotstxt.paths.gz"

for SEGMENT_PATH in $(zcat "${ROBOTSTXT_PATHS}"); do
    echo "Processing ${SEGMENT_PATH} ..."
    curl -s --retry 1000 --retry-all-errors --retry-delay 1 -o "${TMP_DIR}/segment.warc.gz" "https://data.commoncrawl.org/${SEGMENT_PATH}"
    # Extract just the IP and the URL header values, merge each pair into a single
    # "ip,url" line, de-duplicate, and append the result as a zstd frame.
    zcat "${TMP_DIR}/segment.warc.gz" \
        | grep -E '^WARC-IP-Address|^WARC-Target-URI' \
        | sed -n 'N;s/WARC-IP-Address: \(.*\)\r\nWARC-Target-URI: \(.*\)/\1,\2/p;' \
        | sort -u \
        | zstd -c >> "${OUTPUT_FILE}"
done
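Once the loop finishes, the output is a sequence of concatenated zstd frames (one per segment), which standard zstd tools decompress transparently. A quick way to run the script and inspect the result (the script filename here is a placeholder; the output path matches the OUTPUT_FILE variable above):

# Run the script; this downloads every robots.txt segment of the crawl,
# so expect it to take a while and use a fair amount of bandwidth.
./extract-hosts-and-ips.sh

# Peek at a few "ip,url" rows from the compressed CSV.
zstdcat /tmp/CC-MAIN-2025-33-hosts-and-ip.csv.zst | head

# Count distinct (IP, URL) pairs across all segments
# (duplicates can recur across segments, hence the extra sort -u).
zstdcat /tmp/CC-MAIN-2025-33-hosts-and-ip.csv.zst | sort -u | wc -l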