Created
July 24, 2015 17:32
-
-
Save romain-dartigues/376c0810d7b91be1728d to your computer and use it in GitHub Desktop.
extract email fields from mbox to CSV using awk
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/awk -f
# usage: mbox2csv INBOX > result.csv
# tested with gawk and mawk
# attempt to generate a Microsoft Excel compatible CSV
# Quote one CSV cell: every embedded double quote is doubled and the
# whole value is wrapped in double quotes.
# `cell` is a scalar, so awk passes it by value and the caller's copy is
# left untouched.
function escape_cell(cell) {
    gsub(/"/, "\"\"", cell)
    cell = "\"" cell
    return cell "\""
}
# awk does not have a "join" builtin, and we want the column order to be
# driven by a reference array.
#
# Build one CSV record from `array`.
#   array     - associative array holding the cell values
#   reference - array indexed 1..N whose values are the keys of `array`,
#               in output order
#   sep       - cell separator; defaults to ";" when omitted or empty
# Returns the escaped cells joined by `sep`, or an empty string when
# `reference` has no elements. Keys missing from `array` yield an empty
# cell and are NOT created in `array`.
#
# The parameters after the extra spaces are awk's way of declaring local
# variables; callers must not pass them.
function join(array, reference, sep,    count, idx, name, result) {
    if (!sep) {
        sep = ";"
    }
    # length(array) is not POSIX, so count the elements by hand.
    count = 0
    for (name in reference) {
        count++
    }
    result = ""
    for (idx = 1; idx <= count; ++idx) {
        name = reference[idx]
        # Test membership first: merely reading array[name] would create
        # the element in the caller's array.
        result = result (idx > 1 ? sep : "") escape_cell((name in array) ? array[name] : "")
    }
    return result
}
# One-time setup of the global state shared by the rules below.
BEGIN {
    # We start outside of any message header section.
    in_headers = 0
    # Create `row` as an empty array (splitting an empty string yields
    # zero elements).
    split("", row, ":")
    # Wanted header names, lower-cased because MIME header names are case
    # insensitive. The "|"-separated string is also used as a regex
    # alternation when matching header lines.
    headers = tolower("From|Date|To|Message-ID|Subject|Received")
    # headers_array[1..N] fixes the column order of the output.
    split(headers, headers_array, "|")
}
# Print the row collected for the current message, if at least one wanted
# header was captured, then reset the per-message state.
function flush_row() {
    if (length(row) > 0) {
        print join(row, headers_array)
    }
    # reinitialize the array
    split("", row, ":")
    # forget the last header name so that a stray continuation line at the
    # start of the next message cannot be appended to it
    key = ""
}

# Every time we match an mbox message separator: a line starting with
# "From " at column 0. Matching on $1 would also accept indented lines
# such as "  From what I understand", because default field splitting
# ignores leading blanks.
/^From / {
    # emit the previous message, if any
    flush_row()
    # note we are now in a message header
    in_headers = 1
    next
}

# The last message is not followed by a separator line, so it has to be
# emitted once the input is exhausted.
END {
    flush_row()
}
# While inside a message header section, collect the wanted header fields
# into row[], keyed by the lower-cased field name. Each stored value is
# the full header line, field name included. A field that occurs several
# times overwrites the earlier value, so the last occurrence wins.
in_headers {
    # An empty line indicates the end of the headers. Compare against ""
    # explicitly: a bare `if ($0)` treats a line that looks like the
    # number zero as false.
    if ($0 == "") {
        in_headers = 0
        next
    }
    # RFC2822 2.2.3. Long Header Fields: a line starting with a space or
    # a tab continues the previous field. It is appended only when that
    # field is one we keep, and it is never parsed as a new header, so a
    # folded line that happens to look like "Subject: ..." is not captured.
    # [ \t] is used instead of [[:space:]] because older mawk builds lack
    # POSIX character classes.
    if ($0 ~ /^[ \t]/) {
        if (key ~ "^(" headers ")$") {
            row[key] = row[key] $0
        }
        next
    }
    # A new header field: its name is everything before the first colon,
    # so "Subject:foo" (no blank after the colon) is recognised as well.
    colon_pos = index($0, ":")
    key = colon_pos ? tolower(substr($0, 1, colon_pos - 1)) : ""
    sub(/[ \t]+$/, "", key)
    # the keys you want to store
    if (key ~ "^(" headers ")$") {
        row[key] = $0
    }
    next
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment