-
-
Save beamzer/8a1e9629c203eaa9eb8d2fb4725b053a to your computer and use it in GitHub Desktop.
| #!/usr/bin/env python3.7 | |
| """ | |
| source: https://gist.github.com/urschrei/5258588 by Stephan Hügel | |
| 2020 update: | |
| - More iterators, fewer lists | |
| - Python 3 compatible | |
| - Processes files in parallel | |
| (one thread per CPU, but that's not really how it works) | |
| Ewald ( https://gist.github.com/beamzer/8a1e9629c203eaa9eb8d2fb4725b053a ) | |
| 2020-09-20: | |
| - handling of same filenames (write everything because contents might be different) | |
| - handling of filenames with * | |
| - handling of mkdir errors | |
| - added arguments | |
| 2020-09-23 (v1.2) | |
| - version_nr before extension | |
| - error handling for utf-8 chars in eml (on error continue) | |
| 2020-10-02 (v1.3) | |
| - now correctly handles RFC2047 MIME encoded filenames | |
| 2020-10-06 (v1.4) | |
| - now handles multi-line filenames | |
| - fixed handling of emails with no attachments | |
| """ | |
| import glob | |
| import os | |
| import email | |
| import argparse | |
| from multiprocessing import Pool | |
| from cs.rfc2047 import unrfc2047 | |
| EXTENSION = "eml" | |
| parser = argparse.ArgumentParser(description='extract attachments from eml files') | |
| parser.add_argument( | |
| '-d','--debug', | |
| action='store_true', | |
| help='print debug messages to stderr' | |
| ) | |
| parser.add_argument( | |
| '-s','--single', | |
| action='store_true', | |
| help='run as single thread (default = multithreaded, one thread per core)' | |
| ) | |
| parser.add_argument( | |
| '-q','--quiet', | |
| action='store_true', | |
| help='no output' | |
| ) | |
| args = parser.parse_args() | |
| debug = args.debug | |
| single = args.single | |
| quiet = args.quiet | |
| debug and print("debug output is active") | |
| # ensure that an output dir exists | |
| od = "attachments" | |
| # the exist_ok=True avoids error messages due to us being multithreaded and race-conditions | |
| # that should be no problem since we moved this out of the repetitive extract function | |
| os.path.exists(od) or os.makedirs(od,exist_ok=True) | |
| def extract(filename): | |
| """ | |
| Try to extract the attachments from filename | |
| """ | |
| debug and print("=> reading %s" % filename) | |
| output_count = 0 | |
| try: | |
| with open(filename, "r") as f: | |
| try: | |
| msg = email.message_from_file(f) | |
| nratt = len(msg.get_payload()) | |
| # this will be 4000something if no attachments are present | |
| if (nratt > 1 and nratt < 20): | |
| for attachment in msg.get_payload()[1:]: | |
| of = attachment.get_filename() | |
| debug and print("attachment name: %s" % of) | |
| # handle multi-line strings, and other problematic characters | |
| of = of.replace("\n", "") | |
| of = of.replace("\t", "_") | |
| of = of.replace("\*", "#") | |
| # this is to handle RFC2047 MIME encoded filenames (often used for obfuscation) | |
| try: | |
| output_filename = unrfc2047(of) | |
| if ( of != output_filename): | |
| debug and print("decoded attachment name: %s" % output_filename) | |
| except Exception as inst: | |
| print(type(inst)) # the exception instance | |
| print(inst.args) # arguments stored in .args | |
| print(inst) # __str__ allows args to be printed directly | |
| # If no attachments are found, skip this file | |
| if output_filename: | |
| # check if this filename already exists | |
| fn = od + "/" + output_filename | |
| debug and print("checking existence of %s" % fn) | |
| expand = 0 | |
| if os.path.isfile(fn): | |
| while True: | |
| expand += 1 | |
| # add the increment before the filename extension | |
| fn_name, fn_ext = os.path.splitext(output_filename) | |
| new_filename = fn_name + "_" + str(expand) + fn_ext | |
| fn = od + "/" + new_filename | |
| if os.path.isfile(fn): | |
| continue | |
| else: | |
| output_filename = new_filename | |
| break | |
| not(quiet) and print("Writing %s " % output_filename) | |
| with open(os.path.join(od, output_filename), "wb") as of: | |
| of.write(attachment.get_payload(decode=True)) | |
| output_count += 1 | |
| if output_count == 0: | |
| not(quiet) and print("No attachment found for file %s!" % f.name) | |
| except Exception: | |
| print('Fail: %s\n' % f) | |
| # this should catch read and write errors | |
| except IOError: | |
| not(quiet) and print("Problem with %s or one of its attachments!" % f.name) | |
| return 1, output_count | |
| if __name__ == "__main__": | |
| if not(single): | |
| debug and print("running multithreaded") | |
| # let's do this in parallel, using cpu count as number of threads | |
| pool = Pool(None) | |
| res = pool.map(extract, glob.iglob("*.%s" % EXTENSION)) | |
| # need these if we use _async | |
| pool.close() | |
| pool.join() | |
| # 2-element list holding number of files, number of attachments | |
| numfiles = [sum(i) for i in zip(*res)] | |
| not(quiet) and print("Done: Processed {} files with {} attachments.".format(*numfiles)) | |
| else: | |
| filecnt = 0 | |
| cnt = 0 | |
| debug and print("running single threaded") | |
| for file in glob.glob("*.%s" % EXTENSION): | |
| filecnt += 1 | |
| cnt += extract(file)[1] | |
| not(quiet) and print("Done: Processed %s files with %s attachments." % (filecnt, cnt)) | |
This is a very well written script but I'm having issue's. Would you be able to point me in the right direction?
I'm currently testing with a folder that has 100 or so .eml files and each file has a .msg file attached within it.
I am getting the following error's for every email inside the folder as it iterates through.
Fail: <_io.TextIOWrapper name='00000463.eml' mode='r' encoding='cp1257'>
Fail: <_io.TextIOWrapper name='00000464.eml' mode='r' encoding='cp1257'>
Done: Processed 100 files with 0 attachments.
Any assistance would be amazing!
Regards,
-Julian
Hi @beamzer thanks for this wonderful code
Hi @julianjamespy ,
I had the same problem with .eml embedded in .eml.
For some shady reason, the parser is unable to process adequately .eml attachments.
But at least you can open it as a string or bytes, and you notice that there is kind of weird header like:
"Content-Type: message/rfc822
Content-Disposition: attachment;
creation-date="Fri, 03 Sep 2021 07:45:44 GMT";
modification-date="Fri, 03 Sep 2021 07:45:44 GMT""
So I tried to remove it, by simply removing the first 162 characters of attachments when the attachment is detected with NoneType:
attachment.as_bytes()[162:]
Also, I added a recursive call to extract, to extract the attachments in .eml embedded in .eml
Please find below the reviewed function (working on my side):
def extract(filename):
"""
Try to extract the attachments from all files in cwd
"""
# ensure that an output dir exists
od = "output"
os.path.exists(od) or os.makedirs(od)
output_count = 0
try:
with open(filename, "r") as f:
msg = email.message_from_file(f, policy=policy.default)
for attachment in msg.iter_attachments():
try:
output_filename = attachment.get_filename()
except AttributeError:
print("Got string instead of filename for %s. Skipping." % f.name)
continue
# If no attachments are found, skip this file
if output_filename:
with open(os.path.join(od, output_filename), "wb") as of:
try:
of.write(attachment.get_payload(decode=True))
output_count += 1
except TypeError:
print("Couldn't get payload for %s" % output_filename)
#for EML embedded in EML
else:
output_filename = filename.split('.')[:-1][0].split('\\')[-1]+'_void_'+str(output_count)+'.eml'
with open(os.path.join(od, output_filename), "wb") as of:
try:
of.write(attachment.as_bytes()[162:])
output_count += 1
extract(os.path.join(od, output_filename))
except TypeError:
print("Couldn't get payload for %s" % output_filename)
if output_count == 0:
print("No attachment found for file %s!" % f.name)
# this should catch read and write errors
except IOError:
print("Problem with %s or one of its attachments!" % f.name)
return 1, output_count
Hi. Its quite useful code. Just small issue I have is, when there is an attachment like "Indoor Unit 03 Günlük Rapor_3_20210718_115911.csv" inside eml file, this code extracts it as "Indoor Unit 03 Günlük Rapor_3_20210718_115911.cs v". There is space between s and v at the extension. And when i print the of variable, I get the output as "=?iso-8859-3?Q?Indoor_Unit_03_G=FCnl=FCk_Rapor=5F3=5F20210718=5F115911.cs?=
=?iso-8859-3?Q?v?=". Can you please help with this?
Thanks.