Created
July 14, 2018 21:58
-
-
Save jmb/424e8e113f2a546349ff60d07f4eab3a to your computer and use it in GitHub Desktop.
Script to extract all email addresses from all emails in all folders of an IMAP account.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Create a connection to an IMAP server and find ALL email addresses | |
Original script by abought: https://gist.github.com/abought/15a1e08705b121c1b7bd | |
References: | |
http://www.voidynullness.net/blog/2013/07/25/gmail-email-with-python-via-imap/ | |
and | |
https://yuji.wordpress.com/2011/06/22/python-imaplib-imap-example-with-gmail/ | |
""" | |
__author__ = 'jmb' | |
import email | |
import imaplib | |
import getpass | |
import sys | |
import re | |
import os | |
# EDIT these as required:
FILENAME = 'out.txt'
DEFAULT_MAIL_SERVER = 'imap.server'
# No user parameters below this line
# ADDR_PATTERN = re.compile('<(.*?)>') # Finds email as <user@example.com>
# Find ALL email addresses in a header value. The pattern is deliberately
# NOT anchored with ^...$: header values usually look like
# 'Name <user@example.com>, other@example.org', and an anchored pattern only
# matches when the whole value is a single bare address. The single capturing
# group keeps re.findall() returning plain strings.
ADDR_PATTERN = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+)")
def connect(user, pwd, server=DEFAULT_MAIL_SERVER):
    """Open an SSL IMAP session on `server` and log in as `user`.

    Returns the logged-in connection. When the server rejects the login,
    prints "Failed to login" and exits the process with status 1.
    """
    session = imaplib.IMAP4_SSL(server)
    try:
        session.login(user, pwd)
    except imaplib.IMAP4.error:
        print("Failed to login")
        sys.exit(1)
    else:
        return session
def print_folders(conn):
    """Print one indented line per mailbox folder reported by the server.

    ``conn.list()`` returns a ``(status, lines)`` pair; only ``lines`` holds
    folder entries, so the status element is not printed.
    """
    _status, entries = conn.list()
    for entry in entries or []:
        if entry is None:
            # imaplib reports "no data" as a list containing None.
            continue
        # On Python 3 entries are bytes; on Python 2 bytes is str already.
        if isinstance(entry, bytes) and not isinstance(entry, str):
            entry = entry.decode("utf-8", "replace")
        print("  %s" % (entry,))
def get_folder(conn, folder_name, readonly=True):
    """Select a folder (or label) on the server and return the connection.

    conn: a logged-in imaplib connection.
    folder_name: mailbox name exactly as it must be sent to the server
        (quote it if it contains spaces).
    readonly: open the folder with EXAMINE instead of SELECT (default).
        This script only reads mail; a read-only selection keeps fetches
        from marking messages as seen and keeps the CLOSE issued for the
        previous folder from expunging messages flagged as deleted.

    If the folder cannot be opened, a message and the known folders are
    printed. The connection is returned either way, so callers should check
    ``conn.state == "SELECTED"`` before searching.
    """
    if conn.state == "SELECTED":
        # Explicitly close any previously opened folder; may not be necessary
        conn.close()

    rv, _data = conn.select(folder_name, readonly=readonly)
    if rv != 'OK':
        print("Could not open specified folder. Known labels:")
        print_folders(conn)
    return conn
def get_email_ids(conn, query='ALL'):
    """Return the UIDs of all messages in the selected folder matching `query`.

    conn: an imaplib connection with a folder already selected.
    query: IMAP SEARCH criteria; defaults to every message.

    Returns a list of UID tokens (empty when nothing matches or the search
    fails). Raises imaplib.IMAP4.error when no folder is selected.
    """
    if conn.state != "SELECTED":
        raise imaplib.IMAP4.error("Cannot search without selecting a folder")

    rv, data = conn.uid('search', None, query)
    if rv != 'OK':
        print("Could not fetch email ids")  # for some reason...
        return []
    # imaplib yields [None] when the server sent no untagged SEARCH line,
    # which some servers do for an empty result.
    if not data or data[0] is None:
        return []
    return data[0].split()
def fetch_message(conn, msg_uid):
    """Fetch one message by UID (not sequence number) and return it parsed.

    conn: an imaplib connection with the message's folder selected.
    msg_uid: UID of the message, as returned by a UID SEARCH.

    Returns an email message object. When the fetch fails or the server
    returns no data for the UID, an error is printed and an empty message
    (no headers) is returned, so callers can treat the result uniformly.
    """
    # BODY.PEEK[] returns the same full message as RFC822 but does not set
    # the \Seen flag, so scanning a mailbox does not mark mail as read.
    # TODO: Could we fetch just the headers to save bandwidth?
    rv, data = conn.uid('fetch', msg_uid, "(BODY.PEEK[])")
    # A successful fetch yields [(envelope, payload), b')']; a missing UID
    # yields [None].
    if rv != 'OK' or not data or not isinstance(data[0], tuple):
        print("ERROR fetching message # %s" % (msg_uid,))
        return email.message_from_string("")

    raw = data[0][1]
    # Python 3 imaplib returns bytes; on Python 2 bytes is str.
    if isinstance(raw, bytes) and not isinstance(raw, str):
        return email.message_from_bytes(raw)
    return email.message_from_string(raw)
def get_recipients(msg_parsed):
    """Collect ADDR_PATTERN matches from the From, To, Cc and Bcc headers.

    Each header is read with ``msg_parsed.get`` (empty string when the
    header is absent); the matches are returned as one flat list in header
    order.
    """
    return [
        match
        for header in ('From', 'To', 'Cc', 'Bcc')
        for match in re.findall(ADDR_PATTERN, msg_parsed.get(header, ""))
    ]
if __name__ == "__main__":
    # Walk every folder of the account and append the addresses found in
    # each one to FILENAME.

    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        ask = raw_input
    except NameError:
        ask = input
    username = ask("Full email address: ")
    password = getpass.getpass()

    # Connect
    mail_conn = connect(username, password)
    try:
        already_written = set()  # addresses written during this run
        # The with-block closes the output file even if a folder fails.
        with open(FILENAME, "a") as out_file:
            # Go through each folder
            for entry in mail_conn.list()[1]:
                # Python 3 imaplib yields bytes; on Python 2 bytes is str.
                if isinstance(entry, bytes) and not isinstance(entry, str):
                    entry = entry.decode()
                if not isinstance(entry, str):
                    continue  # literal-encoded names arrive as tuples; skip
                # LIST lines look like: (flags) "delimiter" name
                # Splitting on whitespace breaks names that contain spaces,
                # so take everything after the delimiter as the name and
                # keep the server's quoting for the SELECT command.
                parsed = re.match(r'\(.*?\) (?:"(?:[^"\\]|\\.)*"|NIL) (.+)$', entry)
                if parsed is None:
                    continue
                folder = parsed.group(1)
                if folder.strip('"') == ".":
                    continue

                mail_conn = get_folder(mail_conn, folder)
                if mail_conn.state != "SELECTED":
                    continue  # folder could not be opened; nothing to scan
                msg_uid_list = get_email_ids(mail_conn)
                print("Scanning folder:  %s  with  %d  messages"
                      % (folder, len(msg_uid_list)))

                # Fetch the addresses of every message in this folder
                found = set()
                for msg_uid in msg_uid_list:
                    found.update(get_recipients(fetch_message(mail_conn, msg_uid)))

                # Only write addresses not already written for an earlier
                # folder, so the file has no duplicates within a run.
                new_addresses = sorted(found - already_written)
                already_written.update(new_addresses)
                print("Writing %d email addresses to file %s"
                      % (len(new_addresses), out_file.name))
                out_file.write("".join(addr + "\n" for addr in new_addresses))
                # Push each folder's results to disk so a later failure
                # does not lose them.
                out_file.flush()
                os.fsync(out_file.fileno())
        print("\nWritten to file: " + FILENAME)
    finally:
        try:
            # CLOSE is only legal while a folder is selected.
            if mail_conn.state == "SELECTED":
                mail_conn.close()
        finally:
            mail_conn.logout()
Woohoo! Got it working.
For anyone else who might be looking for this:
"""Create a connection to an IMAP server and find ALL email addresses
Original script by abought: https://gist.github.com/abought/15a1e08705b121c1b7bd
References:
http://www.voidynullness.net/blog/2013/07/25/gmail-email-with-python-via-imap/
and
https://yuji.wordpress.com/2011/06/22/python-imaplib-imap-example-with-gmail/
"""
__author__ = 'jmb'
import email
import imaplib
import getpass
import sys
import re
import os
# EDIT these as required:
FILENAME = 'out.txt'
DEFAULT_MAIL_SERVER = 'imap.server'
# No user parameters below this line
# ADDR_PATTERN = re.compile('<(.*?)>') # Finds email as <user@example.com>
# Find ALL email addresses in a header value. The pattern is deliberately
# NOT anchored with ^...$: header values usually look like
# 'Name <user@example.com>, other@example.org', and an anchored pattern only
# matches when the whole value is a single bare address. The single capturing
# group keeps re.findall() returning plain strings.
ADDR_PATTERN = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+)")
def connect(user, pwd, server=DEFAULT_MAIL_SERVER):
    """Open an SSL IMAP session on `server` and log in as `user`.

    Returns the logged-in connection. When the server rejects the login,
    prints "Failed to login" and exits the process with status 1.
    """
    session = imaplib.IMAP4_SSL(server)
    try:
        session.login(user, pwd)
    except imaplib.IMAP4.error:
        print("Failed to login")
        sys.exit(1)
    else:
        return session
def print_folders(conn):
    """Print one indented line per mailbox folder reported by the server.

    ``conn.list()`` returns a ``(status, lines)`` pair; only ``lines`` holds
    folder entries, so the status element is not printed.
    """
    _status, entries = conn.list()
    for entry in entries or []:
        if entry is None:
            # imaplib reports "no data" as a list containing None.
            continue
        if isinstance(entry, bytes):
            # Decode so the output is readable text rather than a b'...' repr.
            entry = entry.decode("utf-8", "replace")
        print(" ", entry)
def get_folder(conn, folder_name, readonly=True):
    """Select a folder (or label) on the server and return the connection.

    conn: a logged-in imaplib connection.
    folder_name: mailbox name exactly as it must be sent to the server
        (quote it if it contains spaces).
    readonly: open the folder with EXAMINE instead of SELECT (default).
        This script only reads mail; a read-only selection keeps fetches
        from marking messages as seen and keeps the CLOSE issued for the
        previous folder from expunging messages flagged as deleted.

    If the folder cannot be opened, a message and the known folders are
    printed. The connection is returned either way, so callers should check
    ``conn.state == "SELECTED"`` before searching.
    """
    if conn.state == "SELECTED":
        # Explicitly close any previously opened folder; may not be necessary
        conn.close()

    rv, _data = conn.select(folder_name, readonly=readonly)
    if rv != 'OK':
        print("Could not open specified folder. Known labels:")
        print_folders(conn)
    return conn
def get_email_ids(conn, query='ALL'):
    """Return the UIDs of all messages in the selected folder matching `query`.

    conn: an imaplib connection with a folder already selected.
    query: IMAP SEARCH criteria; defaults to every message.

    Returns a list of UID tokens (empty when nothing matches or the search
    fails). Raises imaplib.IMAP4.error when no folder is selected.
    """
    if conn.state != "SELECTED":
        raise imaplib.IMAP4.error("Cannot search without selecting a folder")

    rv, data = conn.uid('search', None, query)
    if rv != 'OK':
        print("Could not fetch email ids")  # for some reason...
        return []
    # imaplib yields [None] when the server sent no untagged SEARCH line,
    # which some servers do for an empty result.
    if not data or data[0] is None:
        return []
    return data[0].split()
def fetch_message(conn, msg_uid):
    """Fetch one message by UID (not sequence number) and return it parsed.

    conn: an imaplib connection with the message's folder selected.
    msg_uid: UID of the message, as returned by a UID SEARCH.

    Returns an email.message.Message. When the fetch fails or the server
    returns no data for the UID, an error is printed and an empty Message
    (no headers) is returned, so callers can always use Message methods
    such as ``get_all`` on the result.
    """
    # BODY.PEEK[] returns the same full message as RFC822 but does not set
    # the \Seen flag, so scanning a mailbox does not mark mail as read.
    # TODO: Could we fetch just the headers to save bandwidth?
    rv, data = conn.uid('fetch', msg_uid, "(BODY.PEEK[])")
    # A successful fetch yields [(envelope, payload), b')']; a missing UID
    # yields [None].
    if rv != 'OK' or not data or not isinstance(data[0], tuple):
        print("ERROR fetching message #", msg_uid)
        return email.message_from_bytes(b"")
    return email.message_from_bytes(data[0][1])
from email.utils import getaddresses


def get_recipients(msg_parsed):
    """Return the lower-cased addresses in a message's From/To/Cc/Bcc headers.

    msg_parsed: a parsed email message. A plain dict (for example the empty
        dict used to signal a failed fetch) is also accepted and is read
        through ``.get`` because it has no ``get_all``.

    Returns a list of address strings in header order; duplicates are kept.
    """
    # Message objects expose get_all(), which returns every occurrence of a
    # repeated header; dicts do not.
    get_all = getattr(msg_parsed, "get_all", None)

    recipients = []
    for field in ('From', 'To', 'Cc', 'Bcc'):
        if get_all is not None:
            values = get_all(field, [])
        else:
            single = msg_parsed.get(field)
            values = [single] if single else []
        # getaddresses() splits "Name <addr>, addr2" lists into (name, addr)
        # pairs; an empty addr means that entry could not be parsed.
        recipients.extend(
            addr.lower() for _name, addr in getaddresses(values) if addr
        )
    return recipients
#def get_recipients(msg_parsed):
# """Given a parsed message, extract and return recipient list"""
# recipients = []
# addr_fields = ['From', 'To', 'Cc', 'Bcc']
#
# for f in addr_fields:
# rfield = msg_parsed.get(f, "") # Empty string if field not present
# rlist = re.findall(ADDR_PATTERN, rfield)
# recipients.extend(rlist)
#
# return recipients
# One line of an IMAP LIST response: (flags) "delimiter"|NIL mailbox-name
_LIST_LINE = re.compile(
    r'\((?P<flags>[^)]*)\) (?P<delim>"(?:[^"\\]|\\.)*"|NIL) (?P<name>.+)$'
)


def parse_folder_name(line):
    """Extract the mailbox name from one line of an IMAP LIST response.

    line: a LIST response line as bytes (or str), e.g.
        b'(\\HasNoChildren) "/" "Sent Items"'.

    The hierarchy delimiter is whatever the server reports ("." , "/" or
    NIL), so it is matched generically instead of assuming ".".

    Returns the name exactly as the server sent it (still quoted if it was
    quoted, so it can be passed straight to SELECT), or None with a warning
    printed when the line cannot be parsed.
    """
    text = line.decode() if isinstance(line, bytes) else line
    # Names sent as IMAP literals arrive as tuples, not as a single line.
    parsed = _LIST_LINE.match(text) if isinstance(text, str) else None
    if parsed is None:
        print(f"Warning: Could not parse folder name from: {line}")
        return None
    return parsed.group("name")
if __name__ == "__main__":
    # Walk every folder of the account and append the addresses found in
    # each one to FILENAME.
    username = input("Full email address: ")
    password = getpass.getpass()

    # Connect
    mail_conn = connect(username, password)
    try:
        already_written = set()  # addresses written during this run
        # The with-block closes the output file even if a folder fails.
        with open(FILENAME, "a") as out_file:
            # Go through each folder
            for entry in mail_conn.list()[1]:
                folder = parse_folder_name(entry)
                if not folder or folder == ".":
                    continue

                mail_conn = get_folder(mail_conn, folder)
                if mail_conn.state != "SELECTED":
                    continue  # Skip folder if selection failed
                msg_uid_list = get_email_ids(mail_conn)
                print("Scanning folder: ", folder, " with ", len(msg_uid_list), " messages")

                # Fetch the addresses of every message in this folder
                found = set()
                for msg_uid in msg_uid_list:
                    found.update(get_recipients(fetch_message(mail_conn, msg_uid)))

                # Only write addresses not already written for an earlier
                # folder, so the file has no duplicates within a run.
                new_addresses = sorted(found - already_written)
                already_written.update(new_addresses)
                print("Writing", len(new_addresses), "email addresses to file", out_file.name)
                out_file.write("".join(addr + "\n" for addr in new_addresses))
                # Push each folder's results to disk so a later failure
                # does not lose them.
                out_file.flush()
                os.fsync(out_file.fileno())
        print("\nWritten to file: " + FILENAME)
    finally:
        try:
            # CLOSE is only legal while a folder is selected; calling it
            # after a failed select would raise and mask the real outcome.
            if mail_conn.state == "SELECTED":
                mail_conn.close()
        finally:
            mail_conn.logout()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, thank you for the script! I've been searching for a while and this is exactly what I'm looking for.
I tried to convert this to python 3 but I ran into a problem.
I can connect to my IMAP server just fine, but then I get an error on
folder = f.split()[2].strip('"')
it says:
TypeError: a bytes-like object is required, not 'str'