A Python script that parses the Messenger data in your downloaded Facebook data export and generates searchable HTML files from it
import json
import re
import shutil
from pathlib import Path
from datetime import datetime
from argparse import ArgumentParser
from html import escape

# Your name exactly as it appears on Facebook; used to tell your own
# messages apart from everyone else's when styling the chat log.
MY_NAME = "YOUR NAME AS IT IS ON FB HERE"

# Media subfolders that may sit inside each conversation directory,
# mapped to the file extensions treated as that media type.
MEDIA_TYPES = {
    "audio": [".mp3", ".m4a", ".aac", ".wav", ".ogg", ".opus", ".flac", ".wma", ".aiff"],
    "photos": [".jpg", ".jpeg", ".png", ".webp", ".heic", ".heif", ".tiff", ".bmp"],
    "gifs": [".gif"],
    "videos": [".mp4", ".mov", ".mkv", ".avi", ".wmv", ".flv", ".webm", ".m4v", ".3gp", ".mpeg", ".mpg", ".rmvb", ".vob", ".ts"],
    "files": [".docx", ".pdf", ".txt", ".zip", ".pptx", ".xlsx", ".csv", ".rar", ".7z", ".tar", ".gz", ".bz2"],
}

def readable_timestamp(ts_ms):
    # Facebook exports timestamps in milliseconds since the Unix epoch.
    return datetime.fromtimestamp(ts_ms / 1000).strftime("%m/%d/%Y %I:%M %p")


def linkify(text):
    # Wrap bare http(s) URLs in anchor tags so they are clickable in the page.
    def replacer(match):
        url = match.group(0)
        return f'<a href="{url}" target="_blank" rel="noopener noreferrer">{url}</a>'

    url_pattern = re.compile(r"(https?://[^\s]+)")
    return url_pattern.sub(replacer, text)

def parse_messages(json_files):
    all_messages = []
    participants = set()
    for file in sorted(json_files):
        try:
            with open(file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except UnicodeDecodeError:
            with open(file, "r", encoding="utf-8", errors="replace") as f:
                data = json.load(f)
        participants.update(
            p["name"] for p in data.get("participants", []) if p.get("name")
        )
        for msg in data.get("messages", []):
            if "content" in msg:
                all_messages.append(
                    {
                        "timestamp": msg["timestamp_ms"],
                        "sender": msg["sender_name"],
                        "content": msg["content"],
                    }
                )
    all_messages.sort(key=lambda m: m["timestamp"])
    output_html = []
    for msg in all_messages:
        ts = readable_timestamp(msg["timestamp"])
        css_class = "from-me" if msg["sender"] == MY_NAME else "from-them"
        # Escape quotes in the data-* attributes so an apostrophe in a message
        # cannot break out of the single-quoted HTML attributes.
        sender_attr = escape(msg["sender"].lower(), quote=True)
        content_attr = escape(msg["content"].lower(), quote=True)
        content_html = linkify(escape(msg["content"], quote=False))
        output_html.append(
            f"<p class='message {css_class}' data-sender='{sender_attr}' "
            f"data-content='{content_attr}'><strong>[{ts}]</strong> "
            f"<span class='sender'>{escape(msg['sender'])}</span>: "
            f"<span class='content'>{content_html}</span></p>"
        )
    return "\n".join(output_html), participants

def generate_media_previews(folder, rel_url):
    # Inline <img>/<video>/<audio> previews for one conversation's media folder.
    if not folder.exists():
        return ""
    previews = ""
    for file in sorted(folder.iterdir()):
        ext = file.suffix.lower()
        url = f"{rel_url}/{file.name}"
        if ext in MEDIA_TYPES["photos"] + MEDIA_TYPES["gifs"]:
            previews += (
                f"<div class='media-box'><img src='{url}' class='img-fluid'></div>"
            )
        elif ext in MEDIA_TYPES["videos"]:
            # No explicit MIME type: the folder can hold more than MP4,
            # so let the browser work out the container format.
            previews += f"<div class='media-box'><video controls width='100%'><source src='{url}'></video></div>"
        elif ext in MEDIA_TYPES["audio"]:
            previews += f"<div class='media-box'><audio controls><source src='{url}'></audio></div>"
    return previews

def generate_media_table(folder, rel_url):
    if not folder.exists():
        return ""
    rows = ""
    for file in sorted(folder.iterdir()):
        if file.is_file():
            ext = file.suffix.replace(".", "").upper()
            rows += f"<tr><td>{escape(file.name)}</td><td>{ext}</td><td><a href='{rel_url}/{file.name}' target='_blank'>View</a></td></tr>"
    if rows:
        return f"<h3>{folder.name.title()} Files</h3><table class='table table-sm table-dark table-striped'><thead><tr><th>Filename</th><th>Type</th><th>View/Download</th></tr></thead><tbody>{rows}</tbody></table>"
    return ""

def copy_media_dirs(convo_dir, output_convo_dir):
    # Copy media subfolders next to the generated HTML so relative links resolve.
    for media_folder in MEDIA_TYPES:
        src = convo_dir / media_folder
        dst = output_convo_dir / media_folder
        if src.exists() and src.is_dir():
            shutil.copytree(src, dst, dirs_exist_ok=True)

def generate_html_page(
    title, messages_html, media_tables, media_previews, participants
):
    participants_html = (
        "<p id='participants'><strong>Participants:</strong> "
        + ", ".join(escape(p) for p in sorted(participants))
        + "</p>"
        if participants
        else ""
    )
    return f"""
<!DOCTYPE html>
<html lang='en'>
<head>
<meta charset='UTF-8'>
<title>{escape(title)}</title>
<link href='https://cdn.jsdelivr.net/npm/bootstrap@latest/dist/css/bootstrap.min.css' rel='stylesheet'>
<link href='https://cdn.jsdelivr.net/npm/@fortawesome/fontawesome-free@latest/css/all.min.css' rel='stylesheet'>
<link href='https://cdn.jsdelivr.net/npm/sweetalert2@latest/dist/sweetalert2.min.css' rel='stylesheet'>
<link href='https://cdn.jsdelivr.net/npm/tippy.js@latest/dist/tippy.css' rel='stylesheet'>
<!-- Optional local overrides; this file is not generated by the script. -->
<link href='assets/css/all.css' rel='stylesheet'>
<style>
body {{ background: #222222; color: #F5F5F5; font-family: monospace; padding: 20px; }}
.media-box {{ margin: 10px 0; }}
.chat-log p {{ margin-bottom: 5px; }}
#search {{ width: 100%; margin-bottom: 20px; padding: 10px; border-radius: 5px; border: none; }}
.from-me {{ color: #0fd5db; }}
.from-them {{ color: #9bff7d; }}
p {{ margin-top: 5px; margin-bottom: 5px; }}
#participants {{ font-size: 32px; text-align: center; margin-bottom: 20px; }}
</style>
</head>
<body>
<h1 class='text-warning text-center'>
{escape(title)}
</h1>
{participants_html}
<input id='search' type='text' placeholder='Search by sender or message...' />
<div class='chat-log'>
{messages_html}
</div>
<hr/>
{media_previews}
{media_tables}
<script>
// Live search: hide messages whose sender and content both miss the query.
document.getElementById('search').addEventListener('input', function(e) {{
    const value = e.target.value.toLowerCase();
    document.querySelectorAll('.message').forEach(msg => {{
        const sender = msg.dataset.sender || '';
        const content = msg.dataset.content || '';
        msg.style.display = (sender.includes(value) || content.includes(value)) ? 'block' : 'none';
    }});
}});
</script>
<script src='https://cdn.jsdelivr.net/npm/bootstrap@latest/dist/js/bootstrap.bundle.min.js'></script>
<script src='https://cdn.jsdelivr.net/npm/sweetalert2@latest/dist/sweetalert2.all.min.js'></script>
<script src='https://cdn.jsdelivr.net/npm/tippy.js@latest/dist/tippy-bundle.umd.min.js'></script>
<script src='https://cdn.jsdelivr.net/npm/clipboard@latest/dist/clipboard.min.js'></script>
<script src='https://cdn.jsdelivr.net/npm/clipboard-polyfill@latest/dist/clipboard-polyfill.min.js'></script>
<!-- Optional local overrides; this file is not generated by the script. -->
<script src='assets/js/all.js'></script>
</body>
</html>
"""

def build_convo_html(convo_dir, output_dir, rel_output_path):
    json_files = list(convo_dir.glob("*.json"))
    if not json_files:
        return None
    messages_html, participants = parse_messages(json_files)
    media_html = ""
    tables_html = ""
    for folder_name in MEDIA_TYPES:
        media_path = convo_dir / folder_name
        rel_url = f"./{folder_name}" if media_path.exists() else ""
        media_html += generate_media_previews(media_path, rel_url)
        tables_html += generate_media_table(media_path, rel_url)
    output_path = output_dir / rel_output_path / "index.html"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    copy_media_dirs(convo_dir, output_path.parent)
    # Use the thread title from the first JSON file; fall back to the folder name.
    try:
        with open(json_files[0], "r", encoding="utf-8") as f:
            title = json.load(f).get("title", convo_dir.name)
    except (OSError, ValueError):
        title = convo_dir.name
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(
            generate_html_page(
                title, messages_html, tables_html, media_html, participants
            )
        )
    return str(rel_output_path).replace("\\", "/")

def build_index(all_convos, output_dir):
    sections = ""
    for section, convos in all_convos.items():
        links = "\n".join(
            f"<li class='list-group-item bg-dark'><a class='text-warning' href='{c}/index.html'>{c}</a></li>"
            for c in sorted(convos)
        )
        sections += f"<h2 class='text-warning'>{section.title()}</h2><ul class='list-group mb-4'>{links}</ul>"
    html = f"""<!DOCTYPE html>
<html lang='en'>
<head>
<meta charset='UTF-8'>
<title>Messenger Archive</title>
<link href='https://cdn.jsdelivr.net/npm/bootstrap@latest/dist/css/bootstrap.min.css' rel='stylesheet'>
</head>
<body class='bg-dark text-light'>
<div class='container mt-5'>
<h1 class='text-warning text-center mb-5'>
Facebook Messenger Archive
</h1>
{sections}
</div>
</body>
</html>"""
    with open(output_dir / "index.html", "w", encoding="utf-8") as f:
        f.write(html)

def process_all(root_input, output_root):
    # Conversation categories found in a Facebook messages export.
    categories = [
        "inbox",
        "archived_threads",
        "e2ee_cutover",
        "stickers_used",
        "support_files",
    ]
    all_convos = {}
    for category in categories:
        input_path = Path(root_input) / category
        if not input_path.exists():
            continue
        for convo_dir in input_path.iterdir():
            if convo_dir.is_dir():
                rel_path = Path(category) / convo_dir.name
                result = build_convo_html(convo_dir, output_root, rel_path)
                if result:
                    all_convos.setdefault(category, []).append(result)
    build_index(all_convos, output_root)

if __name__ == "__main__":
    parser = ArgumentParser(
        description="Parse Facebook Messenger export into HTML archive."
    )
    parser.add_argument(
        "messages_folder",
        help="Path to 'messages' folder (contains inbox/, archived_threads/, e2ee_cutover/)",
    )
    parser.add_argument("--output", help="Output folder", default="output_html")
    args = parser.parse_args()
    process_all(args.messages_folder, Path(args.output).resolve())
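
# Note on input shape: each conversation folder in the export holds one or
# more JSON files (typically named message_1.json, message_2.json, ...).
# Below is a minimal sketch of the structure this script reads, based only
# on the fields used above (title, participants, sender_name, timestamp_ms,
# content); the exact export format is Facebook's and may vary:
#
# {
#   "title": "Jane Doe",
#   "participants": [{"name": "Jane Doe"}, {"name": "YOUR NAME AS IT IS ON FB HERE"}],
#   "messages": [
#     {"sender_name": "Jane Doe",
#      "timestamp_ms": 1714057000000,
#      "content": "See you at 7?"}
#   ]
# }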
Request and download your Facebook data export from Facebook, choosing JSON as the format (this script parses the JSON files, not the HTML export). Once you have it, unzip it anywhere and put this script in the root folder of the extracted data. A standard export keeps the message data in facebook_data\your_facebook_activity\messages; pass that path to the script.
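For example, assuming you saved the script as fb_messenger_parser.py (the filename is your choice) in the folder that contains facebook_data, a typical run looks like this:

python fb_messenger_parser.py facebook_data/your_facebook_activity/messages --output output_html

The archive is written to output_html (the --output default), with a top-level index.html linking to one generated page per conversation.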