A Python script that parses the Messenger data in your downloaded Facebook data export and generates searchable HTML files from it
import json
import re
import shutil
from pathlib import Path
from datetime import datetime
from argparse import ArgumentParser
from html import escape

# Your name exactly as it appears on Facebook; used to tell your own
# messages apart from everyone else's when styling the chat log.
MY_NAME = "YOUR NAME AS IT IS ON FB HERE"

# Media subfolders that may sit inside each conversation directory,
# mapped to the file extensions treated as that media type.
MEDIA_TYPES = {
    "audio": [".mp3", ".m4a", ".aac", ".wav", ".ogg", ".opus", ".flac", ".wma", ".aiff"],
    "photos": [".jpg", ".jpeg", ".png", ".webp", ".heic", ".heif", ".tiff", ".bmp"],
    "gifs": [".gif"],
    "videos": [".mp4", ".mov", ".mkv", ".avi", ".wmv", ".flv", ".webm", ".m4v", ".3gp", ".mpeg", ".mpg", ".rmvb", ".vob", ".ts"],
    "files": [".docx", ".pdf", ".txt", ".zip", ".pptx", ".xlsx", ".csv", ".rar", ".7z", ".tar", ".gz", ".bz2"],
}

def readable_timestamp(ts_ms):
    # Facebook exports timestamps in milliseconds since the Unix epoch.
    return datetime.fromtimestamp(ts_ms / 1000).strftime("%m/%d/%Y %I:%M %p")


def linkify(text):
    # Wrap bare http(s) URLs in anchor tags so they are clickable in the page.
    def replacer(match):
        url = match.group(0)
        return f'<a href="{url}" target="_blank" rel="noopener noreferrer">{url}</a>'

    url_pattern = re.compile(r"(https?://[^\s]+)")
    return url_pattern.sub(replacer, text)

def parse_messages(json_files):
    all_messages = []
    participants = set()
    for file in sorted(json_files):
        try:
            with open(file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except UnicodeDecodeError:
            with open(file, "r", encoding="utf-8", errors="replace") as f:
                data = json.load(f)
        participants.update(
            p["name"] for p in data.get("participants", []) if p.get("name")
        )
        for msg in data.get("messages", []):
            if "content" in msg:
                all_messages.append(
                    {
                        "timestamp": msg["timestamp_ms"],
                        "sender": msg["sender_name"],
                        "content": msg["content"],
                    }
                )
    all_messages.sort(key=lambda m: m["timestamp"])
    output_html = []
    for msg in all_messages:
        ts = readable_timestamp(msg["timestamp"])
        css_class = "from-me" if msg["sender"] == MY_NAME else "from-them"
        # Escape quotes in the data-* attributes so an apostrophe in a message
        # cannot break out of the single-quoted HTML attributes.
        sender_attr = escape(msg["sender"].lower(), quote=True)
        content_attr = escape(msg["content"].lower(), quote=True)
        content_html = linkify(escape(msg["content"], quote=False))
        output_html.append(
            f"<p class='message {css_class}' data-sender='{sender_attr}' "
            f"data-content='{content_attr}'><strong>[{ts}]</strong> "
            f"<span class='sender'>{escape(msg['sender'])}</span>: "
            f"<span class='content'>{content_html}</span></p>"
        )
    return "\n".join(output_html), participants

def generate_media_previews(folder, rel_url):
    # Inline <img>/<video>/<audio> previews for one conversation's media folder.
    if not folder.exists():
        return ""
    previews = ""
    for file in sorted(folder.iterdir()):
        ext = file.suffix.lower()
        url = f"{rel_url}/{file.name}"
        if ext in MEDIA_TYPES["photos"] + MEDIA_TYPES["gifs"]:
            previews += (
                f"<div class='media-box'><img src='{url}' class='img-fluid'></div>"
            )
        elif ext in MEDIA_TYPES["videos"]:
            # No explicit MIME type: the folder can hold more than MP4,
            # so let the browser work out the container format.
            previews += f"<div class='media-box'><video controls width='100%'><source src='{url}'></video></div>"
        elif ext in MEDIA_TYPES["audio"]:
            previews += f"<div class='media-box'><audio controls><source src='{url}'></audio></div>"
    return previews

def generate_media_table(folder, rel_url):
    if not folder.exists():
        return ""
    rows = ""
    for file in sorted(folder.iterdir()):
        if file.is_file():
            ext = file.suffix.replace(".", "").upper()
            rows += f"<tr><td>{escape(file.name)}</td><td>{ext}</td><td><a href='{rel_url}/{file.name}' target='_blank'>View</a></td></tr>"
    if rows:
        return f"<h3>{folder.name.title()} Files</h3><table class='table table-sm table-dark table-striped'><thead><tr><th>Filename</th><th>Type</th><th>View/Download</th></tr></thead><tbody>{rows}</tbody></table>"
    return ""

def copy_media_dirs(convo_dir, output_convo_dir):
    # Copy media subfolders next to the generated HTML so relative links resolve.
    for media_folder in MEDIA_TYPES:
        src = convo_dir / media_folder
        dst = output_convo_dir / media_folder
        if src.exists() and src.is_dir():
            shutil.copytree(src, dst, dirs_exist_ok=True)

def generate_html_page(
    title, messages_html, media_tables, media_previews, participants
):
    participants_html = (
        "<p id='participants'><strong>Participants:</strong> "
        + ", ".join(escape(p) for p in sorted(participants))
        + "</p>"
        if participants
        else ""
    )
    return f"""
<!DOCTYPE html>
<html lang='en'>
<head>
<meta charset='UTF-8'>
<title>{escape(title)}</title>
<link href='https://cdn.jsdelivr.net/npm/bootstrap@latest/dist/css/bootstrap.min.css' rel='stylesheet'>
<link href='https://cdn.jsdelivr.net/npm/@fortawesome/fontawesome-free@latest/css/all.min.css' rel='stylesheet'>
<link href='https://cdn.jsdelivr.net/npm/sweetalert2@latest/dist/sweetalert2.min.css' rel='stylesheet'>
<link href='https://cdn.jsdelivr.net/npm/tippy.js@latest/dist/tippy.css' rel='stylesheet'>
<!-- Optional local overrides; this file is not generated by the script. -->
<link href='assets/css/all.css' rel='stylesheet'>
<style>
body {{ background: #222222; color: #F5F5F5; font-family: monospace; padding: 20px; }}
.media-box {{ margin: 10px 0; }}
.chat-log p {{ margin-bottom: 5px; }}
#search {{ width: 100%; margin-bottom: 20px; padding: 10px; border-radius: 5px; border: none; }}
.from-me {{ color: #0fd5db; }}
.from-them {{ color: #9bff7d; }}
p {{ margin-top: 5px; margin-bottom: 5px; }}
#participants {{ font-size: 32px; text-align: center; margin-bottom: 20px; }}
</style>
</head>
<body>
<h1 class='text-warning text-center'>
{escape(title)}
</h1>
{participants_html}
<input id='search' type='text' placeholder='Search by sender or message...' />
<div class='chat-log'>
{messages_html}
</div>
<hr/>
{media_previews}
{media_tables}
<script>
// Live search: hide messages whose sender and content both miss the query.
document.getElementById('search').addEventListener('input', function(e) {{
    const value = e.target.value.toLowerCase();
    document.querySelectorAll('.message').forEach(msg => {{
        const sender = msg.dataset.sender || '';
        const content = msg.dataset.content || '';
        msg.style.display = (sender.includes(value) || content.includes(value)) ? 'block' : 'none';
    }});
}});
</script>
<script src='https://cdn.jsdelivr.net/npm/bootstrap@latest/dist/js/bootstrap.bundle.min.js'></script>
<script src='https://cdn.jsdelivr.net/npm/sweetalert2@latest/dist/sweetalert2.all.min.js'></script>
<script src='https://cdn.jsdelivr.net/npm/tippy.js@latest/dist/tippy-bundle.umd.min.js'></script>
<script src='https://cdn.jsdelivr.net/npm/clipboard@latest/dist/clipboard.min.js'></script>
<script src='https://cdn.jsdelivr.net/npm/clipboard-polyfill@latest/dist/clipboard-polyfill.min.js'></script>
<!-- Optional local overrides; this file is not generated by the script. -->
<script src='assets/js/all.js'></script>
</body>
</html>
"""

def build_convo_html(convo_dir, output_dir, rel_output_path):
    json_files = list(convo_dir.glob("*.json"))
    if not json_files:
        return None
    messages_html, participants = parse_messages(json_files)
    media_html = ""
    tables_html = ""
    for folder_name in MEDIA_TYPES:
        media_path = convo_dir / folder_name
        rel_url = f"./{folder_name}" if media_path.exists() else ""
        media_html += generate_media_previews(media_path, rel_url)
        tables_html += generate_media_table(media_path, rel_url)
    output_path = output_dir / rel_output_path / "index.html"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    copy_media_dirs(convo_dir, output_path.parent)
    # Use the thread title from the first JSON file; fall back to the folder name.
    try:
        with open(json_files[0], "r", encoding="utf-8") as f:
            title = json.load(f).get("title", convo_dir.name)
    except (OSError, ValueError):
        title = convo_dir.name
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(
            generate_html_page(
                title, messages_html, tables_html, media_html, participants
            )
        )
    return str(rel_output_path).replace("\\", "/")

def build_index(all_convos, output_dir):
    sections = ""
    for section, convos in all_convos.items():
        links = "\n".join(
            f"<li class='list-group-item bg-dark'><a class='text-warning' href='{c}/index.html'>{c}</a></li>"
            for c in sorted(convos)
        )
        sections += f"<h2 class='text-warning'>{section.title()}</h2><ul class='list-group mb-4'>{links}</ul>"
    html = f"""<!DOCTYPE html>
<html lang='en'>
<head>
<meta charset='UTF-8'>
<title>Messenger Archive</title>
<link href='https://cdn.jsdelivr.net/npm/bootstrap@latest/dist/css/bootstrap.min.css' rel='stylesheet'>
</head>
<body class='bg-dark text-light'>
<div class='container mt-5'>
<h1 class='text-warning text-center mb-5'>
Facebook Messenger Archive
</h1>
{sections}
</div>
</body>
</html>"""
    with open(output_dir / "index.html", "w", encoding="utf-8") as f:
        f.write(html)

def process_all(root_input, output_root):
    # Conversation categories found in a Facebook messages export.
    categories = [
        "inbox",
        "archived_threads",
        "e2ee_cutover",
        "stickers_used",
        "support_files",
    ]
    all_convos = {}
    for category in categories:
        input_path = Path(root_input) / category
        if not input_path.exists():
            continue
        for convo_dir in input_path.iterdir():
            if convo_dir.is_dir():
                rel_path = Path(category) / convo_dir.name
                result = build_convo_html(convo_dir, output_root, rel_path)
                if result:
                    all_convos.setdefault(category, []).append(result)
    build_index(all_convos, output_root)

if __name__ == "__main__":
    parser = ArgumentParser(
        description="Parse Facebook Messenger export into HTML archive."
    )
    parser.add_argument(
        "messages_folder",
        help="Path to 'messages' folder (contains inbox/, archived_threads/, e2ee_cutover/)",
    )
    parser.add_argument("--output", help="Output folder", default="output_html")
    args = parser.parse_args()
    process_all(args.messages_folder, Path(args.output).resolve())
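
# Note on input shape: each conversation folder in the export holds one or
# more JSON files (typically named message_1.json, message_2.json, ...).
# Below is a minimal sketch of the structure this script reads, based only
# on the fields used above (title, participants, sender_name, timestamp_ms,
# content); the exact export format is Facebook's and may vary:
#
# {
#   "title": "Jane Doe",
#   "participants": [{"name": "Jane Doe"}, {"name": "YOUR NAME AS IT IS ON FB HERE"}],
#   "messages": [
#     {"sender_name": "Jane Doe",
#      "timestamp_ms": 1714057000000,
#      "content": "See you at 7?"}
#   ]
# }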
Request and download your Facebook data export from Facebook, choosing JSON as the format (this script parses the JSON files, not the HTML export). Once you have it, unzip it anywhere and put this script in the root folder of the extracted data. A standard export keeps the message data in facebook_data\your_facebook_activity\messages; pass that path to the script.
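For example, assuming you saved the script as fb_messenger_parser.py (the filename is your choice) in the folder that contains facebook_data, a typical run looks like this:

python fb_messenger_parser.py facebook_data/your_facebook_activity/messages --output output_html

The archive is written to output_html (the --output default), with a top-level index.html linking to one generated page per conversation.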