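"""Analyze a Claude data-export `conversations.json` and chart usage patterns.

Reads the export, extracts per-message timestamps and lengths, and writes a
set of PNG dashboards plus a CSV of summary statistics to an output directory.
"""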
import json
import os
import traceback
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.gridspec import GridSpec

# Set style for prettier plots
plt.style.use('ggplot')
sns.set_palette("pastel")
def safe_extract_text(message):
    """Safely extract text content from a message with various possible structures."""
    try:
        # Old structure: 'content' is a dict with a 'parts' list
        if isinstance(message.get('content'), dict) and 'parts' in message.get('content', {}):
            return message.get('content', {}).get('parts', [''])[0]
        # New structure: 'content' is a list of blocks
        elif isinstance(message.get('content'), list) and len(message.get('content', [])) > 0:
            # Prefer any block carrying a 'text' key
            for item in message.get('content', []):
                if isinstance(item, dict) and 'text' in item:
                    return item.get('text', '')
            # Otherwise fall back to a block carrying a 'content' key
            for item in message.get('content', []):
                if isinstance(item, dict) and 'content' in item:
                    return str(item.get('content', ''))
        # Or a 'text' field directly on the message
        elif 'text' in message:
            return message.get('text', '')
        # Fallback: return an empty string if no content was found
        return ''
    except Exception as e:
        print(f"Error extracting text: {e}")
        return ''
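# Illustrative message shapes the helper above accepts (inferred from its
# parsing logic; real export records carry additional fields):
#   {"sender": "human", "content": {"parts": ["Hello"]}}                  # older export layout
#   {"sender": "human", "content": [{"type": "text", "text": "Hello"}]}   # newer export layout
#   {"sender": "human", "text": "Hello"}                                  # flat fallback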
def process_data(data_path):
    """Process conversation data and extract metrics."""
    print(f"Loading data from: {data_path}")
    try:
        # The export is UTF-8; be explicit so platform default encodings don't break parsing
        with open(data_path, encoding='utf-8') as f:
            data = json.load(f)
        print(f"Successfully loaded data: {len(data)} conversations found")
    except Exception as e:
        print(f"Error loading data: {e}")
        traceback.print_exc()
        return None
    # Lists to accumulate the various metrics
    timestamps = []
    message_lengths = []
    conversation_lengths = []
    conversation_topics = []
    human_assistant_pairs = []
    weekdays = []
    hours = []
    error_counts = 0

    # Process the data
    for conv_idx, conversation in enumerate(data):
        try:
            # Extract the conversation topic/title if available
            conv_title = conversation.get('title', 'Untitled Conversation')
            conversation_topics.append(conv_title)

            # Count human messages in this conversation
            conv_message_count = 0
            conv_timestamps = []

            # Prefer 'chat_messages'; fall back to 'messages' if absent
            chat_messages = conversation.get('chat_messages', [])
            if not chat_messages and 'messages' in conversation:
                chat_messages = conversation.get('messages', [])

            for message in chat_messages:
                try:
                    created_at = message.get('created_at')
                    if not created_at:
                        continue
                    dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))

                    # Recorded for all messages, human and assistant alike
                    weekdays.append(dt.strftime('%A'))
                    hours.append(dt.hour)

                    if message.get('sender') == "human":
                        timestamps.append(created_at)
                        conv_timestamps.append(dt)
                        conv_message_count += 1

                        # Record the message length in characters
                        content_text = safe_extract_text(message)
                        message_lengths.append(len(content_text))
                except Exception as e:
                    error_counts += 1
                    print(f"Error processing message in conversation {conv_idx}: {e}")
                    continue
            # Store the conversation length (number of human messages)
            if conv_message_count > 0:
                conversation_lengths.append(conv_message_count)

            # Time gaps between consecutive *human* messages in this conversation
            # (a rough proxy for how quickly the user followed up)
            if len(conv_timestamps) > 1:
                for i in range(len(conv_timestamps) - 1):
                    time_diff = (conv_timestamps[i + 1] - conv_timestamps[i]).total_seconds() / 60  # in minutes
                    # Only keep gaps under a day (1440 minutes) to filter out long inactive periods
                    if time_diff < 1440:
                        human_assistant_pairs.append(time_diff)
        except Exception as e:
            error_counts += 1
            print(f"Error processing conversation {conv_idx}: {e}")
            continue
print(f"Processed {len(timestamps)} human messages with {error_counts} errors") | |
if not timestamps: | |
print("No valid timestamps found. Cannot continue analysis.") | |
return None | |
# Convert timestamps to datetime objects and extract dates | |
dates = [datetime.fromisoformat(ts.replace('Z', '+00:00')).date() for ts in timestamps] | |
# Count messages per day | |
daily_counts = Counter(dates) | |
# Prepare results dictionary | |
results = { | |
'daily_counts': daily_counts, | |
'dates': dates, | |
'timestamps': timestamps, | |
'message_lengths': message_lengths, | |
'conversation_lengths': conversation_lengths, | |
'human_assistant_pairs': human_assistant_pairs, | |
'weekdays': weekdays, | |
'hours': hours, | |
'conversation_topics': conversation_topics | |
} | |
return results | |
def create_visualizations(results, output_dir='output'):
    """Create visualizations from processed data."""
    if not results:
        print("No results to visualize.")
        return

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Unpack the processed data
    daily_counts = results['daily_counts']
    message_lengths = results['message_lengths']
    conversation_lengths = results['conversation_lengths']
    human_assistant_pairs = results['human_assistant_pairs']
    weekdays = results['weekdays']
    hours = results['hours']

    # Convert to a DataFrame for easier plotting
    df_daily = pd.DataFrame.from_dict(daily_counts, orient='index').reset_index()
    df_daily.columns = ['date', 'message_count']
    df_daily = df_daily.sort_values('date')

    # One figure with multiple subplots laid out via GridSpec
    plt.figure(figsize=(15, 20))
    gs = GridSpec(4, 2)
    # 1. Daily Usage Plot
    ax1 = plt.subplot(gs[0, :])
    ax1.bar(df_daily['date'], df_daily['message_count'], color='skyblue')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Number of Messages')
    ax1.set_title('Usage Per Day')
    ax1.tick_params(axis='x', rotation=45)
    ax1.grid(axis='y', linestyle='--', alpha=0.7)

    # Annotate the total number of messages
    total_messages = sum(daily_counts.values())
    ax1.annotate(f'Total Messages: {total_messages}',
                 xy=(0.95, 0.95),
                 xycoords='axes fraction',
                 horizontalalignment='right',
                 verticalalignment='top',
                 bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
    # 2. Weekly Trend Analysis
    ax2 = plt.subplot(gs[1, 0])
    try:
        # Convert the date column to datetime explicitly
        df_daily['date'] = pd.to_datetime(df_daily['date'])
        df_daily['week'] = df_daily['date'].dt.isocalendar().week
        weekly_data = df_daily.groupby('week')['message_count'].sum().reset_index()
        ax2.plot(weekly_data['week'], weekly_data['message_count'], marker='o', linestyle='-', color='green')
        ax2.set_xlabel('Week Number')
        ax2.set_ylabel('Number of Messages')
        ax2.set_title('Weekly Message Volume')
        ax2.grid(True, linestyle='--', alpha=0.7)
    except Exception as e:
        print(f"Error creating weekly trend analysis: {e}")
        ax2.text(0.5, 0.5, "Weekly trend analysis unavailable",
                 ha='center', va='center', transform=ax2.transAxes)
    # 3. Message Length Distribution
    ax3 = plt.subplot(gs[1, 1])
    if message_lengths:
        sns.histplot(message_lengths, bins=20, kde=True, ax=ax3, color='purple')
        ax3.set_xlabel('Message Length (characters)')
        ax3.set_ylabel('Frequency')
        ax3.set_title('Distribution of Message Lengths')

        # Add summary statistics as an annotation
        avg_length = np.mean(message_lengths)
        median_length = np.median(message_lengths)
        ax3.annotate(f'Mean: {avg_length:.1f} chars\nMedian: {median_length:.1f} chars',
                     xy=(0.95, 0.95),
                     xycoords='axes fraction',
                     horizontalalignment='right',
                     verticalalignment='top',
                     bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
    else:
        ax3.text(0.5, 0.5, "No message length data available",
                 ha='center', va='center', transform=ax3.transAxes)
    # 4. Day of Week Activity
    ax4 = plt.subplot(gs[2, 0])
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    weekday_counts = Counter(weekdays)
    ordered_counts = [weekday_counts.get(day, 0) for day in weekday_order]
    ax4.bar(weekday_order, ordered_counts, color='coral')
    ax4.set_xlabel('Day of Week')
    ax4.set_ylabel('Number of Messages')
    ax4.set_title('Activity by Day of Week')
    ax4.tick_params(axis='x', rotation=45)
    ax4.grid(axis='y', linestyle='--', alpha=0.7)
    # 5. Hour of Day Activity
    ax5 = plt.subplot(gs[2, 1])
    hour_counts = Counter(hours)
    hour_df = pd.DataFrame.from_dict({h: hour_counts.get(h, 0) for h in range(24)}, orient='index',
                                     columns=['count']).reset_index()
    hour_df.columns = ['hour', 'count']
    # A single fixed color avoids seaborn's palette-without-hue FutureWarning
    sns.barplot(x='hour', y='count', data=hour_df, ax=ax5, color='steelblue')
    ax5.set_xlabel('Hour of Day (24-hour format)')
    ax5.set_ylabel('Number of Messages')
    ax5.set_title('Activity by Hour of Day')
    ax5.set_xticks(range(0, 24, 2))
    ax5.grid(axis='y', linestyle='--', alpha=0.7)
    # 6. Message Gap Analysis (time between consecutive human messages,
    #    which the summary below reports as a response-time proxy)
    ax6 = plt.subplot(gs[3, 0])
    if human_assistant_pairs:
        # Drop the top 5% of gaps as outliers for better visualization
        response_times = [t for t in human_assistant_pairs if t < np.percentile(human_assistant_pairs, 95)]
        sns.histplot(response_times, bins=20, kde=True, ax=ax6, color='teal')
        ax6.set_xlabel('Gap Between Human Messages (minutes)')
        ax6.set_ylabel('Frequency')
        ax6.set_title('Distribution of Gaps Between Messages')

        # Add summary statistics as an annotation
        avg_response = np.mean(response_times)
        median_response = np.median(response_times)
        ax6.annotate(f'Mean: {avg_response:.1f} mins\nMedian: {median_response:.1f} mins',
                     xy=(0.95, 0.95),
                     xycoords='axes fraction',
                     horizontalalignment='right',
                     verticalalignment='top',
                     bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
    else:
        ax6.text(0.5, 0.5, "No message gap data available",
                 ha='center', va='center', transform=ax6.transAxes)
    # 7. Conversation Length Distribution
    ax7 = plt.subplot(gs[3, 1])
    if conversation_lengths:
        sns.histplot(conversation_lengths, bins=15, kde=True, ax=ax7, color='olive')
        ax7.set_xlabel('Number of Messages per Conversation')
        ax7.set_ylabel('Frequency')
        ax7.set_title('Distribution of Conversation Lengths')

        # Add summary statistics as an annotation
        avg_conv_length = np.mean(conversation_lengths)
        median_conv_length = np.median(conversation_lengths)
        ax7.annotate(f'Mean: {avg_conv_length:.1f} msgs\nMedian: {median_conv_length:.1f} msgs',
                     xy=(0.95, 0.95),
                     xycoords='axes fraction',
                     horizontalalignment='right',
                     verticalalignment='top',
                     bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
    else:
        ax7.text(0.5, 0.5, "No conversation length data available",
                 ha='center', va='center', transform=ax7.transAxes)
    plt.tight_layout()
    plt.savefig(f'{output_dir}/conversation_analytics_dashboard.png', dpi=300, bbox_inches='tight')
    print(f"Saved analytics dashboard to {output_dir}/conversation_analytics_dashboard.png")

    # Additional analysis: rolling 7-day average of message volume
    if len(df_daily) > 7:
        try:
            plt.figure(figsize=(12, 6))
            # Make sure df_daily['date'] is datetime before creating the date range
            if not pd.api.types.is_datetime64_any_dtype(df_daily['date']):
                df_daily['date'] = pd.to_datetime(df_daily['date'])

            # Build a continuous date range and fill missing days with 0
            date_range = pd.date_range(start=df_daily['date'].min(), end=df_daily['date'].max())
            full_df = pd.DataFrame({'date': date_range})
            df_daily_full = pd.merge(full_df, df_daily, on='date', how='left').fillna(0)

            # Calculate the rolling average
            df_daily_full['7day_avg'] = df_daily_full['message_count'].rolling(window=7, min_periods=1).mean()

            # Plot daily values and the rolling average
            plt.plot(df_daily_full['date'], df_daily_full['message_count'], color='skyblue', alpha=0.5,
                     label='Daily Messages')
            plt.plot(df_daily_full['date'], df_daily_full['7day_avg'], color='blue', linewidth=2,
                     label='7-day Moving Average')
            plt.xlabel('Date')
            plt.ylabel('Number of Messages')
            plt.title('Daily Message Volume with 7-day Moving Average')
            plt.legend()
            plt.grid(True, linestyle='--', alpha=0.7)
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig(f'{output_dir}/message_volume_trend.png', dpi=300)
            print(f"Saved message volume trend to {output_dir}/message_volume_trend.png")
        except Exception as e:
            print(f"Error creating rolling average plot: {e}")
            traceback.print_exc()
    # Top active days
    try:
        top_days = df_daily.nlargest(5, 'message_count')
        plt.figure(figsize=(10, 6))
        plt.bar(top_days['date'].dt.strftime('%Y-%m-%d'), top_days['message_count'], color='orange')
        plt.xlabel('Date')
        plt.ylabel('Number of Messages')
        plt.title('Top 5 Most Active Days')
        plt.xticks(rotation=45)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(f'{output_dir}/top_active_days.png', dpi=300)
        print(f"Saved top active days to {output_dir}/top_active_days.png")
    except Exception as e:
        print(f"Error creating top active days plot: {e}")
        traceback.print_exc()
    # Monthly activity plot
    try:
        # Group by year and month
        df_daily['year_month'] = df_daily['date'].dt.to_period('M')
        monthly_data = df_daily.groupby('year_month')['message_count'].sum().reset_index()
        monthly_data['year_month_str'] = monthly_data['year_month'].dt.strftime('%Y-%m')

        plt.figure(figsize=(12, 6))
        plt.bar(monthly_data['year_month_str'], monthly_data['message_count'], color='lightgreen')
        plt.xlabel('Month')
        plt.ylabel('Number of Messages')
        plt.title('Monthly Message Volume')
        plt.xticks(rotation=45)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(f'{output_dir}/monthly_activity.png', dpi=300)
        print(f"Saved monthly activity to {output_dir}/monthly_activity.png")
    except Exception as e:
        print(f"Error creating monthly activity plot: {e}")
        traceback.print_exc()
    # Export summary statistics to CSV
    summary_stats = {
        'Metric': [
            'Total Messages',
            'Avg Messages Per Day',
            'Avg Message Length',
            'Median Message Length',
            'Avg Gap Between Human Messages (min)',
            'Median Gap Between Human Messages (min)',
            'Avg Messages Per Conversation',
            'Median Messages Per Conversation',
            'Most Active Day',
            'Most Active Hour',
            'Total Conversations',
            'Date Range'
        ],
        'Value': [
            total_messages,
            total_messages / len(daily_counts) if daily_counts else 0,
            np.mean(message_lengths) if message_lengths else 0,
            np.median(message_lengths) if message_lengths else 0,
            np.mean(human_assistant_pairs) if human_assistant_pairs else 0,
            np.median(human_assistant_pairs) if human_assistant_pairs else 0,
            np.mean(conversation_lengths) if conversation_lengths else 0,
            np.median(conversation_lengths) if conversation_lengths else 0,
            max(weekday_counts.items(), key=lambda x: x[1])[0] if weekday_counts else 'N/A',
            max(hour_counts.items(), key=lambda x: x[1])[0] if hour_counts else 'N/A',
            len(conversation_lengths),
            f"{min(daily_counts.keys())} to {max(daily_counts.keys())}" if daily_counts else 'N/A'
        ]
    }
    summary_df = pd.DataFrame(summary_stats)
    summary_df.to_csv(f'{output_dir}/conversation_summary_stats.csv', index=False)
    print(f"Saved summary statistics to {output_dir}/conversation_summary_stats.csv")

    # Print the summary for reference
    print("\nConversation Analysis Summary:")
    for metric, value in zip(summary_stats['Metric'], summary_stats['Value']):
        print(f"{metric}: {value}")
def main():
    """Run the conversation analysis from the command line."""
    import argparse

    parser = argparse.ArgumentParser(description='Analyze Claude conversation data')
    parser.add_argument('--file', type=str,
                        default=r"~/Downloads/data-2025-04-11-15-00-47/conversations.json",
                        help='Path to the conversations.json file')
    parser.add_argument('--output', type=str, default='output',
                        help='Output directory for visualization files')
    args = parser.parse_args()

    print("Starting conversation analysis...")
    # Expand '~' so the default path (and any user-supplied '~' path) opens correctly
    results = process_data(os.path.expanduser(args.file))
    if results:
        create_visualizations(results, args.output)
        print("Analysis complete!")
    else:
        print("Analysis failed due to data processing errors.")


if __name__ == "__main__":
    main()
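# Example invocation (the script filename here is illustrative; use whatever
# name this file was saved under):
#   python conversation_analysis.py \
#       --file ~/Downloads/data-2025-04-11-15-00-47/conversations.json \
#       --output output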