@secemp9
Created April 14, 2025 11:57
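"""Analyze a Claude `conversations.json` export: plot daily/weekly/monthly
message volume, message-length and conversation-length distributions, and
activity by weekday and hour, then write summary statistics to CSV.
(Docstring added; summary of what the script below does.)"""
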
import json
import os
import traceback
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.gridspec import GridSpec

# Set style for prettier plots
plt.style.use('ggplot')
sns.set_palette("pastel")

def safe_extract_text(message):
    """Safely extract text content from a message with various possible structures."""
    try:
        content = message.get('content')
        # Old structure: 'content' is a dict with a 'parts' list
        if isinstance(content, dict) and 'parts' in content:
            return content.get('parts', [''])[0]
        # New structure: 'content' is a list of blocks
        elif isinstance(content, list) and len(content) > 0:
            # Prefer any block with a 'text' key
            for item in content:
                if isinstance(item, dict) and 'text' in item:
                    return item.get('text', '')
            # Otherwise fall back to a block with a 'content' key
            for item in content:
                if isinstance(item, dict) and 'content' in item:
                    return str(item.get('content', ''))
        # A 'text' field directly on the message
        elif 'text' in message:
            return message.get('text', '')
        # Fallback: return empty string if we can't find the content
        return ''
    except Exception as e:
        print(f"Error extracting text: {e}")
        return ''
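
# Illustrative message shapes the extractor above handles (assumed examples,
# derived from the branches in safe_extract_text, not from a real export):
#   {'content': {'parts': ['Hello']}}                 -> 'Hello'
#   {'content': [{'type': 'text', 'text': 'Hello'}]}  -> 'Hello'
#   {'text': 'Hello'}                                 -> 'Hello'
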
def process_data(data_path):
    """Process conversation data and extract metrics."""
    # Expand '~' so the default path works with open()
    data_path = os.path.expanduser(data_path)
    print(f"Loading data from: {data_path}")
    try:
        with open(data_path) as f:
            data = json.load(f)
        print(f"Successfully loaded data: {len(data)} conversations found")
    except Exception as e:
        print(f"Error loading data: {e}")
        traceback.print_exc()
        return None

    # Empty lists to store the various metrics
    timestamps = []
    message_lengths = []
    conversation_lengths = []
    conversation_topics = []
    human_assistant_pairs = []
    weekdays = []
    hours = []
    error_counts = 0

    # Process the data
    for conv_idx, conversation in enumerate(data):
        try:
            # Extract the conversation title if available
            conv_title = conversation.get('title', 'Untitled Conversation')
            conversation_topics.append(conv_title)

            # Count human messages in this conversation
            conv_message_count = 0
            conv_timestamps = []

            # Newer exports use 'chat_messages'; fall back to 'messages'
            chat_messages = conversation.get('chat_messages', [])
            if not chat_messages and 'messages' in conversation:
                chat_messages = conversation.get('messages', [])

            for message in chat_messages:
                try:
                    created_at = message.get('created_at')
                    if not created_at:
                        continue
                    dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))

                    # For all messages
                    weekdays.append(dt.strftime('%A'))
                    hours.append(dt.hour)

                    if message.get('sender') == "human":
                        timestamps.append(created_at)
                        conv_timestamps.append(dt)
                        conv_message_count += 1
                        # Record message length
                        content_text = safe_extract_text(message)
                        message_lengths.append(len(content_text))
                except Exception as e:
                    error_counts += 1
                    print(f"Error processing message in conversation {conv_idx}: {e}")
                    continue

            # Store conversation length (number of human messages)
            if conv_message_count > 0:
                conversation_lengths.append(conv_message_count)

            # Time gaps between consecutive human messages in the conversation.
            # Note: despite the variable name, these are gaps between human
            # messages, not strict human-to-assistant response times.
            if len(conv_timestamps) > 1:
                for i in range(len(conv_timestamps) - 1):
                    time_diff = (conv_timestamps[i + 1] - conv_timestamps[i]).total_seconds() / 60  # in minutes
                    # Only consider gaps under a day (1440 minutes) to filter out long inactive periods
                    if time_diff < 1440:
                        human_assistant_pairs.append(time_diff)
        except Exception as e:
            error_counts += 1
            print(f"Error processing conversation {conv_idx}: {e}")
            continue

    print(f"Processed {len(timestamps)} human messages with {error_counts} errors")

    if not timestamps:
        print("No valid timestamps found. Cannot continue analysis.")
        return None

    # Convert timestamps to datetime objects and extract dates
    dates = [datetime.fromisoformat(ts.replace('Z', '+00:00')).date() for ts in timestamps]

    # Count messages per day
    daily_counts = Counter(dates)

    # Prepare results dictionary
    results = {
        'daily_counts': daily_counts,
        'dates': dates,
        'timestamps': timestamps,
        'message_lengths': message_lengths,
        'conversation_lengths': conversation_lengths,
        'human_assistant_pairs': human_assistant_pairs,
        'weekdays': weekdays,
        'hours': hours,
        'conversation_topics': conversation_topics
    }
    return results
def create_visualizations(results, output_dir='output'):
    """Create visualizations from processed data."""
    if not results:
        print("No results to visualize.")
        return

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Extract data from results
    daily_counts = results['daily_counts']
    message_lengths = results['message_lengths']
    conversation_lengths = results['conversation_lengths']
    human_assistant_pairs = results['human_assistant_pairs']
    weekdays = results['weekdays']
    hours = results['hours']

    # Convert to DataFrame for easier plotting
    df_daily = pd.DataFrame.from_dict(daily_counts, orient='index').reset_index()
    df_daily.columns = ['date', 'message_count']
    df_daily = df_daily.sort_values('date')

    # Create a figure with multiple subplots using GridSpec
    plt.figure(figsize=(15, 20))
    gs = GridSpec(4, 2)

    # 1. Daily Usage Plot
    ax1 = plt.subplot(gs[0, :])
    ax1.bar(df_daily['date'], df_daily['message_count'], color='skyblue')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Number of Messages')
    ax1.set_title('Usage Per Day')
    ax1.tick_params(axis='x', rotation=45)
    ax1.grid(axis='y', linestyle='--', alpha=0.7)

    # Annotate the total number of messages
    total_messages = sum(daily_counts.values())
    ax1.annotate(f'Total Messages: {total_messages}',
                 xy=(0.95, 0.95),
                 xycoords='axes fraction',
                 horizontalalignment='right',
                 verticalalignment='top',
                 bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
    # 2. Weekly Trend Analysis
    ax2 = plt.subplot(gs[1, 0])
    try:
        # Convert the date column to datetime explicitly
        df_daily['date'] = pd.to_datetime(df_daily['date'])
        df_daily['week'] = df_daily['date'].dt.isocalendar().week
        weekly_data = df_daily.groupby('week')['message_count'].sum().reset_index()
        ax2.plot(weekly_data['week'], weekly_data['message_count'], marker='o', linestyle='-', color='green')
        ax2.set_xlabel('Week Number')
        ax2.set_ylabel('Number of Messages')
        ax2.set_title('Weekly Message Volume')
        ax2.grid(True, linestyle='--', alpha=0.7)
    except Exception as e:
        print(f"Error creating weekly trend analysis: {e}")
        ax2.text(0.5, 0.5, "Weekly trend analysis unavailable",
                 ha='center', va='center', transform=ax2.transAxes)

    # 3. Message Length Distribution
    ax3 = plt.subplot(gs[1, 1])
    if message_lengths:
        sns.histplot(message_lengths, bins=20, kde=True, ax=ax3, color='purple')
        ax3.set_xlabel('Message Length (characters)')
        ax3.set_ylabel('Frequency')
        ax3.set_title('Distribution of Message Lengths')
        # Add statistics as annotations
        avg_length = np.mean(message_lengths)
        median_length = np.median(message_lengths)
        ax3.annotate(f'Mean: {avg_length:.1f} chars\nMedian: {median_length:.1f} chars',
                     xy=(0.95, 0.95),
                     xycoords='axes fraction',
                     horizontalalignment='right',
                     verticalalignment='top',
                     bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
    else:
        ax3.text(0.5, 0.5, "No message length data available",
                 ha='center', va='center', transform=ax3.transAxes)

    # 4. Day of Week Activity
    ax4 = plt.subplot(gs[2, 0])
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    weekday_counts = Counter(weekdays)
    ordered_counts = [weekday_counts.get(day, 0) for day in weekday_order]
    ax4.bar(weekday_order, ordered_counts, color='coral')
    ax4.set_xlabel('Day of Week')
    ax4.set_ylabel('Number of Messages')
    ax4.set_title('Activity by Day of Week')
    ax4.tick_params(axis='x', rotation=45)
    ax4.grid(axis='y', linestyle='--', alpha=0.7)

    # 5. Hour of Day Activity
    ax5 = plt.subplot(gs[2, 1])
    hour_counts = Counter(hours)
    hour_df = pd.DataFrame.from_dict({h: hour_counts.get(h, 0) for h in range(24)},
                                     orient='index', columns=['count']).reset_index()
    hour_df.columns = ['hour', 'count']
    sns.barplot(x='hour', y='count', data=hour_df, ax=ax5, color='steelblue')
    ax5.set_xlabel('Hour of Day (24-hour format)')
    ax5.set_ylabel('Number of Messages')
    ax5.set_title('Activity by Hour of Day')
    ax5.set_xticks(range(0, 24, 2))
    ax5.grid(axis='y', linestyle='--', alpha=0.7)
    # 6. Message Gap Analysis (time between consecutive human messages)
    ax6 = plt.subplot(gs[3, 0])
    if human_assistant_pairs:
        # Filter out extreme outliers (top 5%) for better visualization
        response_times = [t for t in human_assistant_pairs if t < np.percentile(human_assistant_pairs, 95)]
        sns.histplot(response_times, bins=20, kde=True, ax=ax6, color='teal')
        ax6.set_xlabel('Gap Between Messages (minutes)')
        ax6.set_ylabel('Frequency')
        ax6.set_title('Distribution of Gaps Between Human Messages')
        # Add statistics as annotations
        avg_response = np.mean(response_times)
        median_response = np.median(response_times)
        ax6.annotate(f'Mean: {avg_response:.1f} mins\nMedian: {median_response:.1f} mins',
                     xy=(0.95, 0.95),
                     xycoords='axes fraction',
                     horizontalalignment='right',
                     verticalalignment='top',
                     bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
    else:
        ax6.text(0.5, 0.5, "No message gap data available",
                 ha='center', va='center', transform=ax6.transAxes)

    # 7. Conversation Length Distribution
    ax7 = plt.subplot(gs[3, 1])
    if conversation_lengths:
        sns.histplot(conversation_lengths, bins=15, kde=True, ax=ax7, color='olive')
        ax7.set_xlabel('Number of Messages per Conversation')
        ax7.set_ylabel('Frequency')
        ax7.set_title('Distribution of Conversation Lengths')
        # Add statistics as annotations
        avg_conv_length = np.mean(conversation_lengths)
        median_conv_length = np.median(conversation_lengths)
        ax7.annotate(f'Mean: {avg_conv_length:.1f} msgs\nMedian: {median_conv_length:.1f} msgs',
                     xy=(0.95, 0.95),
                     xycoords='axes fraction',
                     horizontalalignment='right',
                     verticalalignment='top',
                     bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
    else:
        ax7.text(0.5, 0.5, "No conversation length data available",
                 ha='center', va='center', transform=ax7.transAxes)

    plt.tight_layout()
    plt.savefig(f'{output_dir}/conversation_analytics_dashboard.png', dpi=300, bbox_inches='tight')
    print(f"Saved analytics dashboard to {output_dir}/conversation_analytics_dashboard.png")
    # Additional analysis: rolling 7-day average of message volume
    if len(df_daily) > 7:
        try:
            plt.figure(figsize=(12, 6))
            # Make sure df_daily['date'] is datetime before creating the date range
            if not pd.api.types.is_datetime64_any_dtype(df_daily['date']):
                df_daily['date'] = pd.to_datetime(df_daily['date'])
            # Create a continuous date range so days with no messages count as zero
            date_range = pd.date_range(start=df_daily['date'].min(), end=df_daily['date'].max())
            full_df = pd.DataFrame({'date': date_range})
            df_daily_full = pd.merge(full_df, df_daily, on='date', how='left').fillna(0)
            # Calculate the rolling average
            df_daily_full['7day_avg'] = df_daily_full['message_count'].rolling(window=7, min_periods=1).mean()
            # Plot actual values and the rolling average
            plt.plot(df_daily_full['date'], df_daily_full['message_count'], color='skyblue', alpha=0.5,
                     label='Daily Messages')
            plt.plot(df_daily_full['date'], df_daily_full['7day_avg'], color='blue', linewidth=2,
                     label='7-day Moving Average')
            plt.xlabel('Date')
            plt.ylabel('Number of Messages')
            plt.title('Daily Message Volume with 7-day Moving Average')
            plt.legend()
            plt.grid(True, linestyle='--', alpha=0.7)
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig(f'{output_dir}/message_volume_trend.png', dpi=300)
            print(f"Saved message volume trend to {output_dir}/message_volume_trend.png")
        except Exception as e:
            print(f"Error creating rolling average plot: {e}")
            traceback.print_exc()

    # Top active days
    try:
        top_days = df_daily.nlargest(5, 'message_count')
        plt.figure(figsize=(10, 6))
        plt.bar(top_days['date'].dt.strftime('%Y-%m-%d'), top_days['message_count'], color='orange')
        plt.xlabel('Date')
        plt.ylabel('Number of Messages')
        plt.title('Top 5 Most Active Days')
        plt.xticks(rotation=45)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(f'{output_dir}/top_active_days.png', dpi=300)
        print(f"Saved top active days to {output_dir}/top_active_days.png")
    except Exception as e:
        print(f"Error creating top active days plot: {e}")
        traceback.print_exc()

    # Monthly activity plot
    try:
        # Group by year and month
        df_daily['year_month'] = df_daily['date'].dt.to_period('M')
        monthly_data = df_daily.groupby('year_month')['message_count'].sum().reset_index()
        monthly_data['year_month_str'] = monthly_data['year_month'].dt.strftime('%Y-%m')
        plt.figure(figsize=(12, 6))
        plt.bar(monthly_data['year_month_str'], monthly_data['message_count'], color='lightgreen')
        plt.xlabel('Month')
        plt.ylabel('Number of Messages')
        plt.title('Monthly Message Volume')
        plt.xticks(rotation=45)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(f'{output_dir}/monthly_activity.png', dpi=300)
        print(f"Saved monthly activity to {output_dir}/monthly_activity.png")
    except Exception as e:
        print(f"Error creating monthly activity plot: {e}")
        traceback.print_exc()
    # Export summary statistics to CSV
    summary_stats = {
        'Metric': [
            'Total Messages',
            'Avg Messages Per Day',
            'Avg Message Length',
            'Median Message Length',
            'Avg Gap Between Messages (min)',
            'Median Gap Between Messages (min)',
            'Avg Messages Per Conversation',
            'Median Messages Per Conversation',
            'Most Active Day',
            'Most Active Hour',
            'Total Conversations',
            'Date Range'
        ],
        'Value': [
            total_messages,
            total_messages / len(daily_counts) if daily_counts else 0,
            np.mean(message_lengths) if message_lengths else 0,
            np.median(message_lengths) if message_lengths else 0,
            np.mean(human_assistant_pairs) if human_assistant_pairs else 0,
            np.median(human_assistant_pairs) if human_assistant_pairs else 0,
            np.mean(conversation_lengths) if conversation_lengths else 0,
            np.median(conversation_lengths) if conversation_lengths else 0,
            max(weekday_counts.items(), key=lambda x: x[1])[0] if weekday_counts else 'N/A',
            max(hour_counts.items(), key=lambda x: x[1])[0] if hour_counts else 'N/A',
            len(conversation_lengths),
            f"{min(daily_counts.keys())} to {max(daily_counts.keys())}" if daily_counts else 'N/A'
        ]
    }
    summary_df = pd.DataFrame(summary_stats)
    summary_df.to_csv(f'{output_dir}/conversation_summary_stats.csv', index=False)
    print(f"Saved summary statistics to {output_dir}/conversation_summary_stats.csv")

    # Print summary for reference
    print("\nConversation Analysis Summary:")
    for metric, value in zip(summary_stats['Metric'], summary_stats['Value']):
        print(f"{metric}: {value}")
def main():
    """Run the conversation analysis from the command line."""
    import argparse

    parser = argparse.ArgumentParser(description='Analyze Claude conversation data')
    parser.add_argument('--file', type=str,
                        default=r"~/Downloads/data-2025-04-11-15-00-47/conversations.json",
                        help='Path to the conversations.json file')
    parser.add_argument('--output', type=str, default='output',
                        help='Output directory for visualization files')
    args = parser.parse_args()

    print("Starting conversation analysis...")
    results = process_data(args.file)
    if results:
        create_visualizations(results, args.output)
        print("Analysis complete!")
    else:
        print("Analysis failed due to data processing errors.")


if __name__ == "__main__":
    main()
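Usage sketch (the script filename here is hypothetical; point --file at your own Claude export):

    python analyze_claude_conversations.py --file ~/Downloads/your-export/conversations.json --output output

The --file default points at the author's local export; since process_data expands the path with os.path.expanduser, a leading ~ works.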