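"""Analyze a Claude data-export `conversations.json` and chart usage patterns.

Reads the export, extracts per-message timestamps and lengths, and writes a
set of PNG dashboards plus a CSV of summary statistics to an output directory.
"""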
import json
import os
import traceback
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.gridspec import GridSpec

# Set style for prettier plots
plt.style.use('ggplot')
sns.set_palette("pastel")
def safe_extract_text(message):
    """Safely extract text content from a message with various possible structures."""
    try:
        # Old structure: 'content' is a dict with a 'parts' list
        if isinstance(message.get('content'), dict) and 'parts' in message.get('content', {}):
            return message.get('content', {}).get('parts', [''])[0]
        # New structure: 'content' is a list of blocks
        elif isinstance(message.get('content'), list) and len(message.get('content', [])) > 0:
            # Prefer any block carrying a 'text' key
            for item in message.get('content', []):
                if isinstance(item, dict) and 'text' in item:
                    return item.get('text', '')
            # Otherwise fall back to a block carrying a 'content' key
            for item in message.get('content', []):
                if isinstance(item, dict) and 'content' in item:
                    return str(item.get('content', ''))
        # Or a 'text' field directly on the message
        elif 'text' in message:
            return message.get('text', '')
        # Fallback: return an empty string if no content was found
        return ''
    except Exception as e:
        print(f"Error extracting text: {e}")
        return ''
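# Illustrative message shapes the helper above accepts (inferred from its
# parsing logic; real export records carry additional fields):
#   {"sender": "human", "content": {"parts": ["Hello"]}}                  # older export layout
#   {"sender": "human", "content": [{"type": "text", "text": "Hello"}]}   # newer export layout
#   {"sender": "human", "text": "Hello"}                                  # flat fallback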
def process_data(data_path):
    """Process conversation data and extract metrics."""
    print(f"Loading data from: {data_path}")
    try:
        # The export is UTF-8; be explicit so platform default encodings don't break parsing
        with open(data_path, encoding='utf-8') as f:
            data = json.load(f)
        print(f"Successfully loaded data: {len(data)} conversations found")
    except Exception as e:
        print(f"Error loading data: {e}")
        traceback.print_exc()
        return None
    # Lists to accumulate the various metrics
    timestamps = []
    message_lengths = []
    conversation_lengths = []
    conversation_topics = []
    human_assistant_pairs = []
    weekdays = []
    hours = []
    error_counts = 0

    # Process the data
    for conv_idx, conversation in enumerate(data):
        try:
            # Extract the conversation topic/title if available
            conv_title = conversation.get('title', 'Untitled Conversation')
            conversation_topics.append(conv_title)

            # Count human messages in this conversation
            conv_message_count = 0
            conv_timestamps = []

            # Prefer 'chat_messages'; fall back to 'messages' if absent
            chat_messages = conversation.get('chat_messages', [])
            if not chat_messages and 'messages' in conversation:
                chat_messages = conversation.get('messages', [])

            for message in chat_messages:
                try:
                    created_at = message.get('created_at')
                    if not created_at:
                        continue
                    dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))

                    # Recorded for all messages, human and assistant alike
                    weekdays.append(dt.strftime('%A'))
                    hours.append(dt.hour)

                    if message.get('sender') == "human":
                        timestamps.append(created_at)
                        conv_timestamps.append(dt)
                        conv_message_count += 1

                        # Record the message length in characters
                        content_text = safe_extract_text(message)
                        message_lengths.append(len(content_text))
                except Exception as e:
                    error_counts += 1
                    print(f"Error processing message in conversation {conv_idx}: {e}")
                    continue
            # Store the conversation length (number of human messages)
            if conv_message_count > 0:
                conversation_lengths.append(conv_message_count)

            # Time gaps between consecutive *human* messages in this conversation
            # (a rough proxy for how quickly the user followed up)
            if len(conv_timestamps) > 1:
                for i in range(len(conv_timestamps) - 1):
                    time_diff = (conv_timestamps[i + 1] - conv_timestamps[i]).total_seconds() / 60  # in minutes
                    # Only keep gaps under a day (1440 minutes) to filter out long inactive periods
                    if time_diff < 1440:
                        human_assistant_pairs.append(time_diff)
        except Exception as e:
            error_counts += 1
            print(f"Error processing conversation {conv_idx}: {e}")
            continue
print(f"Processed {len(timestamps)} human messages with {error_counts} errors") | |
if not timestamps: | |
print("No valid timestamps found. Cannot continue analysis.") | |
return None | |
# Convert timestamps to datetime objects and extract dates | |
dates = [datetime.fromisoformat(ts.replace('Z', '+00:00')).date() for ts in timestamps] | |
# Count messages per day | |
daily_counts = Counter(dates) | |
# Prepare results dictionary | |
results = { | |
'daily_counts': daily_counts, | |
'dates': dates, | |
'timestamps': timestamps, | |
'message_lengths': message_lengths, | |
'conversation_lengths': conversation_lengths, | |
'human_assistant_pairs': human_assistant_pairs, | |
'weekdays': weekdays, | |
'hours': hours, | |
'conversation_topics': conversation_topics | |
} | |
return results | |
def create_visualizations(results, output_dir='output'):
    """Create visualizations from processed data."""
    if not results:
        print("No results to visualize.")
        return

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Unpack the processed data
    daily_counts = results['daily_counts']
    message_lengths = results['message_lengths']
    conversation_lengths = results['conversation_lengths']
    human_assistant_pairs = results['human_assistant_pairs']
    weekdays = results['weekdays']
    hours = results['hours']

    # Convert to a DataFrame for easier plotting
    df_daily = pd.DataFrame.from_dict(daily_counts, orient='index').reset_index()
    df_daily.columns = ['date', 'message_count']
    df_daily = df_daily.sort_values('date')

    # One figure with multiple subplots laid out via GridSpec
    plt.figure(figsize=(15, 20))
    gs = GridSpec(4, 2)
    # 1. Daily Usage Plot
    ax1 = plt.subplot(gs[0, :])
    ax1.bar(df_daily['date'], df_daily['message_count'], color='skyblue')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Number of Messages')
    ax1.set_title('Usage Per Day')
    ax1.tick_params(axis='x', rotation=45)
    ax1.grid(axis='y', linestyle='--', alpha=0.7)

    # Annotate the total number of messages
    total_messages = sum(daily_counts.values())
    ax1.annotate(f'Total Messages: {total_messages}',
                 xy=(0.95, 0.95),
                 xycoords='axes fraction',
                 horizontalalignment='right',
                 verticalalignment='top',
                 bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
    # 2. Weekly Trend Analysis
    ax2 = plt.subplot(gs[1, 0])
    try:
        # Convert the date column to datetime explicitly
        df_daily['date'] = pd.to_datetime(df_daily['date'])
        df_daily['week'] = df_daily['date'].dt.isocalendar().week
        weekly_data = df_daily.groupby('week')['message_count'].sum().reset_index()
        ax2.plot(weekly_data['week'], weekly_data['message_count'], marker='o', linestyle='-', color='green')
        ax2.set_xlabel('Week Number')
        ax2.set_ylabel('Number of Messages')
        ax2.set_title('Weekly Message Volume')
        ax2.grid(True, linestyle='--', alpha=0.7)
    except Exception as e:
        print(f"Error creating weekly trend analysis: {e}")
        ax2.text(0.5, 0.5, "Weekly trend analysis unavailable",
                 ha='center', va='center', transform=ax2.transAxes)
    # 3. Message Length Distribution
    ax3 = plt.subplot(gs[1, 1])
    if message_lengths:
        sns.histplot(message_lengths, bins=20, kde=True, ax=ax3, color='purple')
        ax3.set_xlabel('Message Length (characters)')
        ax3.set_ylabel('Frequency')
        ax3.set_title('Distribution of Message Lengths')

        # Add summary statistics as an annotation
        avg_length = np.mean(message_lengths)
        median_length = np.median(message_lengths)
        ax3.annotate(f'Mean: {avg_length:.1f} chars\nMedian: {median_length:.1f} chars',
                     xy=(0.95, 0.95),
                     xycoords='axes fraction',
                     horizontalalignment='right',
                     verticalalignment='top',
                     bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
    else:
        ax3.text(0.5, 0.5, "No message length data available",
                 ha='center', va='center', transform=ax3.transAxes)
    # 4. Day of Week Activity
    ax4 = plt.subplot(gs[2, 0])
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    weekday_counts = Counter(weekdays)
    ordered_counts = [weekday_counts.get(day, 0) for day in weekday_order]
    ax4.bar(weekday_order, ordered_counts, color='coral')
    ax4.set_xlabel('Day of Week')
    ax4.set_ylabel('Number of Messages')
    ax4.set_title('Activity by Day of Week')
    ax4.tick_params(axis='x', rotation=45)
    ax4.grid(axis='y', linestyle='--', alpha=0.7)
    # 5. Hour of Day Activity
    ax5 = plt.subplot(gs[2, 1])
    hour_counts = Counter(hours)
    hour_df = pd.DataFrame.from_dict({h: hour_counts.get(h, 0) for h in range(24)}, orient='index',
                                     columns=['count']).reset_index()
    hour_df.columns = ['hour', 'count']
    # A single fixed color avoids seaborn's palette-without-hue FutureWarning
    sns.barplot(x='hour', y='count', data=hour_df, ax=ax5, color='steelblue')
    ax5.set_xlabel('Hour of Day (24-hour format)')
    ax5.set_ylabel('Number of Messages')
    ax5.set_title('Activity by Hour of Day')
    ax5.set_xticks(range(0, 24, 2))
    ax5.grid(axis='y', linestyle='--', alpha=0.7)
    # 6. Message Gap Analysis (time between consecutive human messages,
    #    which the summary below reports as a response-time proxy)
    ax6 = plt.subplot(gs[3, 0])
    if human_assistant_pairs:
        # Drop the top 5% of gaps as outliers for better visualization
        response_times = [t for t in human_assistant_pairs if t < np.percentile(human_assistant_pairs, 95)]
        sns.histplot(response_times, bins=20, kde=True, ax=ax6, color='teal')
        ax6.set_xlabel('Gap Between Human Messages (minutes)')
        ax6.set_ylabel('Frequency')
        ax6.set_title('Distribution of Gaps Between Messages')

        # Add summary statistics as an annotation
        avg_response = np.mean(response_times)
        median_response = np.median(response_times)
        ax6.annotate(f'Mean: {avg_response:.1f} mins\nMedian: {median_response:.1f} mins',
                     xy=(0.95, 0.95),
                     xycoords='axes fraction',
                     horizontalalignment='right',
                     verticalalignment='top',
                     bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
    else:
        ax6.text(0.5, 0.5, "No message gap data available",
                 ha='center', va='center', transform=ax6.transAxes)
    # 7. Conversation Length Distribution
    ax7 = plt.subplot(gs[3, 1])
    if conversation_lengths:
        sns.histplot(conversation_lengths, bins=15, kde=True, ax=ax7, color='olive')
        ax7.set_xlabel('Number of Messages per Conversation')
        ax7.set_ylabel('Frequency')
        ax7.set_title('Distribution of Conversation Lengths')

        # Add summary statistics as an annotation
        avg_conv_length = np.mean(conversation_lengths)
        median_conv_length = np.median(conversation_lengths)
        ax7.annotate(f'Mean: {avg_conv_length:.1f} msgs\nMedian: {median_conv_length:.1f} msgs',
                     xy=(0.95, 0.95),
                     xycoords='axes fraction',
                     horizontalalignment='right',
                     verticalalignment='top',
                     bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
    else:
        ax7.text(0.5, 0.5, "No conversation length data available",
                 ha='center', va='center', transform=ax7.transAxes)
    plt.tight_layout()
    plt.savefig(f'{output_dir}/conversation_analytics_dashboard.png', dpi=300, bbox_inches='tight')
    print(f"Saved analytics dashboard to {output_dir}/conversation_analytics_dashboard.png")

    # Additional analysis: rolling 7-day average of message volume
    if len(df_daily) > 7:
        try:
            plt.figure(figsize=(12, 6))
            # Make sure df_daily['date'] is datetime before creating the date range
            if not pd.api.types.is_datetime64_any_dtype(df_daily['date']):
                df_daily['date'] = pd.to_datetime(df_daily['date'])

            # Build a continuous date range and fill missing days with 0
            date_range = pd.date_range(start=df_daily['date'].min(), end=df_daily['date'].max())
            full_df = pd.DataFrame({'date': date_range})
            df_daily_full = pd.merge(full_df, df_daily, on='date', how='left').fillna(0)

            # Calculate the rolling average
            df_daily_full['7day_avg'] = df_daily_full['message_count'].rolling(window=7, min_periods=1).mean()

            # Plot daily values and the rolling average
            plt.plot(df_daily_full['date'], df_daily_full['message_count'], color='skyblue', alpha=0.5,
                     label='Daily Messages')
            plt.plot(df_daily_full['date'], df_daily_full['7day_avg'], color='blue', linewidth=2,
                     label='7-day Moving Average')
            plt.xlabel('Date')
            plt.ylabel('Number of Messages')
            plt.title('Daily Message Volume with 7-day Moving Average')
            plt.legend()
            plt.grid(True, linestyle='--', alpha=0.7)
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig(f'{output_dir}/message_volume_trend.png', dpi=300)
            print(f"Saved message volume trend to {output_dir}/message_volume_trend.png")
        except Exception as e:
            print(f"Error creating rolling average plot: {e}")
            traceback.print_exc()
    # Top active days
    try:
        top_days = df_daily.nlargest(5, 'message_count')
        plt.figure(figsize=(10, 6))
        plt.bar(top_days['date'].dt.strftime('%Y-%m-%d'), top_days['message_count'], color='orange')
        plt.xlabel('Date')
        plt.ylabel('Number of Messages')
        plt.title('Top 5 Most Active Days')
        plt.xticks(rotation=45)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(f'{output_dir}/top_active_days.png', dpi=300)
        print(f"Saved top active days to {output_dir}/top_active_days.png")
    except Exception as e:
        print(f"Error creating top active days plot: {e}")
        traceback.print_exc()
    # Monthly activity plot
    try:
        # Group by year and month
        df_daily['year_month'] = df_daily['date'].dt.to_period('M')
        monthly_data = df_daily.groupby('year_month')['message_count'].sum().reset_index()
        monthly_data['year_month_str'] = monthly_data['year_month'].dt.strftime('%Y-%m')

        plt.figure(figsize=(12, 6))
        plt.bar(monthly_data['year_month_str'], monthly_data['message_count'], color='lightgreen')
        plt.xlabel('Month')
        plt.ylabel('Number of Messages')
        plt.title('Monthly Message Volume')
        plt.xticks(rotation=45)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(f'{output_dir}/monthly_activity.png', dpi=300)
        print(f"Saved monthly activity to {output_dir}/monthly_activity.png")
    except Exception as e:
        print(f"Error creating monthly activity plot: {e}")
        traceback.print_exc()
    # Export summary statistics to CSV
    summary_stats = {
        'Metric': [
            'Total Messages',
            'Avg Messages Per Day',
            'Avg Message Length',
            'Median Message Length',
            'Avg Gap Between Human Messages (min)',
            'Median Gap Between Human Messages (min)',
            'Avg Messages Per Conversation',
            'Median Messages Per Conversation',
            'Most Active Day',
            'Most Active Hour',
            'Total Conversations',
            'Date Range'
        ],
        'Value': [
            total_messages,
            total_messages / len(daily_counts) if daily_counts else 0,
            np.mean(message_lengths) if message_lengths else 0,
            np.median(message_lengths) if message_lengths else 0,
            np.mean(human_assistant_pairs) if human_assistant_pairs else 0,
            np.median(human_assistant_pairs) if human_assistant_pairs else 0,
            np.mean(conversation_lengths) if conversation_lengths else 0,
            np.median(conversation_lengths) if conversation_lengths else 0,
            max(weekday_counts.items(), key=lambda x: x[1])[0] if weekday_counts else 'N/A',
            max(hour_counts.items(), key=lambda x: x[1])[0] if hour_counts else 'N/A',
            len(conversation_lengths),
            f"{min(daily_counts.keys())} to {max(daily_counts.keys())}" if daily_counts else 'N/A'
        ]
    }
    summary_df = pd.DataFrame(summary_stats)
    summary_df.to_csv(f'{output_dir}/conversation_summary_stats.csv', index=False)
    print(f"Saved summary statistics to {output_dir}/conversation_summary_stats.csv")

    # Print the summary for reference
    print("\nConversation Analysis Summary:")
    for metric, value in zip(summary_stats['Metric'], summary_stats['Value']):
        print(f"{metric}: {value}")
def main():
    """Run the conversation analysis from the command line."""
    import argparse

    parser = argparse.ArgumentParser(description='Analyze Claude conversation data')
    parser.add_argument('--file', type=str,
                        default=r"~/Downloads/data-2025-04-11-15-00-47/conversations.json",
                        help='Path to the conversations.json file')
    parser.add_argument('--output', type=str, default='output',
                        help='Output directory for visualization files')
    args = parser.parse_args()

    print("Starting conversation analysis...")
    # Expand '~' so the default path (and any user-supplied '~' path) opens correctly
    results = process_data(os.path.expanduser(args.file))
    if results:
        create_visualizations(results, args.output)
        print("Analysis complete!")
    else:
        print("Analysis failed due to data processing errors.")


if __name__ == "__main__":
    main()
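# Example invocation (the script filename here is illustrative; use whatever
# name this file was saved under):
#   python conversation_analysis.py \
#       --file ~/Downloads/data-2025-04-11-15-00-47/conversations.json \
#       --output output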