
SmallCon: A Virtual Conference for GenAI Builders

December 11, 2024 | 10:00 AM - 2:30 PM PT

About

SmallCon is the first virtual conference dedicated to exploring the potential of Small Language Models (SLMs) in production environments. Industry leaders from prominent tech companies share insights, best practices, and real-world implementation experiences.

Full Conference Summary

Key Themes

Technical Definition of SLMs

  • Models under 3-4B parameters
  • Deployable on laptops/mobile devices
  • Sub-second inference times (0.1s target)
  • Focused on specific, bounded tasks
  • Cost-efficient scaling capabilities

Implementation Patterns

  • LoRA adaptation (60+ adapters per GPU)
  • Hybrid deployment architectures
  • API-first approaches
  • Private VPC + managed scaling
  • Continuous fine-tuning pipelines

Performance Benchmarks

  • 10x cost reduction vs traditional approaches
  • 8% higher F1 scores
  • 80% higher throughput
  • $20 per training cycle
  • 85% organizational adoption rates

Emerging Technologies

  • Solar LLM family (Upstage)
  • Hamba Language Model (1.5B params)
  • Agentforce (Salesforce)
  • Gretel Navigator
  • Guardrails validation framework


Sessions & Materials

Morning Sessions

  1. Keynote on SLM Future
  2. Enterprise Implementation Case Study
  3. Customer Service Analytics
  4. GenAI Future Panel

Afternoon Sessions

  1. Agentforce Platform Deep Dive
  2. Production AI Panel
  3. Synthetic Data Generation
  4. Solar LLMs Implementation
  5. Continuous Fine-tuning
  6. Model Evaluation Best Practices

Key Technical Patterns

Infrastructure Evolution

  1. Initial Phase (2022-2023):

    • API-based access
    • Individual model scaling
    • Basic prompting and RAG
  2. Current Phase (2024):

    • Multi-adapter architectures
    • Hybrid deployment models
    • Continuous fine-tuning
    • Integrated validation

Validation & Quality

  • Human-in-the-loop evaluation
  • Fact-checking modules
  • Automated guardrails
  • Continuous monitoring
  • Performance metrics tracking

Cost Optimization

  • Linear scaling with adapters
  • Shared infrastructure
  • Pay-per-call models
  • Efficient fine-tuning
  • Resource pooling
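
As a rough illustration of why adapter-based serving scales linearly, the sketch below compares per-task dedicated deployments with adapters packed onto shared GPUs; the GPU price and adapter density are placeholder assumptions, not figures quoted at the conference.

# Illustrative cost model: N task-specific models as dedicated deployments
# vs. as LoRA adapters sharing base-model GPUs. Prices are assumptions.
GPU_HOUR_USD = 2.00            # assumed hourly cost of one serving GPU
HOURS_PER_MONTH = 730

def dedicated_cost(num_tasks: int) -> float:
    """One dedicated GPU deployment per fine-tuned model."""
    return num_tasks * GPU_HOUR_USD * HOURS_PER_MONTH

def shared_adapter_cost(num_tasks: int, adapters_per_gpu: int = 60) -> float:
    """Adapters share base-model GPUs; cost tracks GPU count, not task count."""
    gpus_needed = -(-num_tasks // adapters_per_gpu)   # ceiling division
    return gpus_needed * GPU_HOUR_USD * HOURS_PER_MONTH

for n in (5, 30, 60, 120):
    print(f"{n:>3} tasks: dedicated ${dedicated_cost(n):>9,.0f}/mo, "
          f"shared adapters ${shared_adapter_cost(n):>7,.0f}/mo")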

Data Management

  • Synthetic data generation
  • Document processing pipelines
  • Versioned datasets
  • Quality-focused curation
  • Privacy-preserving techniques

Featured Organizations

  • Meta
  • Hugging Face
  • Mistral AI
  • Salesforce
  • Upstage
  • NVIDIA
  • DoorDash
  • Marsh & McLennan
  • Predibase
  • Gretel
  • Guardrails AI

Impact & Metrics

  • Enterprise adoption rates: 85%
  • Request volumes: 25M annually
  • Time saved: 1M+ hours
  • Training costs: ~$20/cycle
  • Inference times: 0.1s achieved

The conference highlighted the industry's rapid move toward practical, efficient AI implementations using small language models, with particular emphasis on reliability, cost-effectiveness, and real-world validation strategies.

💡 Key Technical Implementation Details

AI Stack Evolution:

  1. Initial Stack (AWS-based):

    • SageMaker powering 60+ indicators
    • First LLM implementation in 2019
    • Transition to Longformer in 2021 for extended context
    • Individual auto-scaling infrastructure per model
  2. Current Architecture (Predibase + LoRA):

    • Base Model: Llama 3.1 8B
    • 60+ LoRA adapters on a single GPU (see the multi-adapter sketch below)
    • Hybrid setup with private VPC + managed scaling
    • Sub-second inference times (0.1s achieved)
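
The multi-adapter pattern can be approximated locally with Hugging Face's peft library, as sketched below; the base-model and adapter paths are placeholders, and this is a single-process illustration rather than a reproduction of Predibase's serving stack, which also batches requests across adapters.

# Sketch: several LoRA adapters sharing one base model in a single process.
# Model id and adapter paths are placeholders, not real checkpoints.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE = "meta-llama/Llama-3.1-8B-Instruct"         # placeholder base model
tokenizer = AutoTokenizer.from_pretrained(BASE)
base = AutoModelForCausalLM.from_pretrained(BASE, device_map="auto")

model = PeftModel.from_pretrained(base, "adapters/call-summary", adapter_name="call-summary")
model.load_adapter("adapters/sentiment", adapter_name="sentiment")

def generate(task: str, prompt: str) -> str:
    model.set_adapter(task)                        # swap adapters without reloading the base
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    out = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(out[0], skip_special_tokens=True)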

📈 Performance Metrics

Comparative Analysis:

  • Cost: 10x reduction vs OpenAI
  • Accuracy: 8% higher F1 score
  • Throughput: 80% higher than alternatives
  • Latency: 0.1 second inference time (vs 2s target)
  • Scale: Hundreds of inferences per second

Infrastructure Requirements:

  • Rapid scaling (within 1 minute)
  • On-demand GPU provisioning
  • Support for variable text lengths (2min - 1hr calls)
  • Handling unpredictable traffic patterns

🤖 Technical Improvements

Training Pipeline:

  1. Data Preparation:

    • Versioned datasets
    • Curated training data
    • Smaller but high-quality datasets
  2. Model Training:

    • Configurable parameters (learning rate, target modules)
    • Runs on commodity hardware
    • Hours/days reduced to minutes
    • ~$20 per training cycle
  3. Deployment:

    • Configuration-based deployment
    • Simultaneous version running
    • Easy A/B testing
    • Zero marginal cost per adapter
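
Because a new adapter version is effectively just configuration, A/B testing can reduce to weighted routing between adapter names; the sketch below is a generic illustration with made-up names, not any particular platform's API.

# Weighted A/B router between two adapter versions; records are tagged with
# the adapter used so downstream metrics can be compared per version.
import random

ROUTES = {
    "intent-classifier@v7": 0.9,   # current production adapter (placeholder name)
    "intent-classifier@v8": 0.1,   # candidate trained on newer data
}

def pick_adapter(routes: dict[str, float]) -> str:
    r, acc = random.random(), 0.0
    for name, weight in routes.items():
        acc += weight
        if r < acc:
            return name
    return name                    # fall back to the last route on rounding error

def handle(request_text: str) -> dict:
    adapter = pick_adapter(ROUTES)
    # call_model(adapter, request_text) would hit the shared deployment here
    return {"adapter": adapter, "input": request_text}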

📋 Monitoring & Operations

System Monitoring:

  • Throughput tracking
  • Latency measurements
  • Model drift detection
  • Combined dashboard system (Predibase + Converza)

Cost Analysis:

  • Linear cost scaling with Predibase
  • Exponential cost increase avoided
  • Near-zero marginal cost per adapter
  • Infrastructure costs primarily tied to throughput/latency requirements

The implementation demonstrates successful migration to small language models while achieving better performance metrics and significant cost savings, particularly in scaling scenarios.

🎯 Conference Overview

  • Name: SmallCon
  • Date: December 11, 2024
  • Focus: Small Language Models (SLMs) and Enterprise AI Implementation
  • Format: Virtual conference with mixed session types

👥 Session Details

Fireside Chat with Paul Beswick [~13:20-13:37]

  • Type: Fireside Chat
  • Speaker: Paul Beswick, Global CIO, Marsh McLennan
  • Role: Manages 5000+ technologists globally
  • Session Goal: Share enterprise Gen AI implementation insights and evolution of their approach

💡 Key Technical Insights

Architecture Evolution:

  • Initial Approach (Early 2023):
    • Started with API-based access (April 2023)
    • Secured APIs by June 2023
    • Launched organization-wide LLM assistant in August/September 2023
    • Current scale: ~25 million requests annually
    • 85% organizational adoption rate

Infrastructure Strategy:

  • Rent models by API call instead of self-hosting
  • Uses fine-tuned small models for specific tasks
  • Current volume: ~500,000 requests/week through fine-tuned model
  • Training costs: ~$20 per training cycle
  • Achieving accuracy exceeding GPT-4 with better response times

Technical Evolution:

  1. Initial Phase:

    • Focus on prompting and RAG
    • API-based implementation
    • Minimal infrastructure complexity
  2. Current Phase:

    • Implementation of fine-tuned models
    • Shared infrastructure approach
    • Low-cost training cycles
    • Specialized model targeting

🤖 Technical Implementation Details

Infrastructure Management:

  • Avoided self-hosting large language models
  • Implemented pay-per-call model architecture
  • Security managed through API access controls
  • Conservative estimate: Over 1 million hours saved through implementation

Cost Economics:

  • Training cost: ~$20 per cycle
  • Infrastructure sharing across use cases
  • Focus on ROI for specific task automation
  • Economy of scale through shared resources

📈 Industry Trends

Evolution of Enterprise AI:

  • Movement from general-purpose to task-specific models
  • Shift toward automated fine-tuning processes
  • Focus on fragmenting models for specialized subtasks
  • Trend toward job augmentation over replacement

📋 Follow-up Actions

Technical Focus Areas:

  • Investigation of automated fine-tuning pipelines
  • Research on model specialization approaches
  • Review of infrastructure sharing strategies
  • Analysis of automation vs. augmentation use cases

Future Development (2025):

  1. Continued office suite integration
  2. Enhanced AI-powered helper applications
  3. Direct efficiency improvements through automation
  4. Increased focus on specialized, task-specific models
  5. Implementation of staged approach: LLM prompting → data collection → fine-tuning (see the sketch below)
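
A minimal sketch of that staged approach: serve with a general-purpose model first, log every prompt/response pair, and only fine-tune once enough real examples have accumulated. File names and the example threshold are illustrative assumptions.

# Stage 1: answer with a general-purpose LLM.  Stage 2: log each exchange.
# Stage 3: export the accumulated log as a fine-tuning dataset.
import json
from pathlib import Path

LOG_PATH = Path("prompt_log.jsonl")      # illustrative location

def log_example(prompt: str, response: str) -> None:
    with LOG_PATH.open("a") as f:
        f.write(json.dumps({"prompt": prompt, "completion": response}) + "\n")

def export_training_set(min_examples: int = 1000) -> list[dict]:
    rows = [json.loads(line) for line in LOG_PATH.read_text().splitlines()]
    if len(rows) < min_examples:
        raise RuntimeError(f"only {len(rows)} examples collected; keep prompting")
    return rows                          # hand off to whatever fine-tuning tool is in use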

The session provided valuable insights into enterprise-scale AI implementation, particularly highlighting the evolution from initial skepticism about fine-tuning to successful large-scale deployment through innovative infrastructure approaches and careful economic consideration.

🎯 Session Details

Converza Case Study [~13:37-13:45]

  • Type: Technical Presentation
  • Speakers:
    • Mo (CTO, Converza)
    • Giuseppe (VP of AI, Converza)
  • Session Goal: Share insights on implementing AI for call analytics at scale

🤖 Technical Implementation

System Evolution:

  1. 2001: Initial Analog System

    • Hardware-based recording via PBX
    • Manual human analysis
    • Basic coaching data generation
  2. 2014: Digital AI Transformation

    • Transition to AI-driven analysis
    • Automated call monitoring
    • Expanded data point collection
  3. 2024: Platform Enhancement

    • Integration with Predibase
    • Scaled to analyze millions of calls monthly
    • Customizable data point tracking

💡 Key Technical Metrics

Scale of Operations:

  • Total calls analyzed: Over 1 billion
  • Current volume: Millions of calls/month
  • Implementation results: 78% conversion increase in 90 days (Wheeler/Caterpillar case study)

Data Analysis Capabilities:

  • Agent Performance Metrics:

    • Proper greeting detection
    • Business offering tracking
    • Appointment scheduling
    • Customer service quality
  • Client Side Analysis:

    • Buying signal detection
    • Lead quality scoring
    • Prospect qualification
    • Customer sentiment analysis

📈 Technical Architecture

Call Processing Pipeline:

  1. Call Recording
  2. AI Analysis
  3. Data Point Extraction
  4. Insight Generation
  5. Action Recommendation

Key Features:

  • Custom data point configuration
  • Real-time analysis capabilities
  • Integrated coaching systems
  • Revenue impact tracking
  • Automated quality scoring

This session demonstrated a practical implementation of AI technologies for large-scale audio analysis and business intelligence, showing how the architecture evolved from manual processes to sophisticated AI-driven analysis, with particular emphasis on the role of PredaBase in enabling scalable, real-time processing capabilities.

🎯 Conference Overview

  • Name: SmallCon
  • Date: December 11, 2024
  • Focus: Applied AI and LLMs in Production
  • Format: Virtual

👥 Session Details

  • Time: 15:19
  • Type: Panel
  • Speaker(s):
    • Travis Addair, CTO and Co-founder, Predibase
    • Daniel Han, Head of AI Engineering, Harvey
    • Moasati, CTO, Converza
    • Atindriyo Sanyal, CTO and Co-founder, Galileo
    • Abhishek, Senior Staff Engineer, Nubank
  • Session Goal: Discuss the shift from LLM experimentation to production-grade systems and real-world applications.

💡 Key Technical Insights

Production Challenges:

  • Scaling costs with large prompts
  • Limitations of fine-tuning large models
  • Need to modularize complex workflows
  • Gap between controlled and real-world environments

Evaluation and Quality:

  • Traditional NLP metrics insufficient for long-form text
  • Limitations of automated LLM Judge techniques at scale
  • Critical role of human-in-the-loop evaluation
  • Importance of continuous feedback loops
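
One common way to reconcile automated LLM-judge scoring with human-in-the-loop review, sketched below, is to score everything automatically but route a random sample plus all low-scoring outputs to humans; the judge function here is a stand-in, not a metric any panelist described.

# Triage outputs: everything gets an automated score; a sample plus all
# low-confidence cases go to human reviewers. judge_score() is a placeholder.
import random

def judge_score(output: str) -> float:
    return min(1.0, len(output) / 200)   # toy heuristic standing in for an LLM judge

def triage(outputs: list[str], human_rate: float = 0.05, threshold: float = 0.6):
    for out in outputs:
        score = judge_score(out)
        to_human = score < threshold or random.random() < human_rate
        yield {"output": out, "auto_score": score, "route": "human" if to_human else "auto"}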

Implementation Strategy:

  • Gradual release process emphasis
  • Focus on smaller, task-specific models
  • Cost and throughput optimization
  • Modularization of complex workflows

Risk Mitigation:

  • Building user confidence in high-stakes industries
  • Addressing unpredictable hallucinations
  • Bias mitigation through human evaluation
  • Quality assurance through feedback loops

The panel highlighted the growing sophistication in LLM deployment practices, with particular emphasis on the role of human feedback in ensuring quality and reliability in production systems. The discussion underscored a clear trend toward smaller, specialized models with robust evaluation frameworks.

🎯 Conference Overview

  • Name: SmallCon
  • Date: December 11, 2024
  • Focus: Small Language Models (SLM)
  • Format: Virtual

👥 Session Details

  • Time: 14:01
  • Type: Panel
  • Speaker(s):
    • Dev Rishi, CEO and Co-founder, Predibase
    • Margaret, Head of Product, Mistral AI
    • Pablo, Distinguished Scientist and Research Manager, NVIDIA
    • Luna, Lead of the Small Language Model team, Hugging Face
    • Diego, Head of Generative AI Partnerships, Meta
  • Session Goal: Discuss the future of generative AI, focusing on the training and serving of small language models.

💡 Key Technical Insights

Definition and Characteristics:

  • Small language models can run on laptops and mobile phones with low latency
  • Typically less than 3-4 billion parameters
  • Optimized through quantization and compression techniques
  • Best suited for tasks not requiring extensive world knowledge:
    • Rephrasing
    • Summarization
    • Dialogue generation
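
Quantization is the usual lever for fitting such models on laptops; the sketch below loads a small instruction-tuned model in 4-bit via the transformers/bitsandbytes integration. The model id is only an example of a sub-2B checkpoint, and bitsandbytes assumes a CUDA GPU.

# Load a small model with 4-bit weights to cut memory roughly 4x vs fp16.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct"   # example small checkpoint
quant = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quant, device_map="auto")

prompt = "Rephrase: the meeting has been moved to Thursday."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=40)[0], skip_special_tokens=True))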

Implementation Strategies:

  • Hybrid approaches combining small and large models
  • Small models for simpler tasks
  • Large models for complex queries
  • Fine-tuning on synthetic data from larger models
  • Focus on agentic workflows for task automation
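
In practice the hybrid pattern is a router: bounded tasks stay on the small model and everything else escalates to a larger one. The sketch below is illustrative; the two model functions are placeholders for real clients.

# Route bounded tasks to a small local model, escalate the rest.
BOUNDED_TASKS = {"rephrase", "summarize", "dialogue"}

def small_model(prompt: str) -> str:
    return "[small-model reply]"          # placeholder for an on-device SLM call

def large_model(prompt: str) -> str:
    return "[large-model reply]"          # placeholder for a hosted frontier-model call

def answer(task: str, prompt: str, max_small_chars: int = 2000) -> str:
    if task in BOUNDED_TASKS and len(prompt) < max_small_chars:
        return small_model(prompt)        # cheap, low-latency path
    return large_model(prompt)            # complex or open-ended queries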

🤖 Technical Announcements

Hamba Language Model:

  • Specifications:
    • 1.5 billion parameters
    • MMLU score: 50
  • Use Cases:
    • On-device deployment
    • Rephrasing
    • Summarization
    • Dialogue generation

📈 Industry Trends

Technology Shifts:

  • Movement toward efficient, device-deployable models
  • Growing focus on agentic workflows and automation
  • Heavy investment in open-source model development
  • Expected acceleration of adoption across industries

Future Outlook (2025):

  • Significant advancements in generative AI
  • More sophisticated agentic workflows
  • Better reasoning engines
  • Deeper understanding of workflow construction

The panel established foundational definitions for small language models while highlighting the industry's shift toward more efficient, task-specific implementations. The discussion emphasized the complementary role of small and large models in creating effective AI systems.

🎯 Conference Overview

  • Name: SmallCon
  • Date: December 11, 2024
  • Focus: Synthetic data for small language models
  • Format: Virtual

👥 Session Details

  • Time: 16:05
  • Type: Technical Presentation
  • Speaker: Maarten Van Segbroeck, Head of Applied Science, Gretel
  • Session Goal: Introduce the Gretel platform and demonstrate how to generate high-quality synthetic data for training or fine-tuning small language models.

💡 Key Technical Insights

Platform Architecture:

  • Transformer-based architecture
  • Built-in differential privacy techniques
  • Multiple agent system with custom elements
  • Comprehensive evaluation reporting

Operational Modes:

  1. Data Design Mode:

    • Design datasets from scratch
    • Configure statistical properties
    • Define data characteristics
  2. Fine Tune Mode:

    • Train on existing datasets
    • Generate secure synthetic variants
    • Maintain statistical properties
    • Ensure privacy compliance

🤖 Technical Implementation

Gretel Navigator Platform:

  • Core Features:
    • Automated data generation
    • Statistical property preservation
    • Privacy-preserving techniques
    • Quality validation tools

Deployment Options:

  • Platform access
  • YAML configuration
  • SDK integration
  • Comprehensive documentation

📈 Key Considerations

Data Quality:

  • Statistical fidelity to source data
  • Validation metrics
  • Quality assessments
  • Dataset statistics

Privacy and Security:

  • Differential privacy integration
  • Compliance mechanisms
  • Cybersecurity protections
  • Privacy-preserving features

📋 Use Cases

Primary Applications:

  • Training data generation for SLMs
  • Sensitive data synthesis
  • Dataset augmentation
  • Privacy-compliant testing

Industry Impact:

  • Reduced compliance costs
  • Enhanced data security
  • Improved model training
  • Efficient data processing

The session demonstrated how synthetic data generation can address both data quality and privacy concerns in SLM training, while providing practical tools for implementation through the Gretel Navigator platform.

🎯 Conference Overview

  • Name: SmallCon
  • Date: December 11, 2024
  • Focus: Mitigating Volatility in Generative AI
  • Format: Virtual

👥 Session Details

  • Time: 17:13
  • Type: Technical Talk
  • Speaker: Shreya Rajpal, CEO and Co-founder, Guardrails AI
  • Session Goal: Discuss the challenges of volatility in generative AI and how to mitigate them using technical tools.

💡 Key Technical Insights

Volatility Sources:

  • Development stage issues
  • Deployment challenges
  • Runtime inconsistencies
  • Insufficient model capabilities
  • Improper context handling
  • Hallucinations
  • Edge case behaviors
  • Model jitter

Validation Framework:

  • Explicit validation at every step
  • Verification of system behavior
  • Multiple validator types:
    • Rules-based
    • Heuristic approaches
    • Fine-tuned ML models
    • Secondary LLM calls
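
The "explicit validation at every step" idea maps onto a chain of independent validators applied to each output; the sketch below is a generic illustration of that pattern with toy rules, not the Guardrails AI API.

# Generic validation chain: rules-based and heuristic validators run over each
# LLM output; model-based or secondary-LLM validators would slot in the same way.
from dataclasses import dataclass
from typing import Callable

@dataclass
class Verdict:
    ok: bool
    reason: str = ""

def no_obvious_pii(text: str) -> Verdict:            # rules-based (toy)
    return Verdict("ssn:" not in text.lower(), "possible SSN mention")

def within_length_budget(text: str) -> Verdict:      # heuristic (toy)
    return Verdict(len(text) < 4000, "output exceeds length budget")

VALIDATORS: list[Callable[[str], Verdict]] = [no_obvious_pii, within_length_budget]

def validate(output: str) -> list[str]:
    """Return reasons for every failed validator; an empty list means it passed."""
    return [v(output).reason for v in VALIDATORS if not v(output).ok]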

🤖 Technical Implementation

Guardrails AI Platform:

  • Open-source validator library
  • Risk category coverage
  • Volatility mitigation tools
  • Comprehensive validation suite

Use Cases:

  • Input prompt validation
  • LLM output verification
  • Sensitive content detection
  • Factuality enforcement
  • Application constraint management
  • Edge case monitoring
  • Out-of-distribution detection

📈 Industry Impact

Technology Evolution:

  • Increased focus on reliability
  • Enhanced risk management
  • Validation-centric development
  • Lifecycle-wide verification

Market Benefits:

  • Improved application reliability
  • Increased enterprise adoption
  • Reduced system failure risk
  • Enhanced trustworthiness

📋 Implementation Guide

Getting Started:

  • Explore Guardrails AI open-source project
  • Implement appropriate validators
  • Select validation strategies based on volatility sources
  • Integrate with existing applications

Resources:

  • GitHub repository
  • Documentation
  • Validator examples
  • Integration guides

This session highlighted the growing importance of systematic validation in generative AI applications, providing practical tools and strategies for improving model reliability and reducing operational risks.

🎯 Conference Overview

  • Name: SmallCon
  • Date: December 11, 2024
  • Focus: AI Agents, LLMs, Fine-tuning, and Agent Development Lifecycle
  • Format: Virtual

👥 Session Details

  • Time: 14:50
  • Type: Technical Talk
  • Speaker: Manji, Leader of AI Platforms for the Agentforce team at Salesforce (Einstein)
  • Session Goal: Share lessons learned from building LLM-based AI agents and workflows at scale

🤖 Platform Overview

Salesforce Agentforce:

  • Low-code/no-code platform for AI agents
  • Studio and builder interface
  • Workflow integration capabilities
  • Focus on trust and responsible AI
  • Comprehensive testing and monitoring tools

Use Cases:

  • Service automation
  • Sales assistance
  • Marketing content generation
  • Customer interaction management

💡 Key Technical Insights

Agent Evolution:

  • Progression from rule-based chatbots to reasoning agents
  • Integration beyond conversational interfaces
  • Enterprise knowledge grounding
  • Action-taking capabilities

Implementation Best Practices:

  • Define evaluation metrics (evals) upfront
  • Conduct thorough batch testing
  • Assess knowledge quality
  • Optimize retrieval/generation pipelines
  • Fine-tune for specific tasks

Key Challenges:

  • Hallucination mitigation
  • Cost/performance optimization
  • Complex system evaluation
  • Knowledge silos
  • Trust building

📈 Technical Architecture

Platform Components:

  • Agent builder
  • Testing suite
  • Deployment tools
  • Monitoring systems

Performance Metrics:

  • Accuracy
  • Precision
  • Latency
  • Token count efficiency

💬 Notable Insights

Key Quotes:

"Don't just think about the conversational interface because these agents are going to be embedded into any workflow you can think of."

"Building the flashy demo is only 10% of the work, and the rest 90% of the hard work is actually building the trust that this solution works."

"Teams that are doing very fast iterations... are moving fast and that's why I think this ability to tune things, make changes and test is super super important."

📋 Industry Impact

Technology Shifts:

  • Movement toward reasoning-capable agents
  • Emphasis on fine-tuned applications
  • Focus on comprehensive toolchains
  • Integration across workflows

The session emphasized the importance of systematic development approaches and trust-building in AI agent deployment, highlighting the evolution from simple chatbots to sophisticated workflow automation tools.

🎯 Conference Overview

  • Name: SmallCon
  • Date: December 11, 2024
  • Focus: Strategies for updating machine learning models in production
  • Format: Virtual

👥 Session Details

  • Time: 16:53
  • Type: Technical Presentation
  • Speaker: Arnav Garg, ML Engineering Lead, Predibase
  • Session Goal: Discuss strategies for updating machine learning models in production using data collected from production.

💡 Key Technical Insights

Training Strategies:

  • Continuous model quality improvement for production LLMs
  • Incremental fine-tuning for cost-effective updates
  • Rehearsal learning for performance enhancement
  • Hybrid approach combining:
    • Incremental updates
    • Periodic full retraining
    • Performance/cost balance
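
The hybrid strategy boils down to a scheduling decision: continue incrementally from the last adapter version by default, and fall back to a full retrain on a fixed cadence or when drift appears. The sketch below is a generic illustration with assumed thresholds, not the Predibase SDK (where continue_from_version plays the incremental role).

# Choose between a cheap incremental update and an occasional full retrain.
def plan_update(versions_since_full: int, drift_score: float,
                full_every: int = 10, drift_limit: float = 0.2) -> str:
    if drift_score > drift_limit or versions_since_full >= full_every:
        return "full_retrain"      # reset on drift or every Nth cycle
    return "incremental"           # continue from the previous adapter version

# Example: four incremental updates so far and mild drift -> stay incremental.
print(plan_update(versions_since_full=4, drift_score=0.05))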

🤖 Technical Implementation

Predibase Platform:

  • SDK and UI components
  • 100+ base models for LoRA fine-tuning
  • Incremental training via continue_from_version
  • Configurable retraining interface

Deployment Options:

  • SDK integration
  • UI-based configuration
  • LoRA parameter customization
  • Learning configuration flexibility

📈 Performance Benefits

Efficiency Gains:

  • Improved precision and accuracy
  • Reduced training costs
  • Faster update cycles
  • Better data utilization

Production Advantages:

  • Continuous model improvement
  • Cost-effective updates
  • Rapid knowledge incorporation
  • User feedback integration

📋 Best Practices

Implementation Strategy:

  1. Start with Predibase platform exploration
  2. Experiment with incremental training
  3. Implement rehearsal learning
  4. Develop hybrid training approach
  5. Monitor performance metrics
  6. Optimize cost efficiency

Resources:

  • Predibase SDK documentation
  • Platform guidelines
  • Integration examples
  • Training configurations

The session highlighted how continuous model updates can be practically implemented in production environments, with particular emphasis on balancing performance improvements with operational costs through incremental training approaches.

🎯 Conference Overview

  • Name: SmallCon
  • Date: December 11, 2024
  • Focus: Small Language Models (SLM)
  • Format: Virtual

👥 Session Details

  • Time: 14:01
  • Type: Panel
  • Speaker(s):
    • Dev Rishi, CEO and Co-founder, Predibase
    • Margaret, Head of Product, Mistral AI
    • Pablo, Distinguished Scientist and Research Manager, NVIDIA
    • Luna, Lead of the Small Language Model team, Hugging Face
    • Diego, Head of Generative AI Partnerships, Meta
  • Session Goal: Discuss the future of generative AI, focusing on the training and serving of small language models

💡 Key Technical Insights

Defining SLMs:

  • Models running on laptops/mobile devices with low latency
  • Typically less than 3-4 billion parameters
  • Optimized through quantization and compression
  • Suited for specific tasks not requiring extensive world knowledge:
    • Rephrasing
    • Summarization
    • Dialogue generation

Implementation Strategies:

  • Hybrid deployment combining small and large models
  • Task-based model selection
  • Fine-tuning with synthetic data from larger models
  • Focus on agentic workflows
  • Development of reasoning engines

🤖 Featured Technology

Hamba Language Model:

  • 1.5 billion parameters
  • MMLU score: 50
  • Designed for:
    • On-device deployment
    • Rephrasing
    • Summarization
    • Dialogue generation

📈 Future Outlook

2025 Predictions:

  • Advanced generative AI capabilities
  • Sophisticated agentic workflows
  • Improved reasoning engines
  • Enhanced deployment strategies

Industry Direction:

  • Investment in open-source development
  • Focus on device-deployable models
  • Growing agentic workflow adoption
  • Widespread industry implementation

The panel established core definitions and characteristics of SLMs while highlighting their role in the future of AI deployment. Key emphasis was placed on the practical advantages of small models and their complementary relationship with larger systems.

SmallCon 2024: A Virtual Conference for GenAI Builders

December 11, 2024 | 10:00 AM - 2:30 PM PT


About SmallCon

A first-of-its-kind virtual conference focused on small language models (SLMs) and their practical implementation in enterprise environments. The event brought together industry leaders to share insights on deploying, scaling, and optimizing SLMs for production use cases.

Key Metrics & Achievements

  • Model Size Target: Under 3-4 billion parameters
  • Response Time: Sub-second inference (0.1s)
  • Cost Efficiency: 10x reduction vs traditional approaches
  • Performance: 8% higher F1 scores, 80% higher throughput
  • Enterprise Success: 85% adoption rate, 1M+ hours saved

Featured Technologies

Solar LLM Family (Upstage)

Hamba Language Model

Agentforce (Salesforce)

Core Sessions

Morning Sessions

  1. Opening Keynote [10:00-10:15 AM PT]

    • Speaker: Devvret Rishi (Predibase)
    • Focus: The future of small language models
  2. Enterprise Implementation [10:15-10:35 AM PT]

    • Speaker: Paul Beswick (Marsh & McLennan)
    • Achievement: 25M annual requests, 85% adoption

Technical Sessions

  1. Call Analytics at Scale (Converza)
  2. Future of GenAI Panel
  3. Agentforce Platform (Salesforce)
  4. Production AI Panel

Lightning Demos

  1. Synthetic Data (Gretel)
  2. Solar LLMs (Upstage)
  3. Continuous Fine-Tuning (Predibase)
  4. Model Evaluation (Guardrails AI)

Key Technical Themes

  • Practical SLM implementation
  • Fine-tuning and adaptation
  • Synthetic data generation
  • Model evaluation frameworks
  • Continuous deployment
  • Human feedback integration

Major Trends

  • Shift to production-ready systems
  • Focus on agentic workflows
  • Emphasis on synthetic data
  • Importance of evaluation
  • Cost optimization strategies

Technical Priorities

  • 60+ adapter architectures
  • Synthetic data generation
  • Continuous fine-tuning
  • Evaluation frameworks
  • Human feedback systems

Participating Organizations

  • Meta
  • Hugging Face
  • Mistral AI
  • Salesforce
  • NVIDIA
  • DoorDash
  • Marsh & McLennan
  • Predibase
  • Gretel
  • Guardrails AI

For detailed performance metrics and implementation details, see Technical Metrics.

#!/bin/bash
# Live transcription of the SmallCon stream: record short audio segments with
# ffmpeg (macOS avfoundation input) and transcribe them with whisper.cpp.

# Enable error handling
set -euo pipefail

# Configuration
TRANSCRIPT_DIR="smallcon_transcripts"
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
COMBINED_LOG="${TRANSCRIPT_DIR}/smallcon_${TIMESTAMP}_full.txt"
SEGMENT_LENGTH=30  # Length of each audio segment in seconds
CURRENT_SESSION=""

# ANSI colors
CYAN='\033[1;36m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
GRAY='\033[0;37m'
NC='\033[0m'

# Create transcript directory
mkdir -p "$TRANSCRIPT_DIR"

# Clear screen and move cursor to top
clear

# Initialize combined log with session info
{
  printf "${CYAN}%s\n" "==================================================="
  printf "${CYAN}SmallCon Conference Transcription - Started at $(date)\n"
  printf "${CYAN}==================================================${NC}\n\n"
} | tee "$COMBINED_LOG"

# Cleanup function
cleanup() {
  echo
  printf "${CYAN}Cleaning up and saving final transcript...\n"
  printf "${CYAN}==================================================="
  echo "Session ended at $(date)" | tee -a "$COMBINED_LOG"
  printf "${CYAN}===================================================${NC}\n"
  rm -f temp_stream.wav
  printf "${CYAN}Transcript saved to: $COMBINED_LOG${NC}\n"
  exit 0
}

# Set up cleanup on script exit
trap cleanup EXIT INT TERM

# Function to format timestamp
format_timestamp() {
  printf "${GRAY}[%s]${NC}" "$1"
}

# Function to detect and format session changes
detect_session() {
  local text="$1"
  if [[ $text =~ "Session" ]] || [[ $text =~ "Panel" ]] || [[ $text =~ "Keynote" ]]; then
    printf "\n${BLUE}>>> New Session Detected: ${text}${NC}\n\n" | tee -a "$COMBINED_LOG"
  fi
}

# Main transcription loop
while true; do
  printf "${CYAN}\nRecording ${SEGMENT_LENGTH}s segment...${NC}\n"

  # Record audio segment with preprocessing (band-pass filter, 16 kHz mono PCM)
  ffmpeg -v quiet -f avfoundation -i ":0" -t "$SEGMENT_LENGTH" \
    -af "highpass=f=50,lowpass=f=3000" \
    -ar 16000 -ac 1 -c:a pcm_s16le temp_stream.wav

  # Add timestamp to display and log
  CURRENT_TIME=$(date '+%H:%M:%S')
  echo -e "${YELLOW}--- ${CURRENT_TIME} ---${NC}" | tee -a "$COMBINED_LOG"

  # Transcribe with whisper.cpp (expects ./main and models/ggml-base.bin)
  ./main -m models/ggml-base.bin -f temp_stream.wav -np -otxt 2>/dev/null | \
    while IFS= read -r line; do
      if [ ! -z "$line" ]; then
        # Detect session changes
        detect_session "$line"
        # Format and output the line
        if [[ $line =~ ^\[.*\] ]]; then
          # This is a timestamp line
          printf " ${GRAY}%s${NC}\n" "$line" | tee -a "$COMBINED_LOG"
        else
          # This is transcript content
          printf " %s\n" "$line" | tee -a "$COMBINED_LOG"
        fi
      fi
    done

  echo >> "$COMBINED_LOG"
  rm -f temp_stream.wav
done

#!/bin/bash
# Summarize each saved SmallCon transcript into a one-sentence summary using a
# local Llama 3.2 model served by Ollama.

# Enable strict error handling
set -euo pipefail

# Configuration
CONF_DIR="smallcon_transcripts"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
OUTPUT_FILE="smallcon_summaries_${TIMESTAMP}.json"
WIP_FILE="temp_summary.json"

# JSON schema for structured output (requires an Ollama build with structured outputs)
JSON_SCHEMA='{"type":"object","properties":{"summary":{"type":"string"}},"required":["summary"]}'

# Initialize output file
{
  echo "# SmallCon Session Summaries"
  echo "# Generated: $(date)"
  echo "# ----------------------------------------"
} | tee "$OUTPUT_FILE"

# Process each transcript
for transcript in "$CONF_DIR"/smallcon_*_full.txt; do
  session_time=$(basename "$transcript" | cut -d'_' -f2)
  prompt="Given this transcript from a SmallCon session about Small Language Models (SLMs),
provide a single-sentence summary capturing the core message:
$(cat "$transcript")"

  {
    # Generate and parse summary
    ollama run llama3.2 --format "$JSON_SCHEMA" "$prompt" | tee "$WIP_FILE"
    echo "Session Time: $session_time"
    echo "Summary: $(jq -r .summary < "$WIP_FILE")"
    echo "----------------------------------------"
  } | tee -a "$OUTPUT_FILE"
done

# Cleanup
rm -f "$WIP_FILE"
echo "Summaries saved to: $OUTPUT_FILE"

🎯 Conference Overview

  • Name: SmallCon
  • Date: December 11, 2024
  • Focus: Upstage's Solar LLMs and Document AI for workspace intelligence
  • Format: Virtual

👥 Session Details

  • Time: 16:28
  • Type: Technical Presentation and Demo
  • Speakers:
    • Lucy Park, Upstage
    • Siddharth Ghatti, Marsh McLennan
  • Session Goal: Showcase Upstage's Solar LLMs and Document Parse technology in enterprise solutions

💡 Key Technical Insights

Solar LLM Family:

  • Designed for workspace tasks and human-AI collaboration
  • Optimized for finance, legal, and healthcare domains
  • Two variants:
    • Solar Mini: Optimized for fine-tuning
    • Solar Pro: Single-GPU deployment focus

Document Parse Technology:

  • Complex document conversion to structured formats
  • Advanced table structure recognition
  • Fast processing for lengthy documents
  • Fact-checking module for hallucination reduction
  • HTML/markdown output for LLM processing

🤖 Implementation Details

System Components:

  • Solar LLM models
  • Document Parse processor
  • Fact-checking module
  • Tool routing system

Use Cases:

  • Workspace intelligence
  • Insurance claims processing
  • Enterprise search
  • Document analysis

📈 Real-World Deployment

Marsh McLennan Implementation:

  • LenAI personal assistant
  • Tool routing system
  • 500,000 requests/week
  • High accuracy in tool selection
  • Integration with Predibase serverless

Performance Focus:

  • Processing speed optimization
  • Accuracy prioritization
  • Hallucination reduction
  • Document structure preservation

📋 Industry Impact

Technology Adoption:

  • Workspace AI integration
  • Task-specific fine-tuning
  • Enhanced document processing
  • Automated workflows

Business Benefits:

  • Increased productivity
  • Task automation
  • Improved decision-making
  • Enhanced document handling

The session demonstrated how combining Small Language Models with specialized document processing technology can create effective enterprise solutions, particularly highlighting the successful implementation at Marsh McLennan through the integration of Solar LLMs with Predibase's infrastructure.


jwalsh commented Dec 12, 2024

Analysis of Compute vs. Disk Tradeoffs

In this number transformation problem, we face a classic tradeoff between computation time and disk space (or memory) usage. Here's a breakdown of the different approaches and their tradeoffs:

1. Brute Force (Pure Computation)

  • Approach: Calculate all transformations from scratch every time, without any memoization.
  • Compute: Extremely high. The number of stones can grow exponentially with each level, leading to a massive computational load for higher levels.
  • Disk/Memory: Minimal. No storage needed for memoization.
  • Suitable for: Very small inputs and low transformation levels. Quickly becomes infeasible for larger problems.

2. Full Memoization (In-memory)

  • Approach: Memoize all intermediate results in a dictionary held in memory.
  • Compute: Significantly lower than brute force. Memoization avoids redundant calculations, dramatically speeding up the process.
  • Disk/Memory: Very high. The memoization dictionary can grow rapidly, requiring a large amount of memory, especially for higher levels.
  • Suitable for: Moderate inputs and levels. Can become memory-bound for very large problems.

3. Disk-Based Memoization

  • Approach: Memoize intermediate results and store them in files on disk.
  • Compute: Lower than brute force, but typically higher than full in-memory memoization due to disk access overhead.
  • Disk/Memory: High, but more manageable than full in-memory memoization. Disk space is generally more abundant than memory.
  • Suitable for: Large inputs and high levels where memory is a constraint. Disk access can become a bottleneck.

4. Hybrid Approach (Incremental Memoization)

  • Approach: Combine in-memory and disk-based memoization. Calculate and memoize in increments, saving the memo to disk periodically.
  • Compute: Balances compute and disk access. Offers a good compromise between the two.
  • Disk/Memory: Moderate. Uses both memory and disk space, allowing for better management of resources.
  • Suitable for: A wide range of inputs and levels. Offers flexibility and scalability.

Full Options

  1. Brute Force:

    • No memoization, all calculations done from scratch.
    • Simple to implement but extremely inefficient for larger problems.
  2. Full In-memory Memoization:

    • Store all memoization data in a dictionary in memory.
    • Significant speedup, but can become memory-bound.
  3. Disk-based Memoization:

    • Store all memoization data in files on disk.
    • Manages memory better but introduces disk access overhead.
  4. Incremental Memoization:

    • Calculate and memoize in increments (e.g., 5 levels at a time).
    • Save the memo to disk periodically.
    • Balances compute and disk access.
  5. Variations of Incremental Memoization:

    • Adjust the increment size (e.g., 10 levels, 20 levels) based on available memory.
    • Use different data structures for in-memory memoization (e.g., a more memory-efficient dictionary implementation).
    • Optimize disk access patterns to reduce overhead.
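
A minimal sketch of option 4 (incremental memoization with periodic persistence), assuming the memo is keyed by (value, remaining levels); the transformation rule itself is problem-specific and left as a stub, and the file path and increment size are illustrative.

# Incremental memoization: the memo lives in memory, is reused across
# increments, and is flushed to disk every few levels so a run can resume.
import json
from pathlib import Path

MEMO_PATH = Path("memo.json")              # illustrative persistence location
memo: dict[tuple[int, int], int] = {}      # (value, levels_remaining) -> item count

def transform(value: int) -> list[int]:
    raise NotImplementedError("problem-specific rewrite rule goes here")

def count(value: int, levels: int) -> int:
    if levels == 0:
        return 1
    key = (value, levels)
    if key not in memo:
        memo[key] = sum(count(v, levels - 1) for v in transform(value))
    return memo[key]

def run(values: list[int], total_levels: int, step: int = 5) -> int:
    if MEMO_PATH.exists():                 # resume from an earlier run
        memo.update({tuple(map(int, k.split(","))): v
                     for k, v in json.loads(MEMO_PATH.read_text()).items()})
    result = len(values)                   # level 0: each starting value counts once
    done = 0
    while done < total_levels:
        done = min(done + step, total_levels)
        result = sum(count(v, done) for v in values)
        MEMO_PATH.write_text(json.dumps({f"{k[0]},{k[1]}": v for k, v in memo.items()}))
    return result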

Choosing the Right Approach

The best approach depends on the specific constraints of your problem:

  • Input Size: Larger inputs generally favor memoization.
  • Transformation Levels: Higher levels require more memoization and potentially disk-based storage.
  • Memory Availability: If memory is limited, disk-based or incremental memoization is preferred.
  • Compute Resources: If compute time is critical, full in-memory memoization might be the fastest option.

By carefully considering these factors, you can choose the most appropriate approach that balances compute time, memory usage, and disk space to efficiently solve the number transformation problem.
