SamSaffron · May 19, 2025 22:04
diff --git a/watch-word-matcher.rb b/watch-word-matcher.rb
 # Ruby benchmark script: Regex vs Aho-Corasick for word matching (v2)

 require "benchmark/ips"
 require "set"

 puts "Script starting: Benchmark for Regex vs Aho-Corasick word matching (v2)."
 puts "---"

 # --- Configuration ---
 DICTIONARY_SIZE = 1000
 TEXT_WORDS = 10_000
 RANDOM_WORD_MIN_LENGTH = 4
 RANDOM_WORD_MAX_LENGTH = 12
 # More diverse alphabet for Unicode testing
 ALPHABET =
  ("a".."z").to_a + ("A".."Z").to_a + ("0".."9").to_a +
    %w[ä ö ü ß á é í ó ú ñ ç ã õ ê ô こんにちは 世界]
 STATIC_WORDS = %w[benchmark ruby script unicode optimization performance 안녕하세요] # Added some static words

 # --- Helper: Random Word and Text Generation ---
 def generate_random_word(min_len, max_len)
  len = rand(min_len..max_len)
  Array.new(len) { ALPHABET.sample }.join
 end

 def generate_random_text(
  num_words,
  min_word_len,
  max_word_len,
  static_words_to_include
 )
  words =
    Array.new(num_words) { generate_random_word(min_word_len, max_word_len) }

  # Sprinkle in static words to ensure they appear in the text
  static_words_to_include.each_with_index do |static_word, i|
    # Insert at various positions, not just the beginning
    insert_at = (words.length / (static_words_to_include.length + 1)) * (i + 1)
    words.insert(insert_at, static_word)
  end
  words.shuffle! # Shuffle after insertion to make positions less predictable

  separators = [" ", ". ", ", ", "; ", "\n", " - ", "... ", "! ", "? "]
  text = ""
  words.each_with_index do |word, i|
    text << word
    text << separators.sample if i < words.length - 1
  end
  text
 end

 puts "Generating a dictionary of #{DICTIONARY_SIZE} random words, including static words..."
 DICTIONARY = STATIC_WORDS.dup # Start with static words
 while DICTIONARY.size < DICTIONARY_SIZE
  DICTIONARY << generate_random_word(
    RANDOM_WORD_MIN_LENGTH,
    RANDOM_WORD_MAX_LENGTH
  )
  DICTIONARY.uniq! # Ensure uniqueness
 end
 # If DICTIONARY_SIZE is small, it might be filled by static words + a few random ones.
 # If DICTIONARY_SIZE is larger, it will contain static words and many random ones.
 DICTIONARY.uniq! # Final ensure uniqueness
 puts "Actual dictionary size: #{DICTIONARY.size}"

 puts "Generating a text body of ~#{TEXT_WORDS} words, ensuring static words are included..."
 TEXT_BODY =
  generate_random_text(
    TEXT_WORDS,
    RANDOM_WORD_MIN_LENGTH,
    RANDOM_WORD_MAX_LENGTH,
    STATIC_WORDS
  )

 puts "Dictionary sample (first 10): #{DICTIONARY.take(10).inspect}"
 puts "Text body sample (first 150 chars): #{TEXT_BODY[0..149].inspect}..."
 puts "---"

 # --- Regex Matching Implementation ---
 def build_combined_regex(words)
  return nil if words.empty?
  regex_clauses =
    words.map do |word|
      escaped_word = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1')
      escaped_word_with_wildcards = escaped_word.gsub("\\*", '\S*') # Assuming '*' is a wildcard
      "(?:[^[:word:]]|^)(#{escaped_word_with_wildcards})(?=[^[:word:]]|$)"
    end
  combined_regex_str = regex_clauses.join("|")
  Regexp.new(combined_regex_str) # Pre-compile the regex
 end

 def match_with_compiled_regex(text, compiled_regex)
  return [] if compiled_regex.nil?
  matches = []
  text.scan(compiled_regex) do |match_data|
    actual_match = match_data.compact.first
    matches << actual_match if actual_match
  end
  matches
 end

 # --- Aho-Corasick Implementation ---
 class AhoCorasick
  class Node
    attr_accessor :children, :parent_char, :failure_link, :output
    def initialize(parent = nil, char = nil)
      @parent = parent
      @parent_char = char
      @children = {}
      @failure_link = nil
      @output = Set.new
    end
    def transition(char)
      @children[char]
    end
    def add_child(char)
      @children[char] ||= Node.new(self, char)
    end
  end

  attr_reader :root

  def initialize(dictionary)
    @root = Node.new
    @root.failure_link = @root
    add_words(dictionary)
    build_failure_links
  end

  def add_words(dictionary)
    dictionary.each { |word| add_word(word) }
  end

  def add_word(word)
    return if word.nil? || word.empty?
    node = @root
    word.each_char { |char| node = node.add_child(char) }
    node.output << word
  end

  def build_failure_links
    queue = @root.children.values.compact
    queue.each { |node| node.failure_link = @root }
    head = 0
    while head < queue.length
      current_node = queue[head]
      head += 1
      current_node.children.each do |char, child_node|
        queue << child_node
        temp_failure_node = current_node.failure_link
        while temp_failure_node.transition(char).nil? &&
                temp_failure_node != @root
          temp_failure_node = temp_failure_node.failure_link
        end
        child_node.failure_link = temp_failure_node.transition(char) || @root
        child_node.output.merge(child_node.failure_link.output)
      end
    end
  end

  def is_word_char?(char)
    return false if char.nil?
    !!(char =~ /\p{Word}/)
  end

  def search(text)
    matches = []
    current_node = @root
    text.each_char.with_index do |char, index|
      while current_node.transition(char).nil? && current_node != @root
        current_node = current_node.failure_link
      end
      current_node = current_node.transition(char) || @root
      current_node.output.each do |found_word|
        start_index = index - found_word.length + 1
        prev_char_is_boundary =
          (start_index == 0 || !is_word_char?(text[start_index - 1]))
        next_char_is_boundary =
          ((index + 1) == text.length || !is_word_char?(text[index + 1]))
        matches << found_word if prev_char_is_boundary && next_char_is_boundary
      end
    end
    matches
  end
 end

 puts "--- Pre-compiling structures for benchmark ---"
 # Pre-compile Regex
 puts "Building combined regex..."
 COMPILED_REGEX = build_combined_regex(DICTIONARY)
 if COMPILED_REGEX.nil?
  puts "Warning: Compiled regex is nil (dictionary might be empty)."
 end

 # Pre-build Aho-Corasick automaton
 puts "Building Aho-Corasick automaton..."
 AHOCORASICK_MATCHER = AhoCorasick.new(DICTIONARY)

 puts "--- Initial Matching (to verify correctness and ensure >0 results) ---"

 # Perform regex matching
 puts "Running Regex matching for verification..."
 regex_matches = match_with_compiled_regex(TEXT_BODY, COMPILED_REGEX)
 puts "Regex matches found: #{regex_matches.length}"
 if regex_matches.empty? && !DICTIONARY.empty?
  puts "WARNING: Regex found 0 matches. Check static word inclusion or regex logic."
 else
  if regex_matches.any?
    puts "Sample Regex matches: #{regex_matches.uniq.sort.take(15).inspect}"
  end
 end

 # Perform Aho-Corasick matching
 puts "Running Aho-Corasick search for verification..."
 ac_matches = AHOCORASICK_MATCHER.search(TEXT_BODY)
 puts "Aho-Corasick matches found: #{ac_matches.length}"
 if ac_matches.empty? && !DICTIONARY.empty?
  puts "WARNING: Aho-Corasick found 0 matches. Check static word inclusion or AC logic."
 else
  if ac_matches.any?
    puts "Sample Aho-Corasick matches: #{ac_matches.uniq.sort.take(15).inspect}"
  end
 end

 # Basic check if counts are similar
 # Note: Due to the nature of overlapping matches and how `scan` vs. AC state transitions might report them,
 # counts might not be perfectly identical without more complex deduplication logic on the regex side
 # if multiple clauses of the OR'd regex could match at the same position.
 # For this benchmark, we are primarily concerned with the raw throughput of finding instances.
 # A more robust comparison of *identical output sets* would involve converting both to Sets of (word, position).
 # Here, we primarily care that both are finding things.
 if regex_matches.length == ac_matches.length
  puts "Match counts are identical. Good."
 elsif (regex_matches.length - ac_matches.length).abs <
      (0.05 * [regex_matches.length, ac_matches.length].max)
  puts "Match counts are close: Regex: #{regex_matches.length}, AC: #{ac_matches.length}. This might be acceptable."
 else
  puts "WARNING: Match counts differ significantly! Regex: #{regex_matches.length}, AC: #{ac_matches.length}"
  # For debugging differences:
  # regex_set = Set.new(regex_matches.map(&:downcase).sort) # example normalization
  # ac_set = Set.new(ac_matches.map(&:downcase).sort)
  # puts "Only in Regex (sample): #{(regex_set - ac_set).to_a.take(5)}"
  # puts "Only in AC (sample): #{(ac_set - regex_set).to_a.take(5)}"
 end
 puts "---"

 puts "--- Benchmarking (Search Phase Only) ---"
 Benchmark.ips do |x|
  x.report("Regex Search (pre-compiled)") do
    match_with_compiled_regex(TEXT_BODY, COMPILED_REGEX)
  end

  x.report("Aho-Corasick Search (pre-built)") do
    AHOCORASICK_MATCHER.search(TEXT_BODY)
  end

  x.compare!
 end

 puts "--- Script finished ---"
	# Ruby benchmark script: Regex vs Aho-Corasick for word matching (v2)

	require "benchmark/ips"
	require "set"

	puts "Script starting: Benchmark for Regex vs Aho-Corasick word matching (v2)."
	puts "---"

	# --- Configuration ---
	DICTIONARY_SIZE = 1000
	TEXT_WORDS = 10_000
	RANDOM_WORD_MIN_LENGTH = 4
	RANDOM_WORD_MAX_LENGTH = 12
	# More diverse alphabet for Unicode testing
	ALPHABET =
	("a".."z").to_a + ("A".."Z").to_a + ("0".."9").to_a +
	%w[ä ö ü ß á é í ó ú ñ ç ã õ ê ô こんにちは世界]
	STATIC_WORDS = %w[benchmark ruby script unicode optimization performance 안녕하세요] # Added some static words

	# --- Helper: Random Word and Text Generation ---
	def generate_random_word(min_len, max_len)
	len = rand(min_len..max_len)
	Array.new(len) { ALPHABET.sample }.join
	end

	def generate_random_text(
	num_words,
	min_word_len,
	max_word_len,
	static_words_to_include
	)
	words =
	Array.new(num_words) { generate_random_word(min_word_len, max_word_len) }

	# Sprinkle in static words to ensure they appear in the text
	static_words_to_include.each_with_index do \|static_word, i\|
	# Insert at various positions, not just the beginning
	insert_at = (words.length / (static_words_to_include.length + 1)) * (i + 1)
	words.insert(insert_at, static_word)
	end
	words.shuffle! # Shuffle after insertion to make positions less predictable

	separators = [" ", ". ", ", ", "; ", "\n", " - ", "... ", "! ", "? "]
	text = ""
	words.each_with_index do \|word, i\|
	text << word
	text << separators.sample if i < words.length - 1
	end
	text
	end

	puts "Generating a dictionary of #{DICTIONARY_SIZE} random words, including static words..."
	DICTIONARY = STATIC_WORDS.dup # Start with static words
	while DICTIONARY.size < DICTIONARY_SIZE
	DICTIONARY << generate_random_word(
	RANDOM_WORD_MIN_LENGTH,
	RANDOM_WORD_MAX_LENGTH
	)
	DICTIONARY.uniq! # Ensure uniqueness
	end
	# If DICTIONARY_SIZE is small, it might be filled by static words + a few random ones.
	# If DICTIONARY_SIZE is larger, it will contain static words and many random ones.
	DICTIONARY.uniq! # Final ensure uniqueness
	puts "Actual dictionary size: #{DICTIONARY.size}"

	puts "Generating a text body of ~#{TEXT_WORDS} words, ensuring static words are included..."
	TEXT_BODY =
	generate_random_text(
	TEXT_WORDS,
	RANDOM_WORD_MIN_LENGTH,
	RANDOM_WORD_MAX_LENGTH,
	STATIC_WORDS
	)

	puts "Dictionary sample (first 10): #{DICTIONARY.take(10).inspect}"
	puts "Text body sample (first 150 chars): #{TEXT_BODY[0..149].inspect}..."
	puts "---"

	# --- Regex Matching Implementation ---
	def build_combined_regex(words)
	return nil if words.empty?
	regex_clauses =
	words.map do \|word\|
	escaped_word = word.gsub(/([.*+?^${}()\|\[\]\\])/, '\\\\\1')
	escaped_word_with_wildcards = escaped_word.gsub("\\", '\S') # Assuming '*' is a wildcard
	"(?:[^[:word:]]\|^)(#{escaped_word_with_wildcards})(?=[^[:word:]]\|$)"
	end
	combined_regex_str = regex_clauses.join("\|")
	Regexp.new(combined_regex_str) # Pre-compile the regex
	end

	def match_with_compiled_regex(text, compiled_regex)
	return [] if compiled_regex.nil?
	matches = []
	text.scan(compiled_regex) do \|match_data\|
	actual_match = match_data.compact.first
	matches << actual_match if actual_match
	end
	matches
	end

	# --- Aho-Corasick Implementation ---
	class AhoCorasick
	class Node
	attr_accessor :children, :parent_char, :failure_link, :output
	def initialize(parent = nil, char = nil)
	@parent = parent
	@parent_char = char
	@children = {}
	@failure_link = nil
	@output = Set.new
	end
	def transition(char)
	@children[char]
	end
	def add_child(char)
	@children[char] \|\|= Node.new(self, char)
	end
	end

	attr_reader :root

	def initialize(dictionary)
	@root = Node.new
	@root.failure_link = @root
	add_words(dictionary)
	build_failure_links
	end

	def add_words(dictionary)
	dictionary.each { \|word\| add_word(word) }
	end

	def add_word(word)
	return if word.nil? \|\| word.empty?
	node = @root
	word.each_char { \|char\| node = node.add_child(char) }
	node.output << word
	end

	def build_failure_links
	queue = @root.children.values.compact
	queue.each { \|node\| node.failure_link = @root }
	head = 0
	while head < queue.length
	current_node = queue[head]
	head += 1
	current_node.children.each do \|char, child_node\|
	queue << child_node
	temp_failure_node = current_node.failure_link
	while temp_failure_node.transition(char).nil? &&
	temp_failure_node != @root
	temp_failure_node = temp_failure_node.failure_link
	end
	child_node.failure_link = temp_failure_node.transition(char) \|\| @root
	child_node.output.merge(child_node.failure_link.output)
	end
	end
	end

	def is_word_char?(char)
	return false if char.nil?
	!!(char =~ /\p{Word}/)
	end

	def search(text)
	matches = []
	current_node = @root
	text.each_char.with_index do \|char, index\|
	while current_node.transition(char).nil? && current_node != @root
	current_node = current_node.failure_link
	end
	current_node = current_node.transition(char) \|\| @root
	current_node.output.each do \|found_word\|
	start_index = index - found_word.length + 1
	prev_char_is_boundary =
	(start_index == 0 \|\| !is_word_char?(text[start_index - 1]))
	next_char_is_boundary =
	((index + 1) == text.length \|\| !is_word_char?(text[index + 1]))
	matches << found_word if prev_char_is_boundary && next_char_is_boundary
	end
	end
	matches
	end
	end

	puts "--- Pre-compiling structures for benchmark ---"
	# Pre-compile Regex
	puts "Building combined regex..."
	COMPILED_REGEX = build_combined_regex(DICTIONARY)
	if COMPILED_REGEX.nil?
	puts "Warning: Compiled regex is nil (dictionary might be empty)."
	end

	# Pre-build Aho-Corasick automaton
	puts "Building Aho-Corasick automaton..."
	AHOCORASICK_MATCHER = AhoCorasick.new(DICTIONARY)

	puts "--- Initial Matching (to verify correctness and ensure >0 results) ---"

	# Perform regex matching
	puts "Running Regex matching for verification..."
	regex_matches = match_with_compiled_regex(TEXT_BODY, COMPILED_REGEX)
	puts "Regex matches found: #{regex_matches.length}"
	if regex_matches.empty? && !DICTIONARY.empty?
	puts "WARNING: Regex found 0 matches. Check static word inclusion or regex logic."
	else
	if regex_matches.any?
	puts "Sample Regex matches: #{regex_matches.uniq.sort.take(15).inspect}"
	end
	end

	# Perform Aho-Corasick matching
	puts "Running Aho-Corasick search for verification..."
	ac_matches = AHOCORASICK_MATCHER.search(TEXT_BODY)
	puts "Aho-Corasick matches found: #{ac_matches.length}"
	if ac_matches.empty? && !DICTIONARY.empty?
	puts "WARNING: Aho-Corasick found 0 matches. Check static word inclusion or AC logic."
	else
	if ac_matches.any?
	puts "Sample Aho-Corasick matches: #{ac_matches.uniq.sort.take(15).inspect}"
	end
	end

	# Basic check if counts are similar
	# Note: Due to the nature of overlapping matches and how `scan` vs. AC state transitions might report them,
	# counts might not be perfectly identical without more complex deduplication logic on the regex side
	# if multiple clauses of the OR'd regex could match at the same position.
	# For this benchmark, we are primarily concerned with the raw throughput of finding instances.
	# A more robust comparison of identical output sets would involve converting both to Sets of (word, position).
	# Here, we primarily care that both are finding things.
	if regex_matches.length == ac_matches.length
	puts "Match counts are identical. Good."
	elsif (regex_matches.length - ac_matches.length).abs <
	(0.05 * [regex_matches.length, ac_matches.length].max)
	puts "Match counts are close: Regex: #{regex_matches.length}, AC: #{ac_matches.length}. This might be acceptable."
	else
	puts "WARNING: Match counts differ significantly! Regex: #{regex_matches.length}, AC: #{ac_matches.length}"
	# For debugging differences:
	# regex_set = Set.new(regex_matches.map(&:downcase).sort) # example normalization
	# ac_set = Set.new(ac_matches.map(&:downcase).sort)
	# puts "Only in Regex (sample): #{(regex_set - ac_set).to_a.take(5)}"
	# puts "Only in AC (sample): #{(ac_set - regex_set).to_a.take(5)}"
	end
	puts "---"

	puts "--- Benchmarking (Search Phase Only) ---"
	Benchmark.ips do \|x\|
	x.report("Regex Search (pre-compiled)") do
	match_with_compiled_regex(TEXT_BODY, COMPILED_REGEX)
	end

	x.report("Aho-Corasick Search (pre-built)") do
	AHOCORASICK_MATCHER.search(TEXT_BODY)
	end

	x.compare!
	end

	puts "--- Script finished ---"