Skip to content

Instantly share code, notes, and snippets.

@SamSaffron
Created May 19, 2025 22:04
Show Gist options
  • Save SamSaffron/8e5915676ad81f6bda9997eff5af8612 to your computer and use it in GitHub Desktop.
Save SamSaffron/8e5915676ad81f6bda9997eff5af8612 to your computer and use it in GitHub Desktop.
aho-corasick
# Ruby benchmark script: Regex vs Aho-Corasick for word matching (v2)
require "benchmark/ips"
require "set"
puts "Script starting: Benchmark for Regex vs Aho-Corasick word matching (v2)."
puts "---"
# --- Configuration ---
DICTIONARY_SIZE = 1000
TEXT_WORDS = 10_000
RANDOM_WORD_MIN_LENGTH = 4
RANDOM_WORD_MAX_LENGTH = 12
# More diverse alphabet for Unicode testing
ALPHABET =
("a".."z").to_a + ("A".."Z").to_a + ("0".."9").to_a +
%w[ä ö ü ß á é í ó ú ñ ç ã õ ê ô こんにちは 世界]
STATIC_WORDS = %w[benchmark ruby script unicode optimization performance 안녕하세요] # Added some static words
# --- Helper: Random Word and Text Generation ---
def generate_random_word(min_len, max_len)
len = rand(min_len..max_len)
Array.new(len) { ALPHABET.sample }.join
end
def generate_random_text(
num_words,
min_word_len,
max_word_len,
static_words_to_include
)
words =
Array.new(num_words) { generate_random_word(min_word_len, max_word_len) }
# Sprinkle in static words to ensure they appear in the text
static_words_to_include.each_with_index do |static_word, i|
# Insert at various positions, not just the beginning
insert_at = (words.length / (static_words_to_include.length + 1)) * (i + 1)
words.insert(insert_at, static_word)
end
words.shuffle! # Shuffle after insertion to make positions less predictable
separators = [" ", ". ", ", ", "; ", "\n", " - ", "... ", "! ", "? "]
text = ""
words.each_with_index do |word, i|
text << word
text << separators.sample if i < words.length - 1
end
text
end
puts "Generating a dictionary of #{DICTIONARY_SIZE} random words, including static words..."
DICTIONARY = STATIC_WORDS.dup # Start with static words
while DICTIONARY.size < DICTIONARY_SIZE
DICTIONARY << generate_random_word(
RANDOM_WORD_MIN_LENGTH,
RANDOM_WORD_MAX_LENGTH
)
DICTIONARY.uniq! # Ensure uniqueness
end
# If DICTIONARY_SIZE is small, it might be filled by static words + a few random ones.
# If DICTIONARY_SIZE is larger, it will contain static words and many random ones.
DICTIONARY.uniq! # Final ensure uniqueness
puts "Actual dictionary size: #{DICTIONARY.size}"
puts "Generating a text body of ~#{TEXT_WORDS} words, ensuring static words are included..."
TEXT_BODY =
generate_random_text(
TEXT_WORDS,
RANDOM_WORD_MIN_LENGTH,
RANDOM_WORD_MAX_LENGTH,
STATIC_WORDS
)
puts "Dictionary sample (first 10): #{DICTIONARY.take(10).inspect}"
puts "Text body sample (first 150 chars): #{TEXT_BODY[0..149].inspect}..."
puts "---"
# --- Regex Matching Implementation ---
def build_combined_regex(words)
return nil if words.empty?
regex_clauses =
words.map do |word|
escaped_word = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1')
escaped_word_with_wildcards = escaped_word.gsub("\\*", '\S*') # Assuming '*' is a wildcard
"(?:[^[:word:]]|^)(#{escaped_word_with_wildcards})(?=[^[:word:]]|$)"
end
combined_regex_str = regex_clauses.join("|")
Regexp.new(combined_regex_str) # Pre-compile the regex
end
def match_with_compiled_regex(text, compiled_regex)
return [] if compiled_regex.nil?
matches = []
text.scan(compiled_regex) do |match_data|
actual_match = match_data.compact.first
matches << actual_match if actual_match
end
matches
end
# --- Aho-Corasick Implementation ---
class AhoCorasick
class Node
attr_accessor :children, :parent_char, :failure_link, :output
def initialize(parent = nil, char = nil)
@parent = parent
@parent_char = char
@children = {}
@failure_link = nil
@output = Set.new
end
def transition(char)
@children[char]
end
def add_child(char)
@children[char] ||= Node.new(self, char)
end
end
attr_reader :root
def initialize(dictionary)
@root = Node.new
@root.failure_link = @root
add_words(dictionary)
build_failure_links
end
def add_words(dictionary)
dictionary.each { |word| add_word(word) }
end
def add_word(word)
return if word.nil? || word.empty?
node = @root
word.each_char { |char| node = node.add_child(char) }
node.output << word
end
def build_failure_links
queue = @root.children.values.compact
queue.each { |node| node.failure_link = @root }
head = 0
while head < queue.length
current_node = queue[head]
head += 1
current_node.children.each do |char, child_node|
queue << child_node
temp_failure_node = current_node.failure_link
while temp_failure_node.transition(char).nil? &&
temp_failure_node != @root
temp_failure_node = temp_failure_node.failure_link
end
child_node.failure_link = temp_failure_node.transition(char) || @root
child_node.output.merge(child_node.failure_link.output)
end
end
end
def is_word_char?(char)
return false if char.nil?
!!(char =~ /\p{Word}/)
end
def search(text)
matches = []
current_node = @root
text.each_char.with_index do |char, index|
while current_node.transition(char).nil? && current_node != @root
current_node = current_node.failure_link
end
current_node = current_node.transition(char) || @root
current_node.output.each do |found_word|
start_index = index - found_word.length + 1
prev_char_is_boundary =
(start_index == 0 || !is_word_char?(text[start_index - 1]))
next_char_is_boundary =
((index + 1) == text.length || !is_word_char?(text[index + 1]))
matches << found_word if prev_char_is_boundary && next_char_is_boundary
end
end
matches
end
end
puts "--- Pre-compiling structures for benchmark ---"
# Pre-compile Regex
puts "Building combined regex..."
COMPILED_REGEX = build_combined_regex(DICTIONARY)
if COMPILED_REGEX.nil?
puts "Warning: Compiled regex is nil (dictionary might be empty)."
end
# Pre-build Aho-Corasick automaton
puts "Building Aho-Corasick automaton..."
AHOCORASICK_MATCHER = AhoCorasick.new(DICTIONARY)
puts "--- Initial Matching (to verify correctness and ensure >0 results) ---"
# Perform regex matching
puts "Running Regex matching for verification..."
regex_matches = match_with_compiled_regex(TEXT_BODY, COMPILED_REGEX)
puts "Regex matches found: #{regex_matches.length}"
if regex_matches.empty? && !DICTIONARY.empty?
puts "WARNING: Regex found 0 matches. Check static word inclusion or regex logic."
else
if regex_matches.any?
puts "Sample Regex matches: #{regex_matches.uniq.sort.take(15).inspect}"
end
end
# Perform Aho-Corasick matching
puts "Running Aho-Corasick search for verification..."
ac_matches = AHOCORASICK_MATCHER.search(TEXT_BODY)
puts "Aho-Corasick matches found: #{ac_matches.length}"
if ac_matches.empty? && !DICTIONARY.empty?
puts "WARNING: Aho-Corasick found 0 matches. Check static word inclusion or AC logic."
else
if ac_matches.any?
puts "Sample Aho-Corasick matches: #{ac_matches.uniq.sort.take(15).inspect}"
end
end
# Basic check if counts are similar
# Note: Due to the nature of overlapping matches and how `scan` vs. AC state transitions might report them,
# counts might not be perfectly identical without more complex deduplication logic on the regex side
# if multiple clauses of the OR'd regex could match at the same position.
# For this benchmark, we are primarily concerned with the raw throughput of finding instances.
# A more robust comparison of *identical output sets* would involve converting both to Sets of (word, position).
# Here, we primarily care that both are finding things.
if regex_matches.length == ac_matches.length
puts "Match counts are identical. Good."
elsif (regex_matches.length - ac_matches.length).abs <
(0.05 * [regex_matches.length, ac_matches.length].max)
puts "Match counts are close: Regex: #{regex_matches.length}, AC: #{ac_matches.length}. This might be acceptable."
else
puts "WARNING: Match counts differ significantly! Regex: #{regex_matches.length}, AC: #{ac_matches.length}"
# For debugging differences:
# regex_set = Set.new(regex_matches.map(&:downcase).sort) # example normalization
# ac_set = Set.new(ac_matches.map(&:downcase).sort)
# puts "Only in Regex (sample): #{(regex_set - ac_set).to_a.take(5)}"
# puts "Only in AC (sample): #{(ac_set - regex_set).to_a.take(5)}"
end
puts "---"
puts "--- Benchmarking (Search Phase Only) ---"
Benchmark.ips do |x|
x.report("Regex Search (pre-compiled)") do
match_with_compiled_regex(TEXT_BODY, COMPILED_REGEX)
end
x.report("Aho-Corasick Search (pre-built)") do
AHOCORASICK_MATCHER.search(TEXT_BODY)
end
x.compare!
end
puts "--- Script finished ---"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment