Created
May 19, 2025 22:04
-
-
Save SamSaffron/8e5915676ad81f6bda9997eff5af8612 to your computer and use it in GitHub Desktop.
aho-corasick
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Ruby benchmark script: Regex vs Aho-Corasick for word matching (v2) | |
require "benchmark/ips" | |
require "set" | |
puts "Script starting: Benchmark for Regex vs Aho-Corasick word matching (v2)." | |
puts "---" | |
# --- Configuration --- | |
DICTIONARY_SIZE = 1000 | |
TEXT_WORDS = 10_000 | |
RANDOM_WORD_MIN_LENGTH = 4 | |
RANDOM_WORD_MAX_LENGTH = 12 | |
# More diverse alphabet for Unicode testing | |
ALPHABET = | |
("a".."z").to_a + ("A".."Z").to_a + ("0".."9").to_a + | |
%w[ä ö ü ß á é í ó ú ñ ç ã õ ê ô こんにちは 世界] | |
STATIC_WORDS = %w[benchmark ruby script unicode optimization performance 안녕하세요] # Added some static words | |
# --- Helper: Random Word and Text Generation --- | |
def generate_random_word(min_len, max_len) | |
len = rand(min_len..max_len) | |
Array.new(len) { ALPHABET.sample }.join | |
end | |
def generate_random_text( | |
num_words, | |
min_word_len, | |
max_word_len, | |
static_words_to_include | |
) | |
words = | |
Array.new(num_words) { generate_random_word(min_word_len, max_word_len) } | |
# Sprinkle in static words to ensure they appear in the text | |
static_words_to_include.each_with_index do |static_word, i| | |
# Insert at various positions, not just the beginning | |
insert_at = (words.length / (static_words_to_include.length + 1)) * (i + 1) | |
words.insert(insert_at, static_word) | |
end | |
words.shuffle! # Shuffle after insertion to make positions less predictable | |
separators = [" ", ". ", ", ", "; ", "\n", " - ", "... ", "! ", "? "] | |
text = "" | |
words.each_with_index do |word, i| | |
text << word | |
text << separators.sample if i < words.length - 1 | |
end | |
text | |
end | |
puts "Generating a dictionary of #{DICTIONARY_SIZE} random words, including static words..." | |
DICTIONARY = STATIC_WORDS.dup # Start with static words | |
while DICTIONARY.size < DICTIONARY_SIZE | |
DICTIONARY << generate_random_word( | |
RANDOM_WORD_MIN_LENGTH, | |
RANDOM_WORD_MAX_LENGTH | |
) | |
DICTIONARY.uniq! # Ensure uniqueness | |
end | |
# If DICTIONARY_SIZE is small, it might be filled by static words + a few random ones. | |
# If DICTIONARY_SIZE is larger, it will contain static words and many random ones. | |
DICTIONARY.uniq! # Final ensure uniqueness | |
puts "Actual dictionary size: #{DICTIONARY.size}" | |
puts "Generating a text body of ~#{TEXT_WORDS} words, ensuring static words are included..." | |
TEXT_BODY = | |
generate_random_text( | |
TEXT_WORDS, | |
RANDOM_WORD_MIN_LENGTH, | |
RANDOM_WORD_MAX_LENGTH, | |
STATIC_WORDS | |
) | |
puts "Dictionary sample (first 10): #{DICTIONARY.take(10).inspect}" | |
puts "Text body sample (first 150 chars): #{TEXT_BODY[0..149].inspect}..." | |
puts "---" | |
# --- Regex Matching Implementation --- | |
def build_combined_regex(words) | |
return nil if words.empty? | |
regex_clauses = | |
words.map do |word| | |
escaped_word = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1') | |
escaped_word_with_wildcards = escaped_word.gsub("\\*", '\S*') # Assuming '*' is a wildcard | |
"(?:[^[:word:]]|^)(#{escaped_word_with_wildcards})(?=[^[:word:]]|$)" | |
end | |
combined_regex_str = regex_clauses.join("|") | |
Regexp.new(combined_regex_str) # Pre-compile the regex | |
end | |
def match_with_compiled_regex(text, compiled_regex) | |
return [] if compiled_regex.nil? | |
matches = [] | |
text.scan(compiled_regex) do |match_data| | |
actual_match = match_data.compact.first | |
matches << actual_match if actual_match | |
end | |
matches | |
end | |
# --- Aho-Corasick Implementation --- | |
class AhoCorasick | |
class Node | |
attr_accessor :children, :parent_char, :failure_link, :output | |
def initialize(parent = nil, char = nil) | |
@parent = parent | |
@parent_char = char | |
@children = {} | |
@failure_link = nil | |
@output = Set.new | |
end | |
def transition(char) | |
@children[char] | |
end | |
def add_child(char) | |
@children[char] ||= Node.new(self, char) | |
end | |
end | |
attr_reader :root | |
def initialize(dictionary) | |
@root = Node.new | |
@root.failure_link = @root | |
add_words(dictionary) | |
build_failure_links | |
end | |
def add_words(dictionary) | |
dictionary.each { |word| add_word(word) } | |
end | |
def add_word(word) | |
return if word.nil? || word.empty? | |
node = @root | |
word.each_char { |char| node = node.add_child(char) } | |
node.output << word | |
end | |
def build_failure_links | |
queue = @root.children.values.compact | |
queue.each { |node| node.failure_link = @root } | |
head = 0 | |
while head < queue.length | |
current_node = queue[head] | |
head += 1 | |
current_node.children.each do |char, child_node| | |
queue << child_node | |
temp_failure_node = current_node.failure_link | |
while temp_failure_node.transition(char).nil? && | |
temp_failure_node != @root | |
temp_failure_node = temp_failure_node.failure_link | |
end | |
child_node.failure_link = temp_failure_node.transition(char) || @root | |
child_node.output.merge(child_node.failure_link.output) | |
end | |
end | |
end | |
def is_word_char?(char) | |
return false if char.nil? | |
!!(char =~ /\p{Word}/) | |
end | |
def search(text) | |
matches = [] | |
current_node = @root | |
text.each_char.with_index do |char, index| | |
while current_node.transition(char).nil? && current_node != @root | |
current_node = current_node.failure_link | |
end | |
current_node = current_node.transition(char) || @root | |
current_node.output.each do |found_word| | |
start_index = index - found_word.length + 1 | |
prev_char_is_boundary = | |
(start_index == 0 || !is_word_char?(text[start_index - 1])) | |
next_char_is_boundary = | |
((index + 1) == text.length || !is_word_char?(text[index + 1])) | |
matches << found_word if prev_char_is_boundary && next_char_is_boundary | |
end | |
end | |
matches | |
end | |
end | |
puts "--- Pre-compiling structures for benchmark ---" | |
# Pre-compile Regex | |
puts "Building combined regex..." | |
COMPILED_REGEX = build_combined_regex(DICTIONARY) | |
if COMPILED_REGEX.nil? | |
puts "Warning: Compiled regex is nil (dictionary might be empty)." | |
end | |
# Pre-build Aho-Corasick automaton | |
puts "Building Aho-Corasick automaton..." | |
AHOCORASICK_MATCHER = AhoCorasick.new(DICTIONARY) | |
puts "--- Initial Matching (to verify correctness and ensure >0 results) ---" | |
# Perform regex matching | |
puts "Running Regex matching for verification..." | |
regex_matches = match_with_compiled_regex(TEXT_BODY, COMPILED_REGEX) | |
puts "Regex matches found: #{regex_matches.length}" | |
if regex_matches.empty? && !DICTIONARY.empty? | |
puts "WARNING: Regex found 0 matches. Check static word inclusion or regex logic." | |
else | |
if regex_matches.any? | |
puts "Sample Regex matches: #{regex_matches.uniq.sort.take(15).inspect}" | |
end | |
end | |
# Perform Aho-Corasick matching | |
puts "Running Aho-Corasick search for verification..." | |
ac_matches = AHOCORASICK_MATCHER.search(TEXT_BODY) | |
puts "Aho-Corasick matches found: #{ac_matches.length}" | |
if ac_matches.empty? && !DICTIONARY.empty? | |
puts "WARNING: Aho-Corasick found 0 matches. Check static word inclusion or AC logic." | |
else | |
if ac_matches.any? | |
puts "Sample Aho-Corasick matches: #{ac_matches.uniq.sort.take(15).inspect}" | |
end | |
end | |
# Basic check if counts are similar | |
# Note: Due to the nature of overlapping matches and how `scan` vs. AC state transitions might report them, | |
# counts might not be perfectly identical without more complex deduplication logic on the regex side | |
# if multiple clauses of the OR'd regex could match at the same position. | |
# For this benchmark, we are primarily concerned with the raw throughput of finding instances. | |
# A more robust comparison of *identical output sets* would involve converting both to Sets of (word, position). | |
# Here, we primarily care that both are finding things. | |
if regex_matches.length == ac_matches.length | |
puts "Match counts are identical. Good." | |
elsif (regex_matches.length - ac_matches.length).abs < | |
(0.05 * [regex_matches.length, ac_matches.length].max) | |
puts "Match counts are close: Regex: #{regex_matches.length}, AC: #{ac_matches.length}. This might be acceptable." | |
else | |
puts "WARNING: Match counts differ significantly! Regex: #{regex_matches.length}, AC: #{ac_matches.length}" | |
# For debugging differences: | |
# regex_set = Set.new(regex_matches.map(&:downcase).sort) # example normalization | |
# ac_set = Set.new(ac_matches.map(&:downcase).sort) | |
# puts "Only in Regex (sample): #{(regex_set - ac_set).to_a.take(5)}" | |
# puts "Only in AC (sample): #{(ac_set - regex_set).to_a.take(5)}" | |
end | |
puts "---" | |
puts "--- Benchmarking (Search Phase Only) ---" | |
Benchmark.ips do |x| | |
x.report("Regex Search (pre-compiled)") do | |
match_with_compiled_regex(TEXT_BODY, COMPILED_REGEX) | |
end | |
x.report("Aho-Corasick Search (pre-built)") do | |
AHOCORASICK_MATCHER.search(TEXT_BODY) | |
end | |
x.compare! | |
end | |
puts "--- Script finished ---" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment