Last active
November 22, 2024 00:35
-
-
Save bdevel/d7646a431fac149bd20e0ab293864ba9 to your computer and use it in GitHub Desktop.
Fast text searching of many, many sub-strings using Trie data structure
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; add [net.sourceforge.jregex/jregex "1.2_01"] as a dependency. Better Regex than Java. | |
(ns text-helpers | |
(:import [jregex Pattern Replacer Substitution MatchResult TextBuffer])) | |
(defn hexify | |
"debug helper for determine Unicode hex value for a char." | |
[s] | |
(apply str | |
(map #(format "0x%02x " (int %)) s))) | |
;; can take some time to build the regex patter, so cache those. Caution with dynamic regex. | |
(def replacer-cache (atom {})) | |
(defn gsub | |
"Global replace rx with replacement." | |
[text rx replacemnt] | |
;; use "[$&]" for first match | |
(let [replacer (or (get @replacer-cache rx) | |
(let [p (Pattern. rx) | |
r (.replacer p replacemnt)] | |
(swap! replacer-cache assoc rx r) | |
r))] | |
(.replace replacer text))) | |
(comment | |
(gsub "I'd like 5 beers and 2 waters." "\\d" "_") | |
;; https://jregex.sourceforge.net/gstarted-unicode.html#blocks | |
(gsub "Dr. Brave's Bingo! \"Mango\" 喜 ö 2020) 🌟 rating: 6.2 / #10 📣" "[\\p{Cs}|\\p{Pc}|\\p{Pd}|\\p{Ps}|\\p{Pi}|\\p{Pf}|\\p{Po}]" "x") | |
) | |
(defn normalize-text | |
"Strips Punctuation, Controls and Surrogate emojis. May consider applying lower case depending on your usecase." | |
[text] | |
;; See: | |
;; https://jregex.sourceforge.net/gstarted-unicode.html#blocks | |
(-> text | |
(gsub "[\\p{Pc}|\\p{Pd}|\\p{Ps}|\\p{Pi}|\\p{Pf}|\\p{Po}|\\p{Cs}|\\p{Cc}|\\p{Cf}|\\p{Co}|\\p{Cn}]" " ") | |
(gsub "\\s+" " ") ;; make single space | |
(clojure.string/trim))) | |
(comment | |
(normalize-text " Dr. Brave's Bingo! \"Mango\" 喜 ö 2020) 🌟 rating: 6.2 / #10 📣 ") | |
(hexify "\n") | |
) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Example of building a large table. | |
;; Recommended to build the scan table once and save it serialized if possible (ie, not adding/removing values). | |
;; | |
;; Sorry, these are just some code snippets and my not run as pasted in. | |
;; | |
(ns title-matcher | |
(:require | |
[trie-scan :as scan] | |
[taoensso.nippy :as nippy] ;; Dependency, [com.taoensso/nippy "3.2.0"] | |
[text-helpers :refer [normalize-text]] | |
)) | |
(def scanner-table-atom (atom nil)) | |
(defn get-scanner-table | |
"" | |
[] | |
(or @scanner-table-atom | |
(locking scanner-table-atom ;; only allow one thread to start loading | |
(swap! scanner-table-atom (fn [v] | |
(if v | |
v | |
(try | |
(log/info (str "Starting to load index file " scan-table-file)) | |
(log/duration (str "Finished loading index file") | |
(nippy/thaw-from-file scan-table-file)) | |
(catch java.io.FileNotFoundException e | |
(log/warn (str "Could not load index file " scan-table-file " Using empty table.")) | |
{})))))))) | |
(defn rebuild-scanner-table | |
"Creates the scanner table, saves to .nippy file for later use. | |
Returns number of items in the index." | |
[titles-file] | |
(let [items-to-index (get-all-relevant-assets titles-file) | |
new-table (reduce (fn [table item] | |
(scan/assoc-in-table table | |
(normalize-text (:title item)) | |
{:id (:id item) | |
:title (:title item) | |
:type (:type item) | |
})) | |
{} | |
items-to-index) ] | |
(nippy/freeze-to-file scan-table-file new-table) | |
(reset! scanner-table-atom nil) | |
(count items-to-index))) | |
(defn search | |
"Scans the entire database for titles that are found in text. | |
Returns value stored in the Trie Scanner for the matches. | |
Will return any and all matches, may have duplicates" | |
[text] | |
(scan/find-all-matches (normalize-text text) | |
(get-scanner-table))) | |
(defn uniq-by | |
"" | |
[f items] | |
(:uniq (reduce (fn [acc m] | |
(let [k (f m) ] | |
(if (get-in acc [:seen k]) | |
acc | |
(-> acc | |
(update :seen assoc k true) | |
(update :uniq conj m))))) | |
{:seen {} :uniq []} | |
items))) | |
(comment | |
(uniq-by :name [{:name "Bob"} {:name "Jane"} {:name "Bob"}]) | |
) | |
;; Add a main function if you want to run a command to update the nippy file. | |
;; Add this to project.clj | |
;; :aliases {"update-title-index" ["run" "-m" "scrubm-clj.title-matcher" ] } | |
;; Then in your shell run: | |
;; lein update-title-index titles.tsv | |
(defn -main | |
"Updates the index file for seaching" | |
[& args] | |
(if-let [titles-file (first args)] | |
(if (.exists (io/file titles-file)) | |
(do | |
(println (str "Reading from '" titles-file "' to update '" scan-table-file "' index file.")) | |
(log/duration | |
"Build scanner index" | |
(rebuild-scanner-table titles-file)) | |
(println "Done! New file size:" (int (/ (.length (io/file scan-table-file)) 1024 1024)) "MiB")) | |
(println "Cannot find" titles-file "to parse. Specify location as first argument.")) | |
(do | |
(println "Specify the locdation of the titles.tsv as the first argument.")))) | |
(comment | |
(search "I've been thinking this is my Last Chance to watch Dreaming Youth and Fuses at the theater.") | |
) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns scrubm-clj.trie-scan | |
"Fast text searches. Build a table with all your text search terms and what value you want associated when matched. | |
When searching, will only pass through the string once and will not scan the full list of search terms. | |
The search time depends on length of the text searching and not the number of search terms.") | |
(defn assoc-in-table | |
"" | |
[table text value] | |
(let [k (first text) ] | |
(if (get table k) | |
(update table k (fn [v] | |
(if (next text);; already exist | |
(update v 0 #(assoc-in-table % (next text) value)) | |
(conj v value) ;;end! | |
))) | |
(if (next text) | |
(assoc table | |
k | |
[(assoc-in-table {} (next text) value ) ]) | |
(assoc table | |
k | |
[{} value];; END! | |
))))) | |
(defn char-is-boundry? | |
"" | |
[c] | |
;; a single space doesn't match \b, so adding x here | |
(some? | |
(re-find (java.util.regex.Pattern/compile "x[\\p{Punct}|\\p{IsWhite_Space}]x") (str "x" c "x")))) | |
(comment | |
(char-is-boundry? (first " ")) | |
(char-is-boundry? (first "_")) | |
(char-is-boundry? \a) | |
) | |
(defn find-all-matches | |
"Walks each charecter of text through the keys of root-table, and walks the next table for each subsequent charecter until match is found or the end is reached." | |
([text root-table] (find-all-matches text | |
root-table | |
{:allow-substring-matches false})) | |
([text root-table opts] | |
(assert (string? text) "Text provided was not a String. Probably given a table.") | |
(assert (map? root-table) "Table provided was not a hash map.") | |
(loop [cur-text text | |
at-start-boundry true ;; start of text is always a start boundry | |
candiate-tables [] ;; may have more than one sub string match going at a time | |
all-matches [] ] | |
(let [cur-char (first cur-text) | |
cur-at-end-boundry (char-is-boundry? (or (first (next cur-text)) " "));; support end of text, which would be nil, change to " " | |
[new-candiate-tables candidate-matches] (reduce (fn [[cts ms] ct];; candiate-tables, matches, candiate-table | |
(if-let [[next-t & new-ms] (get ct cur-char) ] | |
[(conj cts next-t) ;; char found, add any matches | |
;; merged results, but only if | |
(if (or cur-at-end-boundry | |
(:allow-substring-matches opts)) | |
(into ms new-ms) | |
ms) | |
];; end vector for accumulator | |
;; dead end | |
[cts ms])) | |
[[] []] | |
candiate-tables) | |
;; todo, does this work with single char matches? | |
[new-table & root-matches] (if (or at-start-boundry ;; only add new candidates when starting at a boundry | |
(:allow-substring-matches opts)) | |
(get root-table cur-char)) ;; probably no new matches unless single char | |
;; add the start of the new candidate, if there was one. | |
next-candiate-tables (if new-table | |
(conj new-candiate-tables new-table) | |
new-candiate-tables) | |
next-matches (-> all-matches | |
(into candidate-matches) | |
(into (if (or cur-at-end-boundry | |
(:allow-substring-matches opts)) | |
root-matches))) | |
] | |
(if (next cur-text) | |
(recur | |
(next cur-text) | |
(char-is-boundry? cur-char) | |
next-candiate-tables | |
next-matches) | |
;; end of text | |
next-matches))))) | |
(defn has-key? | |
"Determines if the text is included in the table." | |
[text table] | |
(assert (string? text) "Text provided was not a String. Probably given a table.") | |
(assert (map? table) "Table provided was not a hash map.") | |
(let [v (get-in table | |
(butlast (interleave (seq text) | |
(repeat 0)))) ] | |
(some? (and v | |
(next v))))) | |
(comment | |
(let [ex-table (-> {} | |
(assoc-in-table "c" :c) | |
(assoc-in-table "cat" :cat) | |
(assoc-in-table "cats" :cats) | |
(assoc-in-table "cathrine" :cathrine ) | |
(assoc-in-table "cat man" :cat-matn ) | |
(assoc-in-table "cam" :cam) | |
(assoc-in-table "camera" :camera) | |
(assoc-in-table "cameron" :cameron) | |
(assoc-in-table "camping" :camping) | |
(assoc-in-table "camera phone" :camera-phone) | |
) | |
ex-text "I like cats, the letter c, and cathrine when she is camping"] | |
(find-all-matches ex-text ex-table {:allow-substring-matches false})) | |
(def sample-table | |
(-> {} | |
(assoc-in-table "cats" :cats ) | |
(assoc-in-table "cathrine" :cathrine ) | |
(assoc-in-table "cat man" :cat-matn ) | |
(assoc-in-table "cam" :cam) | |
(assoc-in-table "camera" :camera) | |
(assoc-in-table "cameron" :cameron) | |
(assoc-in-table "camera phone" :camera-phone) | |
)) | |
(get-in sample-table [\c 0 ]) | |
(interleave [:a :b :c] (take 10 (repeat 0))) | |
(has? "cats" sample-table) | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment