Created
December 29, 2020 13:32
-
-
Save searls/c0b7ffedbcf7261b654c5cb492250a3a to your computer and use it in GitHub Desktop.
KameSame's December 2020 search overhaul
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- This all starts with some functions and a *materialized* postgres view that unnests several | |
-- arrays of strings of definitions into flattened rows that are easier to search. Fun fact: | |
-- you can even create indexes on materialized views' columns! They'll refresh whenever the view | |
-- is refreshed (which in my case is every time that we pull new dictionary data from WaniKani or JMDICT | |
-- This function will take an array of strings and convert all the double-width alphanumeric characters | |
-- and normalize them as half-width. That way a search query can be massaged from "OK" to "ok" easily | |
CREATE OR REPLACE FUNCTION array_hankakufy_romaji(character varying[]) | |
RETURNS character varying[] | |
AS | |
$$ | |
DECLARE | |
arrTexts ALIAS FOR $1; | |
retVal character varying[]; | |
BEGIN | |
FOR I IN array_lower(arrTexts, 1)..array_upper(arrTexts, 1) LOOP | |
retVal[I] := translate(arrTexts[I], | |
'0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', | |
'0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz' | |
); | |
END LOOP; | |
RETURN retVal; | |
END; | |
$$ | |
LANGUAGE plpgsql | |
STABLE | |
RETURNS NULL ON NULL INPUT; | |
-- Similarly, this function normalizes english definitions by lowercasing them and removing extra spaces between words | |
CREATE OR REPLACE FUNCTION massage_english(character varying) | |
RETURNS character varying | |
AS | |
$$ | |
BEGIN | |
return regexp_replace( | |
regexp_replace(lower($1), '[^a-z0-9\s]+', '','g'), | |
'\s+', ' ', 'g' | |
); | |
END; | |
$$ | |
LANGUAGE plpgsql | |
STABLE | |
RETURNS NULL ON NULL INPUT; | |
-- And this function just runs an array through the above function | |
CREATE OR REPLACE FUNCTION array_massage_english(character varying[]) | |
RETURNS character varying[] | |
AS | |
$$ | |
DECLARE | |
arrTexts ALIAS FOR $1; | |
retVal character varying[]; | |
BEGIN | |
FOR I IN array_lower(arrTexts, 1)..array_upper(arrTexts, 1) LOOP | |
retVal[I] := massage_english(arrTexts[I]); | |
END LOOP; | |
RETURN retVal; | |
END; | |
$$ | |
LANGUAGE plpgsql | |
STABLE | |
RETURNS NULL ON NULL INPUT; | |
-- Here's the real view! It will unnest all Japanese spellings and readings + all English definitions | |
-- into a mega table where either/both are searchable and can load he entire row into an `Item` model | |
create materialized view searchable_items as | |
select items.*, j_text, e_text | |
from items, | |
unnest( | |
array_hankakufy_romaji(array_prepend( | |
items.text, | |
array_cat( | |
items.alt_texts, | |
items.reading_texts | |
) | |
)) | |
) j_text, | |
unnest(array_massage_english(items.meaning_texts)) e_text | |
where items.type <> 'radical'; | |
-- Here's me throwing some spaghetti at the index wall. The trigram ops do NOT work | |
-- on the Japanese text column, but the similarity() function *does* work on them when they are cast to | |
-- byte strings, so I do both b-tree and gin trigram ops on both english & Japanese here | |
CREATE INDEX searchable_items_e_text_idx ON searchable_items USING GIN (e_text gin_trgm_ops); | |
CREATE INDEX searchable_items_e_text_btree_idx ON searchable_items(e_text); | |
CREATE INDEX searchable_items_j_text_idx ON searchable_items USING GIN (j_text gin_trgm_ops); | |
CREATE INDEX searchable_items_j_text_btree_idx ON searchable_items(j_text); | |
-- Gotta initially refresh (fresh?) that view with some data! | |
refresh materialized view searchable_items; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Here's the main search query itself, pulled out of a Ruby file that executes it as SQL with | |
-- ever `:`-led token being a bind parameter of queries or tokens or tokens wrapped in '%' for like | |
-- operators | |
select *, | |
(select count(*) from learnings where user_id = :user_id and item_id = o2.id) learnings_count | |
from ( | |
select distinct on (o1.id) o1.*, | |
greatest( | |
(case when :e_tokens_present then similarity(o1.e_text, :e_query) else 0 end), | |
(case when :j_tokens_present then similarity(o1.j_text::bytea::text, :j_query::bytea::text) else 0 end) | |
) rank | |
from (( | |
-- Exact Matches on whole query | |
select distinct on (searchable_items.id) searchable_items.* | |
from searchable_items | |
where ( | |
(:j_query_present and j_text = :j_query) | |
or | |
(:e_query_present and e_text = :e_query) | |
) | |
limit :exact_limit | |
) union ( | |
-- LIKE matches on any one space-delimited token in the query | |
select distinct on (searchable_items.id) searchable_items.* | |
from searchable_items | |
where ( | |
(:j_tokens_present and j_text like any (array[:j_likes])) | |
or | |
(:e_tokens_present and e_text like any (array[:e_likes])) | |
) | |
limit :like_limit | |
) union ( | |
-- Similarity for English queries based on pg_trgm similarity & index | |
select distinct on (searchable_items.id) searchable_items.* | |
from searchable_items | |
where :e_query_present and e_text % :e_query | |
limit :e_similar_limit | |
) union ( | |
-- Strip conjugation from verbs & adjectives | |
select distinct on (searchable_items.id) searchable_items.* | |
from searchable_items | |
where ( | |
parts_of_speech && '{する verb,godan verb, ichidan verb,い adjective}'::varchar[] | |
and | |
j_text like any (array[:j_stems]) | |
) | |
limit :stems_limit | |
) union ( | |
-- Alternate english definitions (user defined + official) | |
select distinct on (searchable_items.id) | |
searchable_items.id, | |
searchable_items.type, | |
searchable_items.wanikani_api_url, | |
searchable_items.wanikani_app_url, | |
searchable_items.wanikani_level, | |
searchable_items.wanikani_slug, | |
searchable_items.text, | |
searchable_items.parts_of_speech, | |
searchable_items.wanikani_component_item_ids, | |
searchable_items.wanikani_created_at, | |
searchable_items.wanikani_updated_at, | |
searchable_items.created_at, | |
searchable_items.updated_at, | |
searchable_items.wanikani_id, | |
searchable_items.meaning_texts, | |
searchable_items.reading_texts, | |
searchable_items.source, | |
searchable_items.jmdict_id, | |
searchable_items.jmdict_updated_on, | |
searchable_items.priority, | |
searchable_items.alt_texts, | |
searchable_items.usually_written_using_kana_alone, | |
searchable_items.common_word, | |
searchable_items.conjugation_code, | |
searchable_items.j_text, | |
massage_english(definitions.text) e_text --- And then there's Maude! | |
from searchable_items | |
join definitions on searchable_items.id = definitions.item_id and (definitions.official or definitions.user_id = :user_id) | |
where ( | |
:e_tokens_present | |
and | |
( | |
definitions.text = :query | |
or | |
massage_english(definitions.text) = any (array[:e_tokens]) | |
) | |
) | |
limit :english_definition_limit | |
) union ( | |
-- Alternate Japanese spellings (user defined + official) | |
select distinct on (searchable_items.id) | |
searchable_items.id, | |
searchable_items.type, | |
searchable_items.wanikani_api_url, | |
searchable_items.wanikani_app_url, | |
searchable_items.wanikani_level, | |
searchable_items.wanikani_slug, | |
searchable_items.text, | |
searchable_items.parts_of_speech, | |
searchable_items.wanikani_component_item_ids, | |
searchable_items.wanikani_created_at, | |
searchable_items.wanikani_updated_at, | |
searchable_items.created_at, | |
searchable_items.updated_at, | |
searchable_items.wanikani_id, | |
searchable_items.meaning_texts, | |
searchable_items.reading_texts, | |
searchable_items.source, | |
searchable_items.jmdict_id, | |
searchable_items.jmdict_updated_on, | |
searchable_items.priority, | |
searchable_items.alt_texts, | |
searchable_items.usually_written_using_kana_alone, | |
searchable_items.common_word, | |
searchable_items.conjugation_code, | |
spellings.text j_text, --- And then there's Maude! | |
searchable_items.e_text | |
from searchable_items | |
join spellings on searchable_items.id = spellings.item_id and (spellings.official or spellings.user_id = :user_id) | |
where ( | |
:j_tokens_present | |
and | |
( | |
spellings.text = :query | |
or | |
spellings.text = any (array[:j_tokens]) | |
) | |
) | |
limit :japanese_spelling_limit | |
)) o1 | |
) o2 | |
order by o2.rank desc | |
limit :results_limit |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- There are edge cases that are expensive to query but still important | |
-- to cover, which I'm lumping in this query and only executing it when | |
-- the first query's results are garbage (in Ruby, I score every query result | |
-- based on a dozen factors in order to sort them more accurately and if they're bad | |
-- and there's time remaining on the self-imposed search SLA, I will run this query too | |
-- | |
-- Note I pulled a couple out of here and into the main query after speeding them up significantly | |
-- so right now there is only a Japanese similarity query (again, b/c trigram ops are slow on byte | |
-- strings and Japanese isn't strictly supported | |
select *, | |
(select count(*) from learnings where user_id = :user_id and item_id = o.id) learnings_count | |
from (( | |
-- Misspellings in Japanese queries (similarity is not possible with pg_trgm, so using levenshtein) | |
select distinct on (searchable_items.id) searchable_items.*, | |
(case when :j_tokens_present then similarity(j_text::bytea::text, :j_query::bytea::text) else 0 end) rank | |
from searchable_items | |
where :j_query_present and levenshtein_less_equal(j_text, :j_query, 1) < 2 | |
limit :j_similar_limit | |
)) o | |
order by o.rank desc | |
limit :results_limit |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment