Last active
June 23, 2021 22:34
-
-
Save frsyuki/3ace7d2188699b058570939535cca964 to your computer and use it in GitHub Desktop.
PostgreSQL bi-gram full text search without application code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- ref: https://www.clear-code.com/blog/2020/12/22.html | |
create type fts_bigram_token as (token text, position bigint); | |
create or replace function fts_bigram_tokenize(value text) | |
returns setof fts_bigram_token as $$ | |
select trim(coalesce(lag(c) over (), '') || c), row_number() over () | |
from ( | |
select unnest( | |
regexp_matches(regexp_replace(value, '[[:punct:]]', ' ', 'g') || ' ', '(?!\s\s).', 'g') | |
) as c | |
) chars | |
$$ language sql immutable; | |
create or replace function fts_bigram_to_tsvector(cols text[]) | |
returns tsvector as $$ | |
select string_agg(token || ':' || position, ' ')::tsvector | |
from ( | |
select token, position from fts_bigram_tokenize(array_to_string(cols, ' ')) | |
) tokenizer | |
$$ language sql immutable; | |
create aggregate tsquery_phrase_agg(tsquery) ( | |
SFUNC = tsquery_phrase, | |
STYPE = tsquery | |
); | |
create or replace function fts_bigram_to_tsquery(word text) | |
returns tsquery as $$ | |
select | |
coalesce(q, tsquery(trim(word))) | |
from ( | |
select tsquery_phrase_agg(tsquery(token)) as q | |
from ( | |
select token, position from fts_bigram_tokenize(word) | |
) tokenizer | |
where length(token) > 1 | |
) phrase | |
$$ language sql immutable; | |
-- Setup table | |
drop table if exists fts_test cascade; | |
create table fts_test ( | |
id bigserial primary key, | |
name text, | |
descr text | |
); | |
create index fts_test_gist on fts_test using gist (fts_bigram_to_tsvector(array[name, descr])); | |
create view fts_test_search | |
as select *, fts_bigram_to_tsvector(array[name, descr]) as fts_text | |
from fts_test; | |
-- Insert example data | |
insert into fts_test (name, descr) | |
values ('楽しい', '日本 京都府庁'), | |
('検索結果は', '東京都庁'), | |
('English', 'search abc ttt'); | |
select * from fts_test_search; | |
-- id | name | descr | fts_text | |
-- ----+------------+----------------+----------------------------------------------------------------------------------------------------------------------------------------------------------- | |
-- 1 | 楽しい | 日本 京都府庁 | 'い':4 'しい':3 '京':8 '京都':9 '庁':12 '府庁':11 '日':5 '日本':6 '本':7 '楽':1 '楽し':2 '都府':10 | |
-- 2 | 検索結果は | 東京都庁 | 'は':6 '京都':9 '庁':11 '東':7 '東京':8 '果は':5 '検':1 '検索':2 '索結':3 '結果':4 '都庁':10 | |
-- 3 | English | search abc ttt | 'E':1 'En':2 'a':16 'ab':17 'ar':12 'bc':18 'c':19 'ch':14 'ea':11 'gl':4 'h':8,15 'is':6 'li':5 'ng':3 'rc':13 's':9 'se':10 'sh':7 't':20,23 'tt':21,22 | |
-- Query | |
select fts_bigram_to_tsquery('東京都'); | |
-- fts_bigram_to_tsquery | |
-- ----------------------- | |
-- '東京' <-> '京都' | |
select id, name, descr, ts_rank(fts_text, fts_bigram_to_tsquery('東京都')) as ts_rank | |
from fts_test_search | |
where fts_text @@ fts_bigram_to_tsquery('東京都') | |
order by ts_rank desc; | |
-- id | name | descr | ts_rank | |
-- ----+------------+----------+----------- | |
-- 2 | 検索結果は | 東京都庁 | 0.0991032 | |
explain analyze | |
select id, name, descr | |
from fts_test_search | |
where fts_text @@ fts_bigram_to_tsquery('東京都'); | |
-- QUERY PLAN | |
-- ------------------------------------------------------------------------------------------------------------------------- | |
-- Index Scan using fts_test_gist on fts_test (cost=0.14..8.16 rows=1 width=72) (actual time=0.185..0.185 rows=1 loops=1) | |
-- Index Cond: (fts_bigram_to_tsvector(ARRAY[name, descr]) @@ '''東京'' <-> ''京都'''::tsquery) | |
-- Planning Time: 0.306 ms | |
-- Execution Time: 0.204 ms | |
explain analyze | |
select id, name, descr, ts_rank(fts_text, fts_bigram_to_tsquery('東京都')) as ts_rank | |
from fts_test_search | |
where fts_text @@ fts_bigram_to_tsquery('東京都') | |
order by ts_rank desc; | |
-- QUERY PLAN | |
-- ------------------------------------------------------------------------------------------------------------------------------- | |
-- Sort (cost=8.42..8.43 rows=1 width=76) (actual time=0.459..0.459 rows=1 loops=1) | |
-- Sort Key: (ts_rank(fts_bigram_to_tsvector(ARRAY[fts_test.name, fts_test.descr]), '''東京'' <-> ''京都'''::tsquery)) DESC | |
-- Sort Method: quicksort Memory: 25kB | |
-- -> Index Scan using fts_test_gist on fts_test (cost=0.14..8.41 rows=1 width=76) (actual time=0.455..0.456 rows=1 loops=1) | |
-- Index Cond: (fts_bigram_to_tsvector(ARRAY[name, descr]) @@ '''東京'' <-> ''京都'''::tsquery) | |
-- Planning Time: 0.466 ms | |
-- Execution Time: 0.481 ms |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment