Skip to content

Instantly share code, notes, and snippets.

@w32zhong
Last active April 17, 2026 06:36
Show Gist options
  • Select an option

  • Save w32zhong/aa8f8b602bcabea3fcca1f778cb0761b to your computer and use it in GitHub Desktop.

Select an option

Save w32zhong/aa8f8b602bcabea3fcca1f778cb0761b to your computer and use it in GitHub Desktop.
# Demo crate: Chinese full-text search with tantivy and a custom Jieba tokenizer.
[package]
name = "chinese_search"
version = "0.1.0"
edition = "2024"
[dependencies]
# Chinese word segmentation; drives the custom tokenizer in tokenizer.rs.
jieba-rs = "0.9.0"
# Full-text search engine (index, schema, PhraseQuery).
tantivy = "0.26.0"
# NOTE(review): main.rs uses jieba-rs and tantivy directly; confirm tantivy-jieba
# is actually needed and that these three versions are mutually compatible.
tantivy-jieba = "0.19.0"
mod tokenizer;
use tokenizer::CustomJiebaTokenizer;
use jieba_rs::Jieba;
use tantivy::schema::*;
use tantivy::{doc, Index, Term};
use tantivy::query::PhraseQuery;
use tantivy::collector::TopDocs;
/// Demo: index one Chinese document with a custom Jieba tokenizer, then run a
/// programmatically constructed `PhraseQuery` with a non-default slop.
fn main() -> tantivy::Result<()> {
    // Teach Jieba the custom word "成為" so it is segmented as one token.
    let mut jieba = Jieba::new();
    jieba.add_word("成為", None, None);
    let custom_tokenizer = CustomJiebaTokenizer::new(jieba);

    let text = "貓和狗可以成為最好的朋友";

    // Schema: a single stored text field, indexed with positions —
    // positions are required for phrase queries to work.
    let mut schema_builder = Schema::builder();
    let indexing = TextFieldIndexing::default()
        .set_tokenizer("custom_jieba")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let text_options = TextOptions::default()
        .set_indexing_options(indexing)
        .set_stored();
    let text_field = schema_builder.add_text_field("text", text_options);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema.clone());
    index.tokenizers().register(
        "custom_jieba",
        tantivy::tokenizer::TextAnalyzer::builder(custom_tokenizer).build(),
    );

    // Index the single demo document.
    let mut index_writer = index.writer(15_000_000)?;
    index_writer.add_document(doc!(text_field => text))?;
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();

    // ==========================================
    // Programmatic PhraseQuery with custom Slop
    // ==========================================
    let search_phrase = "貓狗朋友";
    let program_slop: u32 = 10;

    // 1. Tokenize the search phrase with the registered analyzer, keeping each
    //    token's position so relative offsets inside the phrase stay intact.
    let mut analyzer = index.tokenizers().get("custom_jieba").unwrap();
    let mut token_stream = analyzer.token_stream(search_phrase);
    let mut terms_with_offset = Vec::new();
    while let Some(token) = token_stream.next() {
        let term = Term::from_field_text(text_field, &token.text);
        terms_with_offset.push((token.position, term));
    }

    // 2. Build the phrase query directly, passing the custom slop.
    let query = PhraseQuery::new_with_offset_and_slop(terms_with_offset, program_slop);
    println!("Programmatically constructed query: {:?}", query);

    let top_docs = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?;
    println!("Search results for programmatic '{}' with slop {}:", search_phrase, program_slop);
    for (_score, doc_address) in top_docs {
        let retrieved_doc = searcher.doc::<tantivy::TantivyDocument>(doc_address)?;
        println!("Found document: {}", retrieved_doc.to_json(&schema));
    }
    Ok(())
}
use jieba_rs::Jieba;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
/// Tantivy tokenizer backed by a `jieba_rs::Jieba` segmenter.
#[derive(Clone)]
pub struct CustomJiebaTokenizer {
    // Arc so cloned tokenizer handles share one Jieba dictionary.
    jieba: std::sync::Arc<Jieba>,
}

impl CustomJiebaTokenizer {
    /// Wrap a configured `Jieba` instance for use as a tantivy tokenizer.
    pub fn new(jieba: Jieba) -> Self {
        let jieba = std::sync::Arc::new(jieba);
        Self { jieba }
    }
}
impl Tokenizer for CustomJiebaTokenizer {
    type TokenStream<'a> = CustomJiebaTokenStream<'a>;

    /// Segment `text` with Jieba (default mode, HMM disabled) and wrap the
    /// result in a stream that assigns sequential token positions.
    fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> {
        let segmented = self
            .jieba
            .tokenize(text, jieba_rs::TokenizeMode::Default, false);
        CustomJiebaTokenStream {
            text,
            tokens: segmented,
            index: 0,
            token: Token::default(),
            sequence_position: 0,
        }
    }
}
/// Streaming adapter that converts Jieba tokens into tantivy `Token`s.
pub struct CustomJiebaTokenStream<'a> {
    // Original input text; used to compute byte offsets for each token.
    text: &'a str,
    // Jieba segmentation result over `text` (subslices of the same buffer).
    tokens: Vec<jieba_rs::Token<'a>>,
    // Index of the next entry in `tokens` to examine.
    index: usize,
    // Scratch token reused across `advance` calls; exposed via `token()`.
    token: Token,
    // Sequential position counter; whitespace tokens are skipped in `advance`,
    // so emitted positions are dense.
    sequence_position: usize,
}
impl<'a> TokenStream for CustomJiebaTokenStream<'a> {
    /// Advance to the next non-whitespace token, filling `self.token`.
    ///
    /// Whitespace-only tokens emitted by Jieba are skipped entirely: they get
    /// no position, so surviving tokens receive dense sequential positions.
    /// Returns `false` once the underlying token list is exhausted.
    fn advance(&mut self) -> bool {
        while self.index < self.tokens.len() {
            let raw = &self.tokens[self.index];
            self.index += 1;

            // Skip whitespace-only tokens.
            let word = raw.word.trim();
            if word.is_empty() {
                continue;
            }

            self.token.text.clear();
            self.token.text.push_str(word);
            // Byte offsets into the original text. BUGFIX: the offset must be
            // taken from the TRIMMED subslice `word`, not from `raw.word` —
            // `str::trim` returns a subslice of the same buffer, so its pointer
            // points at the first non-whitespace byte. Using `raw.word.as_ptr()`
            // with `word.len()` shifted the range left for tokens carrying
            // leading whitespace, covering the whitespace and truncating the
            // word's final bytes.
            self.token.offset_from = word.as_ptr() as usize - self.text.as_ptr() as usize;
            self.token.offset_to = self.token.offset_from + word.len();

            // Dense sequential positional indices (whitespace tokens excluded).
            self.token.position = self.sequence_position;
            self.token.position_length = 1;
            self.sequence_position += 1;
            return true;
        }
        false
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment