Last active
April 17, 2026 06:36
-
-
Save w32zhong/aa8f8b602bcabea3fcca1f778cb0761b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Manifest for a Chinese full-text-search demo: tantivy index with a
# jieba-based custom tokenizer.
[package]
name = "chinese_search"
version = "0.1.0"
edition = "2024"

[dependencies]
jieba-rs = "0.9.0"       # Chinese word segmentation
tantivy = "0.26.0"       # full-text search engine
tantivy-jieba = "0.19.0" # jieba integration helpers for tantivy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| mod tokenizer; | |
| use tokenizer::CustomJiebaTokenizer; | |
| use jieba_rs::Jieba; | |
| use tantivy::schema::*; | |
| use tantivy::{doc, Index, Term}; | |
| use tantivy::query::PhraseQuery; | |
| use tantivy::collector::TopDocs; | |
| fn main() -> tantivy::Result<()> { | |
| let mut jieba = Jieba::new(); | |
| jieba.add_word("成為", None, None); | |
| let custom_tokenizer = CustomJiebaTokenizer::new(jieba); | |
| let text = "貓和狗可以成為最好的朋友"; | |
| let mut schema_builder = Schema::builder(); | |
| let text_options = TextOptions::default().set_indexing_options( | |
| TextFieldIndexing::default() | |
| .set_tokenizer("custom_jieba") | |
| .set_index_option(IndexRecordOption::WithFreqsAndPositions) | |
| ).set_stored(); | |
| let text_field = schema_builder.add_text_field("text", text_options); | |
| let schema = schema_builder.build(); | |
| let index = Index::create_in_ram(schema.clone()); | |
| index.tokenizers().register("custom_jieba", tantivy::tokenizer::TextAnalyzer::builder(custom_tokenizer).build()); | |
| let mut index_writer = index.writer(15_000_000)?; | |
| index_writer.add_document(doc!(text_field => text))?; | |
| index_writer.commit()?; | |
| let reader = index.reader()?; | |
| let searcher = reader.searcher(); | |
| // ========================================== | |
| // Programmatic PhraseQuery with custom Slop | |
| // ========================================== | |
| let search_phrase = "貓狗朋友"; | |
| let program_slop: u32 = 10; | |
| // 1. We tokenize the search phrase using the registered analyzer | |
| let mut analyzer = index.tokenizers().get("custom_jieba").unwrap(); | |
| let mut token_stream = analyzer.token_stream(search_phrase); | |
| let mut terms_with_offset = Vec::new(); | |
| while let Some(token) = token_stream.next() { | |
| let term = Term::from_field_text(text_field, &token.text); | |
| // We use the token's position to keep relative offsets intact! | |
| terms_with_offset.push((token.position, term)); | |
| } | |
| // 2. Programmatically create the query with the custom slop | |
| let query = PhraseQuery::new_with_offset_and_slop(terms_with_offset, program_slop); | |
| println!("Programmatically constructed query: {:?}", query); | |
| let top_docs = searcher.search(&query, &TopDocs::with_limit(10).order_by_score())?; | |
| println!("Search results for programmatic '{}' with slop {}:", search_phrase, program_slop); | |
| for (_score, doc_address) in top_docs { | |
| let retrieved_doc = searcher.doc::<tantivy::TantivyDocument>(doc_address)?; | |
| println!("Found document: {}", retrieved_doc.to_json(&schema)); | |
| } | |
| Ok(()) | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| use jieba_rs::Jieba; | |
| use tantivy::tokenizer::{Token, TokenStream, Tokenizer}; | |
/// Tantivy tokenizer backed by a jieba segmenter.
#[derive(Clone)]
pub struct CustomJiebaTokenizer {
    // Arc so that cloning the tokenizer (tantivy requires Clone) shares
    // the segmenter rather than duplicating its dictionary.
    jieba: std::sync::Arc<Jieba>,
}
| impl CustomJiebaTokenizer { | |
| pub fn new(jieba: Jieba) -> Self { | |
| Self { | |
| jieba: std::sync::Arc::new(jieba), | |
| } | |
| } | |
| } | |
| impl Tokenizer for CustomJiebaTokenizer { | |
| type TokenStream<'a> = CustomJiebaTokenStream<'a>; | |
| fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> { | |
| let tokens = self.jieba.tokenize(text, jieba_rs::TokenizeMode::Default, false); | |
| CustomJiebaTokenStream { | |
| text, | |
| tokens, | |
| index: 0, | |
| token: Token::default(), | |
| sequence_position: 0, | |
| } | |
| } | |
| } | |
/// Streaming view over the jieba tokens produced for one input string.
pub struct CustomJiebaTokenStream<'a> {
    text: &'a str,                    // original input, used for byte-offset arithmetic
    tokens: Vec<jieba_rs::Token<'a>>, // eagerly computed jieba segmentation
    index: usize,                     // next entry in `tokens` to consume
    token: Token,                     // tantivy token reused across advance() calls
    sequence_position: usize,         // next sequential position to assign
}
| impl<'a> TokenStream for CustomJiebaTokenStream<'a> { | |
| fn advance(&mut self) -> bool { | |
| while self.index < self.tokens.len() { | |
| let t = &self.tokens[self.index]; | |
| let word = t.word.trim(); | |
| self.index += 1; | |
| // Skip whitespace tokens | |
| if word.is_empty() { | |
| continue; | |
| } | |
| self.token.text.clear(); | |
| self.token.text.push_str(word); | |
| self.token.offset_from = t.word.as_ptr() as usize - self.text.as_ptr() as usize; | |
| self.token.offset_to = self.token.offset_from + word.len(); | |
| // Use sequential positional indices | |
| self.token.position = self.sequence_position; | |
| self.token.position_length = 1; | |
| self.sequence_position += 1; | |
| return true; | |
| } | |
| false | |
| } | |
| fn token(&self) -> &Token { | |
| &self.token | |
| } | |
| fn token_mut(&mut self) -> &mut Token { | |
| &mut self.token | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment