from typing import List, Optional

import tiktoken
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

# Embedding function from LanceDB's registry; the OpenAI model requires
# OPENAI_API_KEY to be set in the environment.
embedding_model = get_registry().get("openai").create()


class TextDocument(LanceModel):
    """Simple data structure to hold a piece of text associated with a url."""

    url: str
    position: int
    text: str = embedding_model.SourceField()
    vector: Vector(dim=embedding_model.ndims()) = embedding_model.VectorField()
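
# A sketch of querying a table built from this schema (an illustration, not
# part of the original gist): because an embedding function is attached, a
# plain string query is embedded with the same OpenAI model before the search.
#
#   results = table.search("what is lancedb?").limit(5).to_pydantic(TextDocument)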

# Constants
CHUNK_SIZE = 2000  # Target size of each chunk in tokens
MIN_CHUNK_LENGTH_TO_EMBED = 5  # Discard chunks of this many characters or fewer

# Initialize the tokenizer (cl100k_base is the encoding used by OpenAI's
# recent embedding and chat models)
tokenizer = tiktoken.get_encoding("cl100k_base")


def get_text_chunks(text: str, chunk_token_size: Optional[int] = CHUNK_SIZE) -> List[str]:
    """Split text into chunks of approximately `chunk_token_size` tokens.

    Args:
        text: The input text to chunk.
        chunk_token_size: Target size of each chunk in tokens.

    Returns:
        A list of text chunks.
    """
    if not text.strip():
        return []
    tokens = tokenizer.encode(text)
    chunks = []
    chunk_size = chunk_token_size or CHUNK_SIZE
    while tokens:
        # Extract the next chunk's tokens and decode them back to text
        chunk_tokens = tokens[:chunk_size]
        chunk_text = tokenizer.decode(chunk_tokens).strip()
        # Skip chunks that are too short to be worth embedding
        if len(chunk_text) > MIN_CHUNK_LENGTH_TO_EMBED:
            chunks.append(chunk_text)
        # Drop the processed tokens and continue with the remainder
        tokens = tokens[len(chunk_tokens):]
    return chunks
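

if __name__ == "__main__":
    # Usage sketch; an illustration under stated assumptions, not part of the
    # original gist. Chunk a sample document and add the rows; LanceDB fills in
    # the `vector` column automatically from the `text` SourceField via the
    # OpenAI embedding function (requires OPENAI_API_KEY and network access).
    import lancedb

    db = lancedb.connect("./lancedb")  # hypothetical local database path
    table = db.create_table("documents", schema=TextDocument, mode="overwrite")

    sample_url = "https://example.com/doc"  # hypothetical URL
    sample_text = "LanceDB stores embeddings alongside the source text. " * 500
    table.add(
        [
            {"url": sample_url, "position": i, "text": chunk}
            for i, chunk in enumerate(get_text_chunks(sample_text))
        ]
    )
    print(f"Added {table.count_rows()} row(s) to the 'documents' table")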