from typing import List, Optional

import tiktoken
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

embedding_model = get_registry().get("openai").create()


class TextDocument(LanceModel):
    """Simple data structure to hold a piece of text associated with a url."""

    url: str
    position: int
    text: str = embedding_model.SourceField()
    vector: Vector(dim=embedding_model.ndims()) = embedding_model.VectorField()
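To use this schema, create a LanceDB table directly from the model. A minimal sketch, assuming a local database directory and a table name (both hypothetical, not from the original gist) and that OPENAI_API_KEY is set in the environment:

import lancedb

# Hypothetical database path and table name, for illustration only
db = lancedb.connect("./lancedb")
table = db.create_table("documents", schema=TextDocument, exist_ok=True)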
# Constants
CHUNK_SIZE = 2000  # Target size of each chunk, in tokens
MIN_CHUNK_LENGTH_TO_EMBED = 5  # Discard chunks of this many characters or fewer

# Initialize the tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")
def get_text_chunks(text: str, chunk_token_size: Optional[int] = CHUNK_SIZE) -> List[str]:
    """Split text into chunks of approximately `chunk_token_size` tokens.

    Args:
        text: The input text to chunk.
        chunk_token_size: Target size of each chunk, in tokens.

    Returns:
        A list of text chunks.
    """
    if not text.strip():
        return []
    tokens = tokenizer.encode(text)
    chunks = []
    chunk_size = chunk_token_size or CHUNK_SIZE
    while tokens:
        # Extract chunk tokens and decode to text
        chunk_tokens = tokens[:chunk_size]
        chunk_text = tokenizer.decode(chunk_tokens).strip()
        # Keep the chunk only if it is long enough to be worth embedding
        if len(chunk_text) > MIN_CHUNK_LENGTH_TO_EMBED:
            chunks.append(chunk_text)
        # Remove processed tokens
        tokens = tokens[len(chunk_tokens):]
    return chunks
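Putting the pieces together, a hedged usage sketch: chunk a page's text and add the chunks to the table. Because the schema registers the OpenAI embedding function via SourceField/VectorField, LanceDB fills in the vector column automatically, so only url, position, and text are supplied; the URL and text values below are placeholders.

# Illustrative values, not part of the original gist
page_url = "https://example.com/post"
page_text = "..."  # raw text you want to index

chunks = get_text_chunks(page_text)
table.add(
    [
        {"url": page_url, "position": i, "text": chunk}
        for i, chunk in enumerate(chunks)
    ]
)

# Query strings are embedded with the same model before the vector search
results = table.search("what is this post about?").limit(3).to_pydantic(TextDocument)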