class MarkdownLLMSplitter()
A class to split Markdown files into sections according to GPT token size limits. Currently supports OpenAI models only, since it uses the tiktoken
library for tokenization.
Attributes: gptoker: GPT tokenizer instance used to calculate token sizes. gptok_limit: The maximum number of GPT tokens allowed per section. md_meta: Metadata found in the source Markdown file. md_str: The source Markdown string. md_doc: The parsed source Markdown document as a mistletoe Document instance. md_dict: A dictionary representing the structure of the Markdown document. md_path: The absolute path to the source Markdown file. md_sections: List of sections (dicts) containing the Markdown content as “md” and its size in tokens as “gptok_size”.
Args: gptok_model: The GPT tokenizer model to use for calculating token sizes, defaults to “gpt-3.5-turbo”. gptok_limit: The maximum number of GPT tokens allowed per section, defaults to the model’s maximum tokens.
def load_md_path(md_path: Union[Path, str]) -> None
Load a Markdown file from a file path.
Args: md_path: The file path to the source Markdown file.
def load_md_file(md_file: TextIOWrapper) -> None
Load a Markdown file from a file-like object.
Args: md_file: The file-like object containing the source Markdown content.
def load_md_str(md_str: str) -> None
Load a Markdown file from a string.
Args: md_str: The source Markdown content as a string.
def load_md(md: Union[str, Path, TextIOWrapper]) -> None
Load a Markdown file from a string, file path, or file-like object.
Args: md: The source Markdown content, can be a string, file path or file-like object.
@lru_cache(maxsize=None)
def gpttok_size(text: str) -> int
Calculates the number of GPT tokens in a text string.
Args: text: The text string to calculate the token size.
Returns: The number of GPT tokens in the text string.
def build_md_dict() -> None
Builds a dictionary representing the structure of the source Markdown document.
def calculate_sizes(md_dict: Dict) -> int
Recursively calculates the total size of GPT tokens in the provided Markdown dictionary.
Args: md_dict: The Markdown dictionary to calculate token sizes.
Returns: The total number of GPT tokens in the Markdown dictionary.
def process_item(
item: Dict, current_section: List[str], current_size: int,
md_sections: List[Dict[str, Union[str,
int]]]) -> Tuple[List[str], int]
Processes an item in the Markdown dictionary and adds it to the appropriate section.
Args: item: The Markdown item to process. current_section: The current section being built as a list of strings. current_size: The current size of the section in GPT tokens. md_sections: The list of sections (dicts with “md” and “gptok_size”) being built.
Returns: A tuple containing the updated current_section and the current_size.
def prep_section(section_text: str,
size: int = None) -> Dict[str, Union[str, int]]
Prepares a section by removing excessive newlines and calculating the section size if not provided.
Args: section_text: The Markdown content of the section. size: The size of the section in GPT tokens, defaults to None (automatically calculated).
Returns: Dict with the prepared Markdown content as “md” and its size in tokens as “gptok_size”.
def process_md(
item: Dict, current_section: List[str], current_size: int,
md_sections: List[Dict[str, Union[str,
int]]]) -> Tuple[List[str], int]
Processes a Markdown item and adds it to the appropriate section.
Args: item: The Markdown item to process. current_section: The current section being built as a list of strings. current_size: The current size of the section in GPT tokens. md_sections: The list of sections (dicts with “md” and “gptok_size”) being built.
Returns: A tuple containing the updated current_section and the current_size.
def get_sections_from_md_dict_by_limit(
md_dict: Dict) -> List[Dict[str, Union[str, int]]]
Builds the sections from the provided Markdown dictionary by fitting the content within token limits.
Args: md_dict: The Markdown dictionary to build sections from.
Returns: A list of sections (dicts) containing the Markdown content as “md” and its size in tokens as “gptok_size”.
def build() -> None
Builds the sections by processing the loaded Markdown document.
def list_section_dicts() -> List[Dict[str, Union[str, int]]]
Returns a list of section dictionaries containing the Markdown content and its size.
Returns: A list of dictionaries with keys "md" and "gptok_size".
def gen_section_dicts() -> Generator[Dict[str, Union[str, int]], None, None]
Generator that yields section dictionaries containing the Markdown content and its size.
def list_section_texts() -> List[str]
Returns a list of section texts containing the Markdown content.
Returns: A list of strings with the Markdown content of each section.
def gen_section_texts() -> Generator[str, None, None]
Generator that yields the Markdown content of each section.
def split(md: Union[str, Path, TextIOWrapper]) -> List[str]
Loads the given Markdown content and splits it into sections according to the GPT token size limits.
Args: md: The source Markdown content, can be a string, file path or file-like object.
Returns: A list of strings with the Markdown content of each section.
def split(md: Union[str, Path, TextIOWrapper],
model: str = "gpt-3.5-turbo",
limit: int = None) -> List[str]
A utility function to split a Markdown document into sections according to GPT token size limits.
Args: md: The source Markdown content, can be a string, file path or file-like object. model: The GPT tokenizer model to use for calculating token sizes, defaults to “gpt-3.5-turbo”. limit: The maximum number of GPT tokens allowed per section, defaults to the model’s maximum tokens.
Returns: A list of strings with the Markdown content of each section.
def split_md_file(md_path: Union[str, Path],
model: str = "gpt-3.5-turbo",
limit: int = None,
separator: str = "=== SPLIT ===") -> str
Splits a Markdown file into sections according to GPT token size limits.
This tool loads a Markdown file, and splits its content into sections that are within the specified token size limit using the desired GPT tokenizing model. The resulting sections are then concatenated using the specified separator and returned as a single string.
Args: md_path: The path of the source Markdown file to be split. model: The GPT tokenizer model to use for calculating token sizes. Defaults to “gpt-3.5-turbo”. limit: The maximum number of GPT tokens allowed per section. Defaults to the model’s maximum tokens. separator: The string used to separate sections in the output. Defaults to “=== SPLIT ===”.
Returns: A single string containing the Markdown content of the file, split into sections and separated by the specified separator.