| from langchain.text_splitter import MarkdownHeaderTextSplitter | |
| from langchain.schema import Document | |
def split_text_by_markdown(input_md: str) -> list:
    """Split a Markdown string into ``Document`` chunks along its headers.

    The text is passed to a ``MarkdownHeaderTextSplitter`` configured to
    split on ``#``, ``##`` and ``###``, paired with the names "Header 1",
    "Header 2" and "Header 3". Each chunk the splitter produces is then
    re-wrapped in a new ``Document`` built from that chunk's
    ``page_content`` and ``metadata``.

    Args:
        input_md: Markdown source text to split.

    Returns:
        A list with one ``Document`` per chunk, in the order the splitter
        produced them.
    """
    md_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]
    )

    result = []
    for piece in md_splitter.split_text(input_md):
        # Build a fresh Document per chunk; the metadata object is passed
        # through as-is rather than copied.
        result.append(
            Document(page_content=piece.page_content, metadata=piece.metadata)
        )
    return result