File size: 522 Bytes
ee00031
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.schema import Document

def split_text_by_markdown(input_md: str) -> list:
    """Split markdown text into documents along its ``#``/``##``/``###`` headers.

    The text is passed to a ``MarkdownHeaderTextSplitter`` configured with
    three header markers, labelled "Header 1" through "Header 3". Every
    chunk the splitter yields is re-wrapped in a new ``Document`` that
    carries the chunk's ``page_content`` and ``metadata``.

    Args:
        input_md: Markdown source text to split.

    Returns:
        A list of ``Document`` objects, one per chunk returned by the
        splitter, in the order the splitter produced them.
    """
    header_levels = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
    md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=header_levels)

    wrapped = []
    for piece in md_splitter.split_text(input_md):
        # Build a fresh Document rather than passing the splitter's object through.
        wrapped.append(
            Document(page_content=piece.page_content, metadata=piece.metadata)
        )
    return wrapped