Duc Trung
init backend
ee00031
raw
history blame
522 Bytes
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.schema import Document
def split_text_by_markdown(input_md: str) -> list:
    """Split Markdown text into ``Document`` objects along its headers.

    A ``MarkdownHeaderTextSplitter`` is configured to break the text at
    ``#``, ``##`` and ``###`` headers, registered under the names
    "Header 1", "Header 2" and "Header 3" respectively. Every chunk the
    splitter yields is re-wrapped in a new ``Document`` that carries the
    chunk's ``page_content`` and ``metadata``.

    Args:
        input_md: The Markdown source text to split.

    Returns:
        A list of ``Document`` instances, one per chunk, in the order the
        splitter produced them.
    """
    md_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]
    )

    wrapped = []
    for section in md_splitter.split_text(input_md):
        # The metadata object is passed through as-is (not copied), exactly
        # as the splitter returned it.
        wrapped.append(
            Document(
                page_content=section.page_content,
                metadata=section.metadata,
            )
        )
    return wrapped