Change chunking strategy to combine markdown and recursive splitting
Files changed:
- Readme.md (+7 -8)
- utils/chunking.py (+34 -6)
Readme.md CHANGED
@@ -1,13 +1,12 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: FastAPI Backend ChatbotRAG
+emoji: 💻
+colorFrom: purple
+colorTo: yellow
 sdk: docker
 pinned: false
+license: mit
+short_description: This is backend of chatbotRAG project
 ---
 
-
-
-This is the FastAPI backend for my RAG chatbot.
-It processes PDFs, stores embeddings in Pinecone, and answers queries using Groq + Gemini.
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

utils/chunking.py CHANGED
@@ -1,13 +1,41 @@
-from langchain.text_splitter import MarkdownHeaderTextSplitter
+from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
 from langchain.schema import Document
+import tiktoken
 
-def split_text_by_markdown(input_md: str) -> list:
+def split_text_by_markdown(input_md: str, max_tokens: int = 2048, model: str = "cl100k_base") -> list:
+    # Step 1: Split by headers
     headers_to_split_on = [
         ("#", "Header 1"),
         ("##", "Header 2"),
         ("###", "Header 3"),
     ]
-
-
-
-
+    md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+    header_chunks = md_splitter.split_text(input_md)
+
+    # Step 2: Tokenizer (OpenAI/Groq style)
+    encoding = tiktoken.get_encoding(model)
+
+    # Step 3: For each header chunk, further split if it is too long
+    final_docs = []
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,   # characters per chunk (roughly ~500 tokens, safe buffer)
+        chunk_overlap=100  # overlap to preserve context
+    )
+
+    for chunk in header_chunks:
+        token_count = len(encoding.encode(chunk.page_content))
+
+        if token_count > max_tokens:
+            # Split into smaller parts
+            sub_chunks = text_splitter.split_text(chunk.page_content)
+            for sub in sub_chunks:
+                final_docs.append(
+                    Document(page_content=sub, metadata=chunk.metadata)
+                )
+        else:
+            # Keep as is
+            final_docs.append(
+                Document(page_content=chunk.page_content, metadata=chunk.metadata)
+            )
+
+    return final_docs
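
For reference, a minimal usage sketch of the new function. It assumes the module lives at utils/chunking.py as shown above and that langchain and tiktoken are installed; the sample markdown string is invented for illustration.

# Usage sketch for the combined markdown + recursive chunking above.
# `sample_md` is a made-up document; any markdown string works.
from utils.chunking import split_text_by_markdown

sample_md = """# Intro
Short introduction.

## Details
A longer section that, in a real document, might exceed the token budget.
"""

docs = split_text_by_markdown(sample_md, max_tokens=2048)
for doc in docs:
    print(doc.metadata, len(doc.page_content))

One design note: the oversize check is measured in tokens (max_tokens, default 2048) while the fallback RecursiveCharacterTextSplitter cuts by characters (chunk_size=1000, roughly ~500 tokens), so oversized chunks land well under the token budget. If both limits should use the same unit, langchain's RecursiveCharacterTextSplitter.from_tiktoken_encoder is one token-aware alternative.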