ngdwtrg committed on
Commit dc8b7be · 1 Parent(s): 88ab67c

change chunking style to combine markdown header and recursive splitting

Files changed (2)
  1. Readme.md +7 -8
  2. utils/chunking.py +34 -6
Readme.md CHANGED
@@ -1,13 +1,12 @@
 ---
-title: RAG Chatbot Backend
-emoji: 🤖
-colorFrom: blue
-colorTo: purple
+title: FastAPI Backend ChatbotRAG
+emoji: 💻
+colorFrom: purple
+colorTo: yellow
 sdk: docker
 pinned: false
+license: mit
+short_description: This is backend of chatbotRAG project
 ---
 
-# RAG Chatbot Backend
-
-This is the FastAPI backend for my RAG chatbot.
-It processes PDFs, stores embeddings in Pinecone, and answers queries using Groq + Gemini.
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
utils/chunking.py CHANGED
@@ -1,13 +1,41 @@
-from langchain.text_splitter import MarkdownHeaderTextSplitter
+from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
 from langchain.schema import Document
+import tiktoken
 
-def split_text_by_markdown(input_md: str) -> list:
+def split_text_by_markdown(input_md: str, max_tokens: int = 2048, model: str = "cl100k_base") -> list:
+    # Step 1: Split by Markdown headers
     headers_to_split_on = [
         ("#", "Header 1"),
         ("##", "Header 2"),
         ("###", "Header 3"),
     ]
-    splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
-    chunks = splitter.split_text(input_md)
-    documents = [Document(page_content=chunk.page_content, metadata=chunk.metadata) for chunk in chunks]
-    return documents
+    md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+    header_chunks = md_splitter.split_text(input_md)
+
+    # Step 2: Tokenizer (OpenAI/Groq-style cl100k_base encoding)
+    encoding = tiktoken.get_encoding(model)
+
+    # Step 3: For each header chunk, split further if it is too long
+    final_docs = []
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,   # characters per chunk (roughly 250 tokens, a safe buffer)
+        chunk_overlap=100  # overlap to preserve context between sub-chunks
+    )
+
+    for chunk in header_chunks:
+        token_count = len(encoding.encode(chunk.page_content))
+
+        if token_count > max_tokens:
+            # Split oversized sections into smaller parts, keeping the header metadata
+            sub_chunks = text_splitter.split_text(chunk.page_content)
+            for sub in sub_chunks:
+                final_docs.append(
+                    Document(page_content=sub, metadata=chunk.metadata)
+                )
+        else:
+            # Keep the section as a single chunk
+            final_docs.append(
+                Document(page_content=chunk.page_content, metadata=chunk.metadata)
+            )
+
+    return final_docs
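
For quick verification, here is a minimal usage sketch of the updated splitter. It assumes the project's `langchain` and `tiktoken` dependencies are installed; the sample Markdown string is made up for illustration only.

```python
# Minimal usage sketch (assumes langchain and tiktoken are installed).
# The sample Markdown below is illustrative, not part of the project.
from utils.chunking import split_text_by_markdown

sample_md = """# Guide
## Setup
Install the dependencies and set your API keys.
## Usage
Upload a PDF, then ask questions about its contents.
"""

docs = split_text_by_markdown(sample_md, max_tokens=2048)
for doc in docs:
    print(doc.metadata, len(doc.page_content))
```

Short sections pass through unchanged; only sections whose token count exceeds `max_tokens` are re-split by the `RecursiveCharacterTextSplitter`, and each sub-chunk inherits its header metadata.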