Spaces:

ashishanand
/

car_manual_assistant

Sleeping

App Files Community

ashishanand committed on Nov 17, 2024

Commit

1a492a3

1 Parent(s): d7955f6

Removed chromadb directory from tracking

Browse files

Files changed (8) hide show

.gitignore +1 -0
app.py +120 -10
chromadb/chroma.sqlite3 → car-manuals/manual_Astor.pdf +2 -2
chromadb/e820442b-1d6c-4933-8a2c-981f60377458/data_level0.bin → car-manuals/manual_Tiago.pdf +2 -2
chromadb/e820442b-1d6c-4933-8a2c-981f60377458/header.bin +0 -3
chromadb/e820442b-1d6c-4933-8a2c-981f60377458/length.bin +0 -3
chromadb/e820442b-1d6c-4933-8a2c-981f60377458/link_lists.bin +0 -0
requirements.txt +0 -2

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ chromadb/

app.py CHANGED Viewed

@@ -13,7 +13,7 @@ from transformers import AutoTokenizer
 from rerankers import Reranker
 from transformers import GPT2TokenizerFast
 from groq import Groq
 import gradio as gr
 # Retrieve the API key from environment variables (Hugging Face Secrets)
@@ -23,7 +23,22 @@ groq_api_key = os.environ.get('GROQ_API_KEY')
 chat_client = Groq(api_key=groq_api_key)
 model = "llama-3.2-90b-text-preview"
-# Define your functions (same as before)
 def preprocess_text(text):
     # ... (same as your original function)
     text = re.sub(r'\s+', ' ', text)
@@ -53,6 +68,49 @@ def call_Llama_api(query, context):
     response = chat_completion.choices[0].message.content
     return response
 def is_car_model_available(query, available_models):
     # ... (same as your original function)
     for model in available_models:
@@ -60,6 +118,16 @@ def is_car_model_available(query, available_models):
             return model
     return None
 def colbert_rerank(query=None, chunks=None):
     # ... (same as your original function)
     d = ranker.rank(query=query, docs=chunks)
@@ -111,27 +179,68 @@ def initialize():
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     print(f"Using device: {device}")
     # Initialize embedding model
     embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
         model_name="all-MiniLM-L12-v2", device=device
     )
-    # Load the persisted ChromaDB collection
     client = PersistentClient(path="./chromadb")
     # Get the collection
     collection_name = "car_manuals5"
-    collection = client.get_collection(
-        name=collection_name,
-        embedding_function=embedding_function
-    )
     # Set available car models
-    available_car_models = ['TIAGO', 'Astor']
     # Initialize the ranker
     ranker = Reranker("answerdotai/answerai-colbert-small-v1", model_type='colbert')
 # Call initialize function
 initialize()
@@ -145,4 +254,5 @@ iface = gr.Interface(
 )
 if __name__ == "__main__":
-    iface.launch(server_name="0.0.0.0", server_port=7860)

 from rerankers import Reranker
 from transformers import GPT2TokenizerFast
 from groq import Groq
+from chromadb import PersistentClient
 import gradio as gr
 # Retrieve the API key from environment variables (Hugging Face Secrets)
 chat_client = Groq(api_key=groq_api_key)
 model = "llama-3.2-90b-text-preview"
+def parse_pdf(pdf_path):
+    texts = []
+    with pdfplumber.open(pdf_path) as pdf:
+        for page_num, page in enumerate(pdf.pages, start=1):
+            text = page.extract_text()
+            if text:
+                texts.append({
+                    'text': text,
+                    'metadata': {
+                        'page_number': page_num
+                    }
+                })
+    return texts
 def preprocess_text(text):
     # ... (same as your original function)
     text = re.sub(r'\s+', ' ', text)
     response = chat_completion.choices[0].message.content
     return response
+def chunk_texts(texts, max_tokens=500, overlap_tokens=50):
+    """
+    Splits texts into chunks based on paragraphs with overlap to preserve context.
+    """
+    chunks = []
+    for item in texts:
+        text = preprocess_text(item['text'])
+        if not text:
+            continue
+        metadata = item['metadata']
+        # Split text into paragraphs
+        paragraphs = text.split('\n\n')
+        current_chunk = ''
+        current_tokens = 0
+        for i, paragraph in enumerate(paragraphs):
+            paragraph = paragraph.strip()
+            if not paragraph:
+                continue
+            paragraph_tokens = len(tokenizer.encode(paragraph))
+            if current_tokens + paragraph_tokens <= max_tokens:
+                current_chunk += paragraph + '\n\n'
+                current_tokens += paragraph_tokens
+            else:
+                # Save the current chunk
+                chunk = {
+                    'text': current_chunk.strip(),
+                    'metadata': metadata
+                }
+                chunks.append(chunk)
+                # Start a new chunk with overlap
+                overlap_text = ' '.join(current_chunk.split()[-overlap_tokens:])
+                current_chunk = overlap_text + ' ' + paragraph + '\n\n'
+                current_tokens = len(tokenizer.encode(current_chunk))
+        if current_chunk:
+            chunk = {
+                'text': current_chunk.strip(),
+                'metadata': metadata
+            }
+            chunks.append(chunk)
+    return chunks
 def is_car_model_available(query, available_models):
     # ... (same as your original function)
     for model in available_models:
             return model
     return None
+def extract_car_model(pdf_filename):
+    base_name = os.path.basename(pdf_filename)
+    match = re.search(r'manual_(.+)\.pdf', base_name)
+    if match:
+        model_name = match.group(1).replace('_', ' ').title()
+        return model_name
+    else:
+        return 'Unknown Model'
 def colbert_rerank(query=None, chunks=None):
     # ... (same as your original function)
     d = ranker.rank(query=query, docs=chunks)
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     print(f"Using device: {device}")
+    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")  # For token counting
     # Initialize embedding model
     embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
         model_name="all-MiniLM-L12-v2", device=device
     )
     client = PersistentClient(path="./chromadb")
     # Get the collection
     collection_name = "car_manuals5"
+    if collection_name in [col.name for col in client.list_collections()]:
+        collection = client.get_collection(
+            name=collection_name,
+            embedding_function=embedding_function
+        )
+        available_car_models = ['Tiago', 'Astor']
+    else:
+        collection = client.create_collection(
+            name=collection_name,
+            embedding_function=embedding_function
+        )
+    # collection = client.get_or_create_collection(
+    #     name=collection_name,
+    #     embedding_function=embedding_function
+    # )
     # Set available car models
+    # available_car_models = ['TIAGO', 'Astor']
+        pdf_files = ['./car_manuals/manual_Tiago.pdf', './car_manuals/manual_Astor.pdf']
+        available_car_models = []
+        for pdf_file in pdf_files:
+            print(f"Parsing {pdf_file}...")
+            pdf_texts = parse_pdf(pdf_file)
+            car_model = extract_car_model(pdf_file)
+            available_car_models.append(car_model)
+            # Add car model to metadata
+            for item in pdf_texts:
+                item['metadata']['car_model'] = car_model
+            # Chunk texts using the refined strategy
+            chunks = chunk_texts(pdf_texts, max_tokens=500, overlap_tokens=50)
+            # Prepare data for ChromaDB
+            documents = [chunk['text'] for chunk in chunks]
+            metadatas = [chunk['metadata'] for chunk in chunks]
+            ids = [f"{car_model}_{i}" for i in range(len(documents))]
+            # Add to ChromaDB collection
+            collection.add(
+                documents=documents,
+                metadatas=metadatas,
+                ids=ids
+            )
     # Initialize the ranker
     ranker = Reranker("answerdotai/answerai-colbert-small-v1", model_type='colbert')
 # Call initialize function
 initialize()
 )
 if __name__ == "__main__":
+    # iface.launch(server_name="0.0.0.0", server_port=7860)
+    iface.launch()

chromadb/chroma.sqlite3 → car-manuals/manual_Astor.pdf RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:481a2f22b50f9edd260645533393020b100f9c8e43ba5393925af96c02af9a2f
-size 6451200

 version https://git-lfs.github.com/spec/v1
+oid sha256:7275b9aae94841441d33ec596e65ffe2bd738f42a980ab1b53d26d35a725b73e
+size 8105807

chromadb/e820442b-1d6c-4933-8a2c-981f60377458/data_level0.bin → car-manuals/manual_Tiago.pdf RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cf95cb4ad00dbb2be6ce91b7143b22b48c7583817cc57b4fc153791554a14132
-size 1676000

 version https://git-lfs.github.com/spec/v1
+oid sha256:b71ee499e53973ccbabdd49b11995cc374bf9c543d372d4bc63ea8f7414cd7fa
+size 2564414

chromadb/e820442b-1d6c-4933-8a2c-981f60377458/header.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
-size 100

chromadb/e820442b-1d6c-4933-8a2c-981f60377458/length.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:03e1219ac9d4a1a30d3d5f9f3dfc60df85e0844f2b73f04e8f641cc4a101a470
-size 4000

chromadb/e820442b-1d6c-4933-8a2c-981f60377458/link_lists.bin DELETED Viewed

File without changes

requirements.txt CHANGED Viewed

@@ -1,5 +1,3 @@
-# requirements.txt
 gradio
 torch
 sentence_transformers

 gradio
 torch
 sentence_transformers