Spaces:
Sleeping
Sleeping
import os
import shutil
import tempfile

import gradio as gr
import torch
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader, UnstructuredWordDocumentLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer
# Arabic-capable BERT encoder used to embed document chunks for retrieval.
EMBEDDING_MODEL_NAME = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
# NOTE(review): mpt-7b-storywriter is an English, long-context story model —
# a questionable fit for Arabic QA; confirm this model choice.
QA_MODEL_NAME = "mosaicml/mpt-7b-storywriter"

embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME, trust_remote_code=True)
# BUG FIX: MPT is a decoder-only (causal) checkpoint. Loading it through
# AutoModelForSeq2SeqLM raises "Unrecognized configuration class ... for
# AutoModelForSeq2SeqLM" — AutoModelForCausalLM is the matching auto class.
qa_model = AutoModelForCausalLM.from_pretrained(QA_MODEL_NAME, trust_remote_code=True).to(device)

# Global FAISS index: populated by upload_files(), read by answer_question().
vectordb = None
| def load_document(file_path): | |
| ext = os.path.splitext(file_path)[1].lower() | |
| if ext == ".pdf": | |
| loader = PyMuPDFLoader(file_path) | |
| elif ext in [".doc", ".docx"]: | |
| loader = UnstructuredWordDocumentLoader(file_path) | |
| else: | |
| raise ValueError("صيغة الملف غير مدعومة.") | |
| return loader.load() | |
def train_from_documents(documents):
    """Chunk documents and build a FAISS vector index over them.

    Args:
        documents: LangChain Documents to index.

    Returns:
        A FAISS vector store of 500-character chunks (50-character overlap),
        embedded with the module-level Arabic embedding model.
    """
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
    ).split_documents(documents)
    return FAISS.from_documents(chunks, embedding_model)
def upload_files(files):
    """Copy uploaded files into a temp dir, load them, and (re)build the index.

    Args:
        files: Uploaded file objects from ``gr.File`` — assumed file-like with
            a ``.name`` temp path (newer gradio may pass plain path strings;
            TODO confirm against the installed gradio version).

    Returns:
        A status string for the UI.

    Side effects:
        Rebinds the module-level ``vectordb`` global to a fresh FAISS index.
    """
    global vectordb
    temp_dir = tempfile.mkdtemp()
    all_docs = []
    try:
        for file in files:
            # BUG FIX: gradio's file.name is an ABSOLUTE temp path, and
            # os.path.join discards temp_dir when its second argument is
            # absolute — the copy never landed in our directory. Join the
            # basename instead.
            file_path = os.path.join(temp_dir, os.path.basename(file.name))
            with open(file_path, "wb") as f:
                f.write(file.read())
            all_docs.extend(load_document(file_path))
        vectordb = train_from_documents(all_docs)
    finally:
        # BUG FIX: cleanup now runs even when loading or indexing raises,
        # so failed uploads no longer leak temp directories.
        shutil.rmtree(temp_dir)
    return "✅ النظام جاهز للإجابة على أسئلتك!"
def answer_question(question):
    """Answer a question from the indexed documents, in Arabic.

    Args:
        question: The user's question text.

    Returns:
        The generated answer string, or a warning string when no documents
        have been indexed yet.
    """
    if vectordb is None:
        return "⚠️ الرجاء رفع الملفات أولاً."

    # BUG FIX: the original built RetrievalQA.from_chain_type(llm=None), which
    # fails chain validation (an LLM instance is required) and was only used to
    # reach the retriever it had just been handed. Query the retriever directly.
    retriever = vectordb.as_retriever(search_kwargs={"k": 5})
    relevant_docs = retriever.get_relevant_documents(question)
    context = "\n".join(doc.page_content for doc in relevant_docs)

    inputs = qa_tokenizer(
        f"أجب بالعربية فقط بناءً على السياق التالي:\n{context}\nالسؤال: {question}",
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(device)

    with torch.no_grad():
        # BUG FIX: max_length caps the TOTAL sequence length; with prompts
        # truncated at 1024 tokens that can leave no room to generate at all.
        # max_new_tokens bounds only the generated answer.
        outputs = qa_model.generate(**inputs, max_new_tokens=300)

    return qa_tokenizer.decode(outputs[0], skip_special_tokens=True)
# --- Gradio UI ---
# Two-column layout: left = upload files and trigger indexing,
# right = ask a question and show the generated answer.
with gr.Blocks(title="محاكاة دماغ المؤلف") as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown("## 📚 ارفع كتبك هنا")
            # Accepts multiple PDF / Word files in one upload.
            file_uploader = gr.File(file_types=[".pdf", ".doc", ".docx"], file_count="multiple")
            upload_button = gr.Button("🚀 ابدأ التدريب")
            training_status = gr.Textbox(label="حالة التدريب", interactive=False)
        with gr.Column():
            gr.Markdown("## ❓ اطرح سؤالك")
            question_input = gr.Textbox(label="سؤالك", placeholder="اكتب سؤالك هنا...")
            ask_button = gr.Button("✉️ أرسل السؤال!")
            answer_output = gr.Textbox(label="الإجابة", interactive=False)
    # Wire the buttons to the backend callbacks defined above.
    upload_button.click(upload_files, inputs=[file_uploader], outputs=[training_status])
    ask_button.click(answer_question, inputs=[question_input], outputs=[answer_output])

# NOTE(review): share=True is redundant when hosted on HF Spaces (already
# public) and only matters for local runs — confirm intent.
demo.launch(share=True)