Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import os | |
| import tempfile | |
| import shutil | |
| import pdfminer.high_level | |
| import docx | |
| import faiss | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| from sentence_transformers import SentenceTransformer | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| from langchain.vectorstores import FAISS | |
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the models: a sentence encoder used to embed passages and questions,
# and a causal language model (with its tokenizer) used to write the answer.
embedding_model = SentenceTransformer(
    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
    device=device,
)
qa_model_name = "aubmindlab/aragpt2-base"
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
qa_model = AutoModelForCausalLM.from_pretrained(qa_model_name)
qa_model = qa_model.to(device)

# Set up the "database": the FAISS index (None until documents are processed)
# and the list of text chunks whose positions match the index rows.
index = None
docs = []
def extract_text(file_path):
    """Return the plain text of a PDF or Word document.

    The format is chosen from the file extension, compared
    case-insensitively so that names such as ``BOOK.PDF`` are accepted.

    Args:
        file_path: Path of the document on disk.

    Returns:
        The extracted text. For Word documents the paragraphs are joined
        with newlines.

    Raises:
        ValueError: If the extension is not ``.pdf``, ``.docx`` or ``.doc``.
    """
    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        with open(file_path, 'rb') as f:
            return pdfminer.high_level.extract_text(f)
    if lowered.endswith(('.docx', '.doc')):
        # NOTE(review): python-docx reads the OOXML .docx format; a legacy
        # binary .doc is expected to fail inside docx.Document -- confirm,
        # and convert such files beforehand if they must be supported.
        doc = docx.Document(file_path)
        return "\n".join(para.text for para in doc.paragraphs)
    raise ValueError("صيغة ملف غير مدعومة")
def process_files(files):
    """Build the search index from the uploaded documents.

    Extracts the text of every file, splits it into overlapping chunks,
    embeds the chunks and stores them in a FAISS L2 index. On success the
    module-level ``index`` and ``docs`` are replaced together; on any early
    return they are left untouched.

    Args:
        files: The uploaded files, or ``None``/empty when nothing was
            uploaded. Each entry may be an object exposing the path as
            ``.name`` or a plain path string.

    Returns:
        A status message for the UI.
    """
    global index, docs
    if not files:
        # Nothing uploaded: report it instead of failing while iterating.
        return "❌ الرجاء رفع الكتب أولاً."

    # Accept both upload objects (path in ``.name``) and plain path strings;
    # presumably which one arrives depends on the Gradio version -- verify.
    paths = [getattr(file, "name", file) for file in files]
    all_text = "".join(extract_text(path) + "\n" for path in paths)

    # Split the text into chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_text(all_text)
    if not texts:
        # No chunks means no embeddings, so there is nothing to index.
        return "❌ لم يتم العثور على نص في الملفات المرفوعة."

    # Create the vectors and index them. The new index is built in a local
    # first so a failure cannot leave ``index`` and ``docs`` out of sync.
    embeddings = embedding_model.encode(texts, show_progress_bar=True, convert_to_tensor=True)
    new_index = faiss.IndexFlatL2(embeddings.shape[1])
    new_index.add(embeddings.cpu().numpy())
    index, docs = new_index, texts
    return "✅ تم تحميل الكتب واستيعاب الأفكار! النظام جاهز للإجابة."
def generate_answer(question):
    """Answer a question from the most similar indexed text chunks.

    Embeds the question, retrieves up to three nearest chunks from the
    FAISS index, places them in a prompt and lets the language model
    generate the continuation.

    Args:
        question: The user's question.

    Returns:
        The generated answer with surrounding whitespace removed, or a
        message asking the user to upload books when nothing is indexed.
    """
    if index is None or not docs:
        return "❌ الرجاء رفع الكتب أولاً."

    q_emb = embedding_model.encode([question])
    # Never request more neighbours than there are chunks.
    top_k = min(3, len(docs))
    _, neighbours = index.search(q_emb, k=top_k)
    # FAISS reports missing neighbours as -1; skip them so that docs[-1]
    # is not picked up by accident.
    context = "\n".join(docs[i] for i in neighbours[0] if i >= 0)

    # Prepare the model input.
    prompt = f"سؤال: {question}\n\nمحتوى ذو صلة:\n{context}\n\nالإجابة:"
    inputs = qa_tokenizer(prompt, return_tensors='pt').to(device)
    outputs = qa_model.generate(**inputs, max_new_tokens=300, pad_token_id=qa_tokenizer.eos_token_id)

    # The generated sequence starts with the prompt tokens. Decode only what
    # follows them, so an answer marker repeated by the model (or present in
    # the retrieved context) cannot truncate the reply.
    prompt_len = inputs["input_ids"].shape[1]
    answer = qa_tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return answer.strip()
# Gradio interface: one tab to upload and index books, one tab to ask
# questions about them.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🚀 أهلاً بك في محاكاة عقل المؤلف
    ارفع كتبك واستعد للانطلاق في رحلة استكشاف الأفكار العميقة!
    """)
    with gr.Tab("📚 رفع الكتب للتدريب"):
        upload = gr.File(file_types=['.pdf', '.docx', '.doc'], file_count='multiple')
        train_button = gr.Button("🚀 ابدأ التدريب!")
        train_output = gr.Textbox(label="🔵 حالة التدريب", interactive=False)
    with gr.Tab("❓ اسأل الكتاب"):
        question = gr.Textbox(label="اكتب سؤالك هنا...")
        answer = gr.Textbox(label="الإجابة", interactive=False)
        ask_button = gr.Button("✉️ أرسل السؤال!")
    # Wire the buttons to their handlers while still inside the Blocks
    # context, where the event listeners are registered.
    train_button.click(process_files, inputs=[upload], outputs=[train_output])
    ask_button.click(generate_answer, inputs=[question], outputs=[answer])

demo.launch()