Spaces:

EnverLee
/

laweyedev

Runtime error

App Files Files Community

EnverLee commited on Aug 14, 2024

Commit

175d074

verified ·

1 Parent(s): d6be384

Upload 4 files

Browse files

Files changed (4) hide show

README.md +10 -8
app.py +150 -0
laws.pdf +0 -0
requirements.txt +9 -0

README.md CHANGED Viewed

@@ -1,13 +1,15 @@
 ---
-title: Laweyedev
-emoji: 🐠
-colorFrom: blue
-colorTo: indigo
 sdk: gradio
-sdk_version: 4.41.0
 app_file: app.py
-pinned: false
-license: mit
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: RAG-Chatbot
+emoji: 🌘w🌖
+colorFrom: yellow
+colorTo: red
 sdk: gradio
+sdk_version: 4.39.0
 app_file: app.py
+pinned: true
+short_description: A retrieval system with chatbot integration
+thumbnail: >-
+  https://cdn-uploads.huggingface.co/production/uploads/6527e89a8808d80ccff88b7a/XVgtQiizeFHIUUj1huwdv.png
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,150 @@

+import gradio as gr
+from datasets import load_dataset
+import os
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
+import torch
+from threading import Thread
+from sentence_transformers import SentenceTransformer
+import faiss
+import fitz  # PyMuPDF
+# 환경 변수에서 Hugging Face 토큰 가져오기
+token = os.environ.get("HF_TOKEN")
+# 임베딩 모델 로드
+ST = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
+# PDF에서 텍스트 추출
+def extract_text_from_pdf(pdf_path):
+    doc = fitz.open(pdf_path)
+    text = ""
+    for page in doc:
+        text += page.get_text()
+    return text
+# 법률 문서 PDF 경로 지정 및 텍스트 추출
+pdf_path = "laws.pdf"  # 여기에 실제 PDF 경로를 입력하세요.
+law_text = extract_text_from_pdf(pdf_path)
+# 법률 문서 텍스트를 문장 단위로 나누고 임베딩
+law_sentences = law_text.split('\n')  # Adjust splitting based on your PDF structure
+law_embeddings = ST.encode(law_sentences)
+# FAISS 인덱스 생성 및 임베딩 추가
+index = faiss.IndexFlatL2(law_embeddings.shape[1])
+index.add(law_embeddings)
+# Hugging Face에서 법률 상담 데이터셋 로드
+dataset = load_dataset("jihye-moon/LawQA-Ko")
+data = dataset["train"]
+# 질문 컬럼을 임베딩하여 새로운 컬럼에 추가
+data = data.map(lambda x: {"question_embedding": ST.encode(x["question"])}, batched=True)
+data.add_faiss_index(column="question_embedding")
+# LLaMA 모델 설정
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=bnb_config,
+    token=token
+)
+SYS_PROMPT = """You are an assistant for answering legal questions.
+You are given the extracted parts of legal documents and a question. Provide a conversational answer.
+If you don't know the answer, just say "I do not know." Don't make up an answer.
+you must answer korean."""
+# 법률 문서 검색 함수
+def search_law(query, k=5):
+    query_embedding = ST.encode([query])
+    D, I = index.search(query_embedding, k)
+    return [(law_sentences[i], D[0][idx]) for idx, i in enumerate(I[0])]
+# 법률 상담 데이터 검색 함수
+def search_qa(query, k=3):
+    scores, retrieved_examples = data.get_nearest_examples(
+        "question_embedding", ST.encode(query), k=k
+    )
+    return [retrieved_examples["answer"][i] for i in range(k)]
+# 최종 프롬프트 생성
+def format_prompt(prompt, law_docs, qa_docs):
+    PROMPT = f"Question: {prompt}\n\nLegal Context:\n"
+    for doc in law_docs:
+        PROMPT += f"{doc[0]}\n"  # Assuming doc[0] contains the relevant text
+    PROMPT += "\nLegal QA:\n"
+    for doc in qa_docs:
+        PROMPT += f"{doc}\n"
+    return PROMPT
+# 챗봇 응답 함수
+def talk(prompt, history):
+    law_results = search_law(prompt, k=3)
+    qa_results = search_qa(prompt, k=3)
+    retrieved_law_docs = [result[0] for result in law_results]
+    formatted_prompt = format_prompt(prompt, retrieved_law_docs, qa_results)
+    formatted_prompt = formatted_prompt[:2000]  # GPU 메모리 부족을 피하기 위해 프롬프트 제한
+    messages = [{"role": "system", "content": SYS_PROMPT}, {"role": "user", "content": formatted_prompt}]
+    # 모델에게 생성 지시
+    input_ids = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        return_tensors="pt"
+    ).to(model.device)
+    streamer = TextIteratorStreamer(
+        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
+    )
+    generate_kwargs = dict(
+        input_ids=input_ids,
+        streamer=streamer,
+        max_new_tokens=1024,
+        do_sample=True,
+        top_p=0.95,
+        temperature=0.75,
+        eos_token_id=tokenizer.eos_token_id,
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
+# Gradio 인터페이스 설정
+TITLE = "Legal RAG Chatbot"
+DESCRIPTION = """A chatbot that uses Retrieval-Augmented Generation (RAG) for legal consultation.
+This chatbot can search legal documents and previous legal QA pairs to provide answers."""
+demo = gr.ChatInterface(
+    fn=talk,
+    chatbot=gr.Chatbot(
+        show_label=True,
+        show_share_button=True,
+        show_copy_button=True,
+        likeable=True,
+        layout="bubble",
+        bubble_full_width=False,
+    ),
+    theme="Soft",
+    examples=[["What are the regulations on data privacy?"]],
+    title=TITLE,
+    description=DESCRIPTION,
+)
+# Gradio 데모 실행
+demo.launch(debug=True)

laws.pdf ADDED Viewed

Binary file (836 kB). View file

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+spaces
+torch==2.2.0
+transformers
+sentence-transformers
+faiss-gpu
+datasets
+accelerate
+bitsandbytes
+PyMuPDF