ginipick committed on
Commit
e82f30c
·
verified ·
1 Parent(s): 740bd2d

Delete app-backup.py

Browse files
Files changed (1) hide show
  1. app-backup.py +0 -639
app-backup.py DELETED
@@ -1,639 +0,0 @@
1
- import gradio as gr
2
- import spaces
3
- import os
4
- from typing import List, Dict, Any, Optional, Tuple
5
- import hashlib
6
- from datetime import datetime
7
- import numpy as np
8
- from transformers import pipeline, TextIteratorStreamer
9
- import torch
10
- from threading import Thread
11
- import re
12
-
13
- # PDF 처리 라이브러리
14
- try:
15
- import fitz # PyMuPDF
16
- PDF_AVAILABLE = True
17
- except ImportError:
18
- PDF_AVAILABLE = False
19
- print("⚠️ PyMuPDF not installed. Install with: pip install pymupdf")
20
-
21
- try:
22
- from sentence_transformers import SentenceTransformer
23
- ST_AVAILABLE = True
24
- except ImportError:
25
- ST_AVAILABLE = False
26
- print("⚠️ Sentence Transformers not installed. Install with: pip install sentence-transformers")
27
-
28
# Custom CSS passed to gr.Blocks(css=...). The class names defined here are the
# ones referenced by the HTML snippets built in upload_pdf, clear_documents,
# update_rag_settings and generate_response (pdf-status / pdf-success /
# pdf-error / pdf-info / rag-context) and by elem_classes="main-container".
custom_css = """
.gradio-container {
background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
min-height: 100vh;
font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
}

.main-container {
background: rgba(255, 255, 255, 0.98);
border-radius: 16px;
padding: 24px;
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
border: 1px solid rgba(0, 0, 0, 0.05);
margin: 12px;
}

.pdf-status {
padding: 12px 16px;
border-radius: 12px;
margin: 12px 0;
font-size: 0.95rem;
font-weight: 500;
}

.pdf-success {
background: linear-gradient(135deg, #d4edda 0%, #c3e6cb 100%);
border: 1px solid #b1dfbb;
color: #155724;
}

.pdf-error {
background: linear-gradient(135deg, #f8d7da 0%, #f5c6cb 100%);
border: 1px solid #f1aeb5;
color: #721c24;
}

.pdf-info {
background: linear-gradient(135deg, #d1ecf1 0%, #bee5eb 100%);
border: 1px solid #9ec5d8;
color: #0c5460;
}

.rag-context {
background: linear-gradient(135deg, #fef3c7 0%, #fde68a 100%);
border-left: 4px solid #f59e0b;
padding: 12px;
margin: 12px 0;
border-radius: 8px;
font-size: 0.9rem;
}

.thinking-section {
background: rgba(0, 0, 0, 0.02);
border: 1px solid rgba(0, 0, 0, 0.1);
border-radius: 8px;
padding: 12px;
margin: 8px 0;
}
"""
88
-
89
class SimpleTextSplitter:
    """Sentence-aware text splitter.

    Text is cut on the literal separator ``". "`` and the pieces are packed
    greedily into chunks of roughly ``chunk_size`` characters. Consecutive
    chunks share up to ``chunk_overlap`` trailing characters so that a fact
    straddling a chunk boundary stays retrievable.
    """

    def __init__(self, chunk_size=800, chunk_overlap=100):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def _overlap_tail(self, chunk: str) -> str:
        """Return the trailing part of *chunk* used to seed the next chunk."""
        # Keep the overlap strictly below the chunk size so chunks cannot
        # snowball when a caller passes chunk_overlap >= chunk_size.
        overlap = min(self.chunk_overlap, self.chunk_size - 1)
        if overlap <= 0:
            return ""
        start = len(chunk) - overlap
        if start <= 0:
            return chunk
        if not chunk[start - 1].isspace():
            # The cut landed inside a word; advance to the next word start.
            next_space = chunk.find(" ", start)
            if next_space != -1:
                start = next_space + 1
        return chunk[start:]

    def split_text(self, text: str) -> List[str]:
        """Split *text* into a list of stripped, non-empty chunks.

        Returns an empty list for empty or whitespace-only input, so callers
        can detect documents without extractable text.
        """
        if not text or not text.strip():
            return []

        pieces = text.split('. ')
        last_index = len(pieces) - 1
        chunks = []
        current = ""

        for index, piece in enumerate(pieces):
            # Only pieces that were actually cut at ". " get the separator
            # back; the final piece keeps whatever ending it already had.
            sentence = piece if index == last_index else piece + ". "

            if current and len(current) + len(piece) >= self.chunk_size:
                flushed = current.strip()
                if flushed:
                    chunks.append(flushed)
                current = self._overlap_tail(current)

            current += sentence

        tail = current.strip()
        if tail:
            chunks.append(tail)

        return chunks
113
-
114
class PDFRAGSystem:
    """In-memory retrieval-augmented-generation store for PDF documents.

    Each uploaded PDF is reduced to text chunks (and, when the optional
    sentence-transformers model loaded, one embedding per chunk). Queries are
    answered by ranking chunks with cosine similarity; documents without
    embeddings fall back to their leading chunks.
    """

    def __init__(self):
        # doc_id -> {"metadata", "chunk_count", "upload_time"}
        self.documents = {}
        # doc_id -> list of text chunks
        self.document_chunks = {}
        # doc_id -> chunk embeddings as returned by the embedder's encode()
        self.embeddings_store = {}
        self.text_splitter = SimpleTextSplitter(chunk_size=800, chunk_overlap=100)

        # The embedding model is optional: when it cannot be imported or
        # loaded, retrieval uses the positional fallback instead.
        self.embedder = None
        if ST_AVAILABLE:
            try:
                self.embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
                print("✅ 임베딩 모델 로드 성공")
            except Exception as e:
                print(f"⚠️ 임베딩 모델 로드 실패: {e}")

    def extract_text_from_pdf(self, pdf_path: str) -> Dict[str, Any]:
        """Extract metadata and page text from the PDF at *pdf_path*.

        Returns a dict with ``"metadata"`` (title, pages, file_name) and
        ``"full_text"`` (non-empty pages joined by blank lines). When PyMuPDF
        is not installed, a placeholder with ``pages == 0`` and an install
        hint as ``full_text`` is returned instead.

        Raises:
            Exception: if the PDF cannot be opened or read; the original
                error is chained as ``__cause__``.
        """
        file_name = os.path.basename(pdf_path)

        if not PDF_AVAILABLE:
            return {
                "metadata": {
                    "title": "PDF Reader Not Available",
                    "file_name": file_name,
                    "pages": 0
                },
                "full_text": "PDF 처리를 위해 'pip install pymupdf'를 실행해주세요."
            }

        try:
            doc = fitz.open(pdf_path)
            try:
                # A missing title may be present as an empty/None value rather
                # than an absent key, so use `or` instead of a .get() default.
                raw_metadata = doc.metadata or {}
                metadata = {
                    "title": raw_metadata.get("title") or file_name,
                    "pages": len(doc),
                    "file_name": file_name
                }

                text_content = []
                for page in doc:
                    text = page.get_text()
                    if text.strip():
                        text_content.append(text)
            finally:
                # Release the file handle even when a page fails to parse.
                doc.close()

            return {
                "metadata": metadata,
                "full_text": "\n\n".join(text_content)
            }
        except Exception as e:
            raise Exception(f"PDF 처리 오류: {str(e)}") from e

    def process_and_store_pdf(self, pdf_path: str, doc_id: str) -> Dict[str, Any]:
        """Extract, chunk, optionally embed, and register a PDF under *doc_id*.

        Returns ``{"success": True, "doc_id", "chunks", "pages", "title"}`` on
        success, or ``{"success": False, "error": <message>}`` on failure.
        This method never raises.
        """
        try:
            pdf_data = self.extract_text_from_pdf(pdf_path)

            # Without PyMuPDF the "text" is only an install hint; indexing it
            # would feed that hint to the model as if it were document content.
            if not PDF_AVAILABLE:
                return {"success": False, "error": pdf_data["full_text"]}

            chunks = self.text_splitter.split_text(pdf_data["full_text"])

            if not chunks:
                print("Warning: No chunks created from PDF")
                return {"success": False, "error": "No text content found in PDF"}

            print(f"Created {len(chunks)} chunks from PDF")

            self.document_chunks[doc_id] = chunks
            # Drop embeddings from an earlier upload under the same id so they
            # can never be paired with a different chunk list.
            self.embeddings_store.pop(doc_id, None)

            # Embeddings are best-effort: a failure leaves the document usable
            # through the positional fallback.
            if self.embedder:
                try:
                    print("Generating embeddings...")
                    embeddings = self.embedder.encode(chunks)
                    self.embeddings_store[doc_id] = embeddings
                    print(f"Generated {len(embeddings)} embeddings")
                except Exception as e:
                    print(f"Warning: Failed to generate embeddings: {e}")

            self.documents[doc_id] = {
                "metadata": pdf_data["metadata"],
                "chunk_count": len(chunks),
                "upload_time": datetime.now().isoformat()
            }

            # Debug: preview of the first chunk
            print(f"First chunk preview: {chunks[0][:200]}...")

            return {
                "success": True,
                "doc_id": doc_id,
                "chunks": len(chunks),
                "pages": pdf_data["metadata"]["pages"],
                "title": pdf_data["metadata"]["title"]
            }

        except Exception as e:
            print(f"Error processing PDF: {e}")
            return {"success": False, "error": str(e)}

    def _doc_name(self, doc_id: str) -> str:
        """Return the stored file name for *doc_id*, or the id itself if unknown."""
        return self.documents.get(doc_id, {}).get("metadata", {}).get("file_name", doc_id)

    def _rank_by_embedding(self, query: str, doc_id: str, chunks: List[str], top_k: int) -> List[Dict]:
        """Return up to *top_k* chunks of one document ranked by cosine similarity."""
        query_vec = np.asarray(self.embedder.encode([query])[0], dtype=float)
        matrix = np.asarray(self.embeddings_store[doc_id], dtype=float)

        dots = matrix @ query_vec
        denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vec)
        # Zero-norm vectors get similarity 0.0 instead of a division error.
        sims = np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)

        doc_name = self._doc_name(doc_id)
        hits = []
        for idx in np.argsort(sims)[::-1][:max(top_k, 0)]:
            if idx < len(chunks):  # guard against an embedding/chunk count mismatch
                hits.append({
                    "content": chunks[idx],
                    "doc_name": doc_name,
                    "similarity": float(sims[idx])
                })
                print(f"Added chunk {idx} with similarity: {sims[idx]:.3f}")
        return hits

    def search_relevant_chunks(self, query: str, doc_ids: List[str], top_k: int = 3) -> List[Dict]:
        """Return the *top_k* most relevant chunks across *doc_ids*.

        Each result is a dict with ``"content"``, ``"doc_name"`` and
        ``"similarity"``. A document is ranked by embedding similarity when
        embeddings exist for it; otherwise (or if ranking fails) its first
        ``top_k`` chunks are used with descending positional scores. The
        fallback is decided per document, so one document with embeddings
        does not hide the others.
        """
        top_k = int(top_k)  # UI sliders may deliver a float
        all_relevant_chunks = []

        print(f"Searching chunks for query: '{query[:50]}...' in {len(doc_ids)} documents")

        for doc_id in doc_ids:
            if doc_id not in self.document_chunks:
                print(f"Warning: Document {doc_id} not found in chunks")
                continue

            chunks = self.document_chunks[doc_id]
            print(f"Document {doc_id} has {len(chunks)} chunks")

            doc_hits = []
            if self.embedder and doc_id in self.embeddings_store:
                try:
                    doc_hits = self._rank_by_embedding(query, doc_id, chunks, top_k)
                except Exception as e:
                    print(f"Error in embedding search: {e}")
                    doc_hits = []

            if not doc_hits:
                print(f"Falling back to simple chunk selection for {doc_id}")
                doc_name = self._doc_name(doc_id)
                for i, chunk in enumerate(chunks[:max(top_k, 0)]):
                    doc_hits.append({
                        "content": chunk,
                        "doc_name": doc_name,
                        "similarity": 1.0 - (i * 0.1)  # positional weight, not a real similarity
                    })
                    print(f"Added chunk {i} (fallback)")

            all_relevant_chunks.extend(doc_hits)

        all_relevant_chunks.sort(key=lambda x: x.get('similarity', 0), reverse=True)

        result = all_relevant_chunks[:max(top_k, 0)]
        print(f"Returning {len(result)} chunks")

        # Debug: preview of the best chunk
        if result:
            print(f"First chunk preview: {result[0]['content'][:100]}...")

        return result

    def create_rag_prompt(self, query: str, doc_ids: List[str], top_k: int = 3) -> tuple:
        """Build a context-augmented prompt for *query*.

        Returns ``(enhanced_query, context)``. When no chunk is available for
        any of *doc_ids*, returns ``(query, "")`` so the caller can send the
        question unchanged.
        """
        print(f"Creating RAG prompt for query: '{query[:50]}...' with docs: {doc_ids}")

        relevant_chunks = self.search_relevant_chunks(query, doc_ids, top_k)

        if not relevant_chunks:
            print("No relevant chunks found - checking if documents exist")
            # Last resort: use the first chunk of the first document that has any.
            for doc_id in doc_ids:
                if self.document_chunks.get(doc_id):
                    print(f"Using first chunk from {doc_id} as fallback")
                    relevant_chunks = [{
                        "content": self.document_chunks[doc_id][0],
                        "doc_name": self._doc_name(doc_id),
                        "similarity": 0.5
                    }]
                    break

        if not relevant_chunks:
            print("No documents or chunks available")
            return query, ""

        print(f"Using {len(relevant_chunks)} chunks for context")

        context_parts = [
            "Based on the following document context, please answer the question below:",
            "=" * 40,
        ]

        for i, chunk in enumerate(relevant_chunks, 1):
            context_parts.append(f"\n[Document Reference {i} - {chunk['doc_name']}]")
            # Cap each excerpt at 1000 characters to bound the prompt size.
            content = chunk['content'][:1000]
            context_parts.append(content)
            print(f"Added chunk {i} ({len(content)} chars) with similarity: {chunk.get('similarity', 0):.3f}")

        context_parts.append("\n" + "=" * 40)

        context = "\n".join(context_parts)
        enhanced_query = f"{context}\n\nQuestion: {query}\n\nAnswer based on the document context provided above:"

        print(f"Enhanced query length: {len(enhanced_query)} chars (original: {len(query)} chars)")

        return enhanced_query, context
340
-
341
# Initialize model and RAG system
# The text-generation pipeline is constructed at import time; dtype and device
# placement are both requested as "auto".
model_id = "openai/gpt-oss-20b"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype="auto",
    device_map="auto",
)

# Single module-wide document store used by upload_pdf, clear_documents and
# generate_response.
rag_system = PDFRAGSystem()

# Global state for RAG
# Written by update_rag_settings (and clear_documents / generate_response) and
# read by generate_response on every request.
rag_enabled = False    # whether the "Enable RAG" checkbox is ticked
selected_docs = []     # checkbox labels of the form "<doc_id>: <file name>"
top_k_chunks = 3       # number of chunks requested per query
last_context = ""      # context string built for the most recent request
357
-
358
def format_conversation_history(chat_history):
    """Flatten chat history into plain ``{"role", "content"}`` messages.

    Args:
        chat_history: iterable of dicts with ``"role"`` and ``"content"``
            keys. ``content`` is either a string or a list of parts.

    Returns:
        A new list of dicts holding only ``role`` and a string-like
        ``content``. For list content, the ``"text"`` of the first part is
        used when that part is a dict carrying one; any other list is
        stringified with ``str()``.
    """
    messages = []
    for item in chat_history:
        content = item["content"]
        if isinstance(content, list):
            first = content[0] if content else None
            # Require a dict: `"text" in first` on a str is a substring test,
            # and indexing that str with "text" would raise TypeError.
            if isinstance(first, dict) and "text" in first:
                content = first["text"]
            else:
                content = str(content)
        messages.append({"role": item["role"], "content": content})
    return messages
368
-
369
@spaces.GPU()
def generate_response(input_data, chat_history, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
    """Stream a model reply, optionally augmented with PDF context.

    Args:
        input_data: the user's message text.
        chat_history: prior messages as passed by gr.ChatInterface.
        max_new_tokens, temperature, top_p, top_k, repetition_penalty:
            sampling parameters forwarded to the pipeline.
        system_prompt: optional system message; skipped when empty.

    Yields:
        The reply rendered so far: an optional RAG badge, the model's
        reasoning inside a ``<details>`` element, then the final answer.
    """
    global last_context, rag_enabled, selected_docs, top_k_chunks

    # Debug logging
    print(f"RAG Enabled: {rag_enabled}")
    print(f"Selected Docs: {selected_docs}")
    print(f"Available Docs: {list(rag_system.documents.keys())}")

    # Apply RAG if enabled
    context = ""
    if rag_enabled and selected_docs:
        # Checkbox labels look like "<doc_id>: <file name>".
        doc_ids = [doc.split(":")[0] for doc in selected_docs]
        enhanced_input, context = rag_system.create_rag_prompt(input_data, doc_ids, top_k_chunks)
        actual_input = enhanced_input
        print(f"RAG Applied - Original: {len(input_data)} chars, Enhanced: {len(enhanced_input)} chars")
    else:
        actual_input = input_data
        print("RAG Not Applied")
    last_context = context

    # Prepare messages
    new_message = {"role": "user", "content": actual_input}
    system_message = [{"role": "system", "content": system_prompt}] if system_prompt else []
    processed_history = format_conversation_history(chat_history)
    messages = system_message + processed_history + [new_message]

    # Setup streaming: generation runs in a worker thread and pushes decoded
    # text into the streamer, which is consumed below.
    streamer = TextIteratorStreamer(pipe.tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
        "streamer": streamer
    }

    thread = Thread(target=pipe, args=(messages,), kwargs=generation_kwargs)
    thread.start()

    # Decide the badge from what this request actually used, not from globals
    # that another UI event may change while the reply is streaming.
    rag_indicator = ""
    if context:
        rag_indicator = "<div class='rag-context'>📚 RAG Context Applied</div>\n\n"

    # The reasoning and the answer are separated by the literal marker
    # "assistantfinal". Search the accumulated text, case-insensitively, so the
    # marker is found even when it arrives split across two streamed chunks or
    # in a different letter case.
    marker = re.compile(r'assistantfinal', re.IGNORECASE)
    raw = ""
    marker_span = None

    for chunk in streamer:
        raw += chunk
        if marker_span is None:
            found = marker.search(raw)
            if found:
                marker_span = found.span()

        if marker_span is None:
            thinking, final = raw, ""
        else:
            thinking, final = raw[:marker_span[0]], raw[marker_span[1]:]

        clean_thinking = re.sub(r'^analysis\s*', '', thinking).strip()
        clean_final = final.strip()

        formatted = f"{rag_indicator}<details open><summary>Click to view Thinking Process</summary>\n\n{clean_thinking}\n\n</details>\n\n{clean_final}"
        yield formatted

    # The streamer is exhausted once generation ends; reap the worker thread.
    thread.join()
439
-
440
def upload_pdf(file):
    """Index an uploaded PDF and refresh the document checkbox list.

    Args:
        file: value delivered by the gr.File component. With
            ``type="filepath"`` this is a plain path string; objects exposing
            a ``.name`` path attribute are accepted as well. ``None`` means
            nothing was selected.

    Returns:
        A ``(status_html, document_list_update)`` pair for the
        ``upload_status`` and ``document_list`` components.
    """
    from html import escape

    if file is None:
        return (
            gr.update(value="<div class='pdf-status pdf-info'>📁 파일을 선택해주세요</div>"),
            gr.update(choices=[])
        )

    try:
        # A str has no .name attribute, so fall back to the value itself.
        file_path = getattr(file, "name", file)

        # The first 8 hex digits of the content hash serve as the document id,
        # so re-uploading identical bytes maps to the same id.
        digest = hashlib.md5()
        with open(file_path, 'rb') as f:
            for block in iter(lambda: f.read(1 << 20), b""):
                digest.update(block)
        doc_id = f"doc_{digest.hexdigest()[:8]}"

        result = rag_system.process_and_store_pdf(file_path, doc_id)

        if result["success"]:
            # The title comes from the PDF's own metadata; escape it before
            # embedding it in HTML.
            status_html = f"""
            <div class="pdf-status pdf-success">
            ✅ PDF 업로드 완료!<br>
            📄 {escape(str(result['title']))}<br>
            📑 {result['pages']} 페이지 | 🔍 {result['chunks']} 청크
            </div>
            """

            # Rebuild the choices from every stored document and select all.
            doc_choices = [
                f"{stored_id}: {info['metadata']['file_name']}"
                for stored_id, info in rag_system.documents.items()
            ]

            return (
                status_html,
                gr.update(choices=doc_choices, value=doc_choices)
            )

        return (
            f"<div class='pdf-status pdf-error'>❌ 오류: {escape(str(result['error']))}</div>",
            gr.update()
        )

    except Exception as e:
        return (
            f"<div class='pdf-status pdf-error'>❌ 오류: {escape(str(e))}</div>",
            gr.update()
        )
486
-
487
def clear_documents():
    """Forget every stored document and clear the current selection.

    Returns:
        A ``(status_update, document_list_update)`` pair: an informational
        status banner and an emptied checkbox group.
    """
    global selected_docs

    # Rebind each store to a fresh dict instead of mutating the old one.
    for store_name in ("documents", "document_chunks", "embeddings_store"):
        setattr(rag_system, store_name, {})
    selected_docs = []

    status_update = gr.update(value="<div class='pdf-status pdf-info'>🗑️ 모든 문서가 삭제되었습니다</div>")
    list_update = gr.update(choices=[], value=[])
    return (status_update, list_update)
499
-
500
def update_rag_settings(enable, docs, k):
    """Store the RAG controls in module globals and refresh the status panels.

    Args:
        enable: state of the "Enable RAG" checkbox.
        docs: selected document labels; a falsy value is stored as ``[]``.
        k: number of context chunks requested per query.

    Returns:
        A ``(rag_status_update, context_preview_update)`` pair. The preview
        is shown only when RAG is enabled and at least one document is
        selected.
    """
    global rag_enabled, selected_docs, top_k_chunks
    rag_enabled, selected_docs, top_k_chunks = enable, (docs if docs else []), k

    # Debug logging
    print(f"RAG Settings Updated - Enabled: {rag_enabled}, Docs: {selected_docs}, Top-K: {top_k_chunks}")

    rag_active = enable and docs
    if rag_active:
        status = "✅ Enabled"
    else:
        status = "⭕ Disabled"
    status_html = f"<div class='pdf-status pdf-info'>🔍 RAG: <strong>{status}</strong></div>"

    if not rag_active:
        return gr.update(value=status_html), gr.update(value="", visible=False)

    # Summarise what will be used as context for the next query.
    preview = f"<div class='rag-context'>📚 Using {len(docs)} document(s) with {k} chunks per query</div>"
    return gr.update(value=status_html), gr.update(value=preview, visible=True)
519
-
520
# Build the interface
# Layout: a narrow left column with the PDF/RAG controls and a wide right
# column hosting the chat. Event wiring follows the layout.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, fill_height=True) as demo:
    gr.Markdown("# 🚀 GPT-OSS-20B with PDF RAG System")
    gr.Markdown("Enhanced AI assistant with document-based context understanding")

    with gr.Row():
        # Left sidebar for RAG controls
        with gr.Column(scale=1):
            with gr.Group(elem_classes="main-container"):
                gr.Markdown("### 📚 Document RAG Settings")

                pdf_upload = gr.File(
                    label="Upload PDF",
                    file_types=[".pdf"],
                    type="filepath"
                )

                upload_status = gr.HTML(
                    value="<div class='pdf-status pdf-info'>📤 Upload a PDF to enable document-based answers</div>"
                )

                # Choices are filled in by upload_pdf and emptied by clear_documents.
                document_list = gr.CheckboxGroup(
                    choices=[],
                    label="📄 Select Documents",
                    info="Choose documents to use as context"
                )

                clear_btn = gr.Button("🗑️ Clear All Documents", size="sm", variant="secondary")

                enable_rag = gr.Checkbox(
                    label="✨ Enable RAG",
                    value=False,
                    info="Use documents for context-aware responses"
                )

                top_k_slider = gr.Slider(
                    minimum=1,
                    maximum=5,
                    value=3,
                    step=1,
                    label="Context Chunks",
                    info="Number of document chunks to use"
                )

                # RAG status display
                rag_status = gr.HTML(
                    value="<div class='pdf-status pdf-info'>🔍 RAG: <strong>Disabled</strong></div>"
                )

                # Hidden until update_rag_settings makes it visible.
                context_preview = gr.HTML(value="", visible=False)

        # Right side for chat interface
        with gr.Column(scale=3):
            with gr.Group(elem_classes="main-container"):
                # Create ChatInterface with custom function
                # additional_inputs are passed to generate_response after
                # (message, history), in the order listed here.
                chat_interface = gr.ChatInterface(
                    fn=generate_response,
                    additional_inputs=[
                        gr.Slider(label="Max new tokens", minimum=64, maximum=4096, step=1, value=2048),
                        gr.Textbox(
                            label="System Prompt",
                            value="You are a helpful assistant. Reasoning: medium",
                            lines=4,
                            placeholder="Change system prompt"
                        ),
                        gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.7),
                        gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
                        gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=50),
                        gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.0)
                    ],
                    examples=[
                        [{"text": "Explain Newton laws clearly and concisely"}],
                        [{"text": "Write a Python function to calculate the Fibonacci sequence"}],
                        [{"text": "What are the benefits of open weight AI models"}],
                    ],
                    cache_examples=False,
                    type="messages",
                    description="""Chat with GPT-OSS-20B. Upload PDFs to enhance responses with document context.
Click to view thinking process (default is on).""",
                    textbox=gr.Textbox(
                        label="Query Input",
                        placeholder="Type your prompt (RAG will be applied if enabled)"
                    ),
                    stop_btn="Stop Generation",
                    multimodal=False
                )

    # Event handlers
    pdf_upload.upload(
        fn=upload_pdf,
        inputs=[pdf_upload],
        outputs=[upload_status, document_list]
    )

    clear_btn.click(
        fn=clear_documents,
        outputs=[upload_status, document_list]
    )

    # Update RAG settings when changed
    # All three controls call the same handler with the same inputs, so the
    # module-level RAG state follows whichever control changed last.
    enable_rag.change(
        fn=update_rag_settings,
        inputs=[enable_rag, document_list, top_k_slider],
        outputs=[rag_status, context_preview]
    )

    document_list.change(
        fn=update_rag_settings,
        inputs=[enable_rag, document_list, top_k_slider],
        outputs=[rag_status, context_preview]
    )

    top_k_slider.change(
        fn=update_rag_settings,
        inputs=[enable_rag, document_list, top_k_slider],
        outputs=[rag_status, context_preview]
    )

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch(share=True)