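"""HybridRAG demo Space: recommend training programs and YouTube content to employees.

Employee skills, program skills, and YouTube video descriptions are embedded with a
Korean Sentence-BERT model and matched by cosine similarity. Expected CSV columns:
employee data (employee_id, employee_name, current_skills), program data
(program_name, skills_acquired, duration), and YouTube data (title, description,
url, upload_date). Results are shown in a table, offered as a CSV download, and
can be queried per employee through a simple chatbot.
"""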
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import csv
import tempfile
import os
import atexit

# Load the KoSentence-BERT model for Korean text processing
model = SentenceTransformer('jhgan/ko-sbert-sts')

# Global state shared between the analysis step, the chatbot, and the CSV download
global_recommendations = None
global_csv_file = None
youtube_columns = None
# Write the recommendation rows to a temporary CSV file and return its path
def create_csv_file(recommendations):
    with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.csv',
                                     encoding='utf-8', newline='') as temp_file:
        writer = csv.writer(temp_file)
        writer.writerow(["Employee ID", "Employee Name", "Recommended Programs", "Recommended YouTube Content"])
        for rec in recommendations:
            writer.writerow(rec)
    return temp_file.name
# Fuzzy-match required column names against the columns present in a DataFrame
def auto_match_columns(df, required_cols):
    matched_cols = {}
    for req_col in required_cols:
        matched_col = None
        for col in df.columns:
            if req_col.lower() in col.lower():
                matched_col = col
                break
        matched_cols[req_col] = matched_col
    return matched_cols
# Verify that every required employee and program column could be matched
def validate_and_get_columns(employee_df, program_df):
    required_employee_cols = ["employee_id", "employee_name", "current_skills"]
    required_program_cols = ["program_name", "skills_acquired", "duration"]
    employee_cols = auto_match_columns(employee_df, required_employee_cols)
    program_cols = auto_match_columns(program_df, required_program_cols)
    for key, value in employee_cols.items():
        if value is None:
            return f"Could not find the '{key}' column in the employee data. Please select the correct column.", None, None
    for key, value in program_cols.items():
        if value is None:
            return f"Could not find the '{key}' column in the program data. Please select the correct column.", None, None
    return None, employee_cols, program_cols
# Populate the column dropdowns when a YouTube CSV is uploaded
def select_youtube_columns(youtube_file):
    global youtube_columns
    if youtube_file is None:
        return [gr.Dropdown(choices=[], value="") for _ in range(4)]
    youtube_df = pd.read_csv(youtube_file.name)
    required_youtube_cols = ["title", "description", "url", "upload_date"]
    youtube_columns = auto_match_columns(youtube_df, required_youtube_cols)
    column_options = youtube_df.columns.tolist()
    return [
        gr.Dropdown(choices=column_options, value=youtube_columns.get("title", "")),
        gr.Dropdown(choices=column_options, value=youtube_columns.get("description", "")),
        gr.Dropdown(choices=column_options, value=youtube_columns.get("url", "")),
        gr.Dropdown(choices=column_options, value=youtube_columns.get("upload_date", "")),
    ]
# Load the YouTube content CSV and normalize the user-selected columns
def load_youtube_content(file_path, title_col, description_col, url_col, upload_date_col):
    youtube_df = pd.read_csv(file_path)
    selected_columns = [col for col in [title_col, description_col, url_col, upload_date_col] if col]
    youtube_df = youtube_df[selected_columns]
    column_mapping = {
        title_col: 'title',
        description_col: 'description',
        url_col: 'url',
        upload_date_col: 'upload_date',
    }
    youtube_df.rename(columns=column_mapping, inplace=True)
    if 'upload_date' in youtube_df.columns:
        youtube_df['upload_date'] = pd.to_datetime(youtube_df['upload_date'], errors='coerce')
    return youtube_df
# Compute similarities between program skill descriptions and YouTube video descriptions
def match_youtube_content(program_skills, youtube_df, model):
    if 'description' not in youtube_df.columns:
        return None
    youtube_embeddings = model.encode(youtube_df['description'].tolist())
    program_embeddings = model.encode(program_skills)
    similarities = cosine_similarity(program_embeddings, youtube_embeddings)
    return similarities
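# Note: the matrix returned above has shape (n_programs, n_videos); row j holds the
# cosine similarity of program j against every video, which is how hybrid_rag picks
# the top videos for each matched program below.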
# Analyze the employee data, recommend training programs and YouTube content, and build the result table
def hybrid_rag(employee_file, program_file, youtube_file, title_col, description_col, url_col, upload_date_col):
    global global_recommendations
    global global_csv_file

    # Load employee and program data
    employee_df = pd.read_csv(employee_file.name)
    program_df = pd.read_csv(program_file.name)
    error_msg, employee_cols, program_cols = validate_and_get_columns(employee_df, program_df)
    if error_msg:
        return error_msg, None, None

    # Embed employee and program skills and compute pairwise similarities
    employee_skills = employee_df[employee_cols["current_skills"]].tolist()
    program_skills = program_df[program_cols["skills_acquired"]].tolist()
    employee_embeddings = model.encode(employee_skills)
    program_embeddings = model.encode(program_skills)
    similarities = cosine_similarity(employee_embeddings, program_embeddings)

    # Load and process the YouTube content
    youtube_df = load_youtube_content(youtube_file.name, title_col, description_col, url_col, upload_date_col)

    # Match YouTube content to training programs
    youtube_similarities = match_youtube_content(program_df[program_cols['skills_acquired']].tolist(), youtube_df, model)

    recommendations = []
    recommendation_rows = []
    for i, employee in employee_df.iterrows():
        recommended_programs = []
        recommended_youtube = []
        for j, program in program_df.iterrows():
            if similarities[i][j] > 0.5:
                recommended_programs.append(f"{program[program_cols['program_name']]} ({program[program_cols['duration']]})")
                if youtube_similarities is not None:
                    top_youtube_indices = youtube_similarities[j].argsort()[-3:][::-1]  # top 3 videos for this program
                    for idx in top_youtube_indices:
                        if 'title' in youtube_df.columns and 'url' in youtube_df.columns:
                            recommended_youtube.append(f"{youtube_df.iloc[idx]['title']} (URL: {youtube_df.iloc[idx]['url']})")

        # Cap the number of recommended programs and YouTube items
        recommended_programs = recommended_programs[:5]  # at most 5 programs
        recommended_youtube = recommended_youtube[:3]  # at most 3 YouTube items

        if recommended_programs:
            recommendation = f"Recommended programs for {employee[employee_cols['employee_name']]}: {', '.join(recommended_programs)}"
            youtube_recommendation = f"Recommended YouTube content: {', '.join(recommended_youtube)}" if recommended_youtube else "No YouTube content to recommend."
            recommendation_rows.append([employee[employee_cols['employee_id']], employee[employee_cols['employee_name']],
                                        ", ".join(recommended_programs), ", ".join(recommended_youtube)])
        else:
            recommendation = f"No suitable programs found for {employee[employee_cols['employee_name']]}."
            youtube_recommendation = "No YouTube content to recommend."
            recommendation_rows.append([employee[employee_cols['employee_id']], employee[employee_cols['employee_name']],
                                        "No suitable programs", "No recommended content"])
        recommendations.append(recommendation + "\n" + youtube_recommendation)

    global_recommendations = recommendation_rows

    # Write the CSV file for download
    global_csv_file = create_csv_file(recommendation_rows)

    # Build the result table DataFrame
    result_df = pd.DataFrame(recommendation_rows, columns=["Employee ID", "Employee Name", "Recommended Programs", "Recommended YouTube Content"])
    return result_df, gr.File(value=global_csv_file, visible=True), gr.Button(value="Download CSV", visible=True)
# Chatbot response: look up the stored recommendations for the employee named in the message
def chat_response(message, history):
    global global_recommendations
    history = history or []
    if global_recommendations is None:
        history.append((message, "Please click the 'Start Analysis' button first to analyze the data."))
        return history
    for employee in global_recommendations:
        if str(employee[1]).lower() in message.lower():
            reply = (f"The programs recommended for {employee[1]} are: {employee[2]}\n\n"
                     f"Recommended YouTube content: {employee[3]}")
            history.append((message, reply))
            return history
    history.append((message, "Sorry, I could not find that employee. Please enter another employee name."))
    return history
# Return the generated CSV file so Gradio shows it for download
def download_csv():
    global global_csv_file
    return gr.File(value=global_csv_file, visible=True)
# Gradio UI
with gr.Blocks(css=".gradio-button {background-color: #007bff; color: white;} .gradio-textbox {border-color: #6c757d;}") as demo:
    gr.Markdown("<h1 style='text-align: center; color: #2c3e50;'>💼 HybridRAG System (with YouTube Content)</h1>")

    with gr.Row():
        with gr.Column(scale=1, min_width=300):
            gr.Markdown("<h3 style='color: #34495e;'>1. Upload your data</h3>")
            employee_file = gr.File(label="Upload employee data", interactive=True)
            program_file = gr.File(label="Upload training program data", interactive=True)
            youtube_file = gr.File(label="Upload YouTube content data", interactive=True)

            gr.Markdown("<h4 style='color: #34495e;'>Select YouTube data columns</h4>")
            title_col = gr.Dropdown(label="Title column")
            description_col = gr.Dropdown(label="Description column")
            url_col = gr.Dropdown(label="URL column")
            upload_date_col = gr.Dropdown(label="Upload date column")
            youtube_file.change(select_youtube_columns, inputs=[youtube_file],
                                outputs=[title_col, description_col, url_col, upload_date_col])

            analyze_button = gr.Button("Start Analysis", elem_classes="gradio-button")
            output_table = gr.DataFrame(label="Analysis results (table)")
            csv_download = gr.File(label="Download recommendation results", visible=False)
            download_button = gr.Button("Download CSV", visible=False)

    gr.Markdown("<h3 style='color: #34495e;'>2. View recommended programs and YouTube content per employee</h3>")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Enter an employee name")
    clear = gr.Button("Clear conversation")

    # Update the result table and file download when the analysis button is clicked
    analyze_button.click(
        hybrid_rag,
        inputs=[employee_file, program_file, youtube_file, title_col, description_col, url_col, upload_date_col],
        outputs=[output_table, csv_download, download_button]
    )

    # CSV download button
    download_button.click(download_csv, inputs=[], outputs=[csv_download])

    # Chat wiring
    msg.submit(chat_response, [msg, chatbot], [chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)
# Remove the temporary CSV file when the program exits
def cleanup():
    global global_csv_file
    if global_csv_file and os.path.exists(global_csv_file):
        os.remove(global_csv_file)

atexit.register(cleanup)

# Launch the Gradio interface
demo.launch()