import pandas as pd
import gradio as gr
from llama_index.core.base.llms.base import BaseLLM

from autorag.data.qa.evolve.llama_index_query_evolve import (
    compress_ragas,
    reasoning_evolve_ragas,
)
from autorag.data.qa.filter.dontknow import dontknow_filter_rule_based
from autorag.data.qa.filter.passage_dependency import passage_dependency_filter_llama_index
from autorag.data.qa.generation_gt.llama_index_gen_gt import (
    make_basic_gen_gt,
    make_concise_gen_gt,
)
from autorag.data.qa.query.llama_gen_query import factoid_query_gen
from autorag.data.qa.sample import random_single_hop
from autorag.data.qa.schema import Corpus, QA
def default_create(corpus_df, llm: BaseLLM, n: int = 100, lang: str = "en",
                   batch_size: int = 32,
                   progress=gr.Progress()) -> QA:
    """Build the default QA dataset from a corpus.

    Pipeline: sample single-hop passages, generate factoid queries, generate
    basic then concise ground-truth answers, drop "don't know" rows, and keep
    only queries that actually depend on their passage.

    :param corpus_df: Corpus dataframe accepted by ``Corpus``.
    :param llm: LLM used for generation and LLM-based filtering.
    :param n: Number of passages to sample (clamped to the corpus size).
    :param lang: Language code passed to every generation/filter step.
    :param batch_size: Batch size for each LLM batch call.
    :param progress: Gradio progress reporter (updated after each stage).
    :return: The filtered initial QA dataset.
    """
    corpus = Corpus(corpus_df)
    # Never request more samples than the corpus can provide.
    sample_count = min(n, len(corpus.data))
    sampled = (
        corpus.sample(random_single_hop, n=sample_count)
        .map(lambda df: df.reset_index(drop=True))
        .make_retrieval_gt_contents()
    )
    progress(0.05)
    with_queries = sampled.batch_apply(
        factoid_query_gen, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.2)
    with_basic_gt = with_queries.batch_apply(
        make_basic_gen_gt, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.4)
    with_concise_gt = with_basic_gt.batch_apply(
        make_concise_gen_gt, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.6)
    # Rule-based pass first (cheap), then the LLM-based dependency filter.
    without_dontknow = with_concise_gt.filter(dontknow_filter_rule_based, lang=lang)
    progress(0.8)
    qa = without_dontknow.batch_filter(
        passage_dependency_filter_llama_index, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.96)
    return qa
def fast_create(corpus_df, llm: BaseLLM, n: int = 100, lang: str = "en",
                batch_size: int = 32,
                progress=gr.Progress()) -> QA:
    """Build a QA dataset quickly by skipping both filtering stages.

    Same generation pipeline as ``default_create`` (factoid queries, basic and
    concise ground truths) but without the "don't know" and passage-dependency
    filters, trading quality for speed.

    :param corpus_df: Corpus dataframe accepted by ``Corpus``.
    :param llm: LLM used for all generation steps.
    :param n: Number of passages to sample (clamped to the corpus size).
    :param lang: Language code passed to every generation step.
    :param batch_size: Batch size for each LLM batch call.
    :param progress: Gradio progress reporter (updated after each stage).
    :return: The unfiltered QA dataset.
    """
    corpus = Corpus(corpus_df)
    progress(0.05)
    # Clamp the sample size to what the corpus actually contains.
    sample_count = min(n, len(corpus.data))
    prepared = (
        corpus.sample(random_single_hop, n=sample_count)
        .map(lambda df: df.reset_index(drop=True))
    )
    progress(0.1)
    contents = prepared.make_retrieval_gt_contents()
    progress(0.2)
    with_queries = contents.batch_apply(
        factoid_query_gen, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.3)
    with_basic_gt = with_queries.batch_apply(
        make_basic_gen_gt, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.5)
    qa = with_basic_gt.batch_apply(
        make_concise_gen_gt, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.75)
    progress(0.9)
    return qa
def advanced_create(corpus_df, llm: BaseLLM, n: int = 100, lang: str = "en",
                    batch_size: int = 32,
                    progress=gr.Progress()) -> QA:
    """Build a QA dataset mixing hard and easy questions.

    Runs the full default pipeline (generation plus both filters), then evolves
    the first half of the surviving rows with reasoning evolution (harder) and
    compresses the second half (easier), concatenating the two halves.

    :param corpus_df: Corpus dataframe accepted by ``Corpus``.
    :param llm: LLM used for generation, filtering, and evolution.
    :param n: Number of passages to sample (clamped to the corpus size).
    :param lang: Language code passed to every LLM step.
    :param batch_size: Batch size for each LLM batch call.
    :param progress: Gradio progress reporter (updated after each stage).
    :return: QA dataset of evolved (reasoning + compressed) questions, linked
        to the full corpus.
    """
    corpus_instance = Corpus(corpus_df)
    if len(corpus_instance.data) < n:
        n = len(corpus_instance.data)
    sampled_corpus = corpus_instance.sample(random_single_hop, n=n)
    mapped_corpus = sampled_corpus.map(lambda df: df.reset_index(drop=True))
    retrieval_gt_contents = mapped_corpus.make_retrieval_gt_contents()
    progress(0.05)
    query_generated = retrieval_gt_contents.batch_apply(
        factoid_query_gen, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.15)
    basic_answers = query_generated.batch_apply(
        make_basic_gen_gt, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.25)
    concise_answers = basic_answers.batch_apply(
        make_concise_gen_gt, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.35)
    filtered_answers = concise_answers.filter(dontknow_filter_rule_based, lang=lang)
    progress(0.45)
    initial_qa = filtered_answers.batch_filter(
        passage_dependency_filter_llama_index, llm=llm, lang=lang, batch_size=batch_size
    )
    progress(0.55)
    # BUGFIX: split on the actual post-filter row count, not the requested n.
    # The two filters above can drop rows, so ``n // 2`` could exceed the
    # remaining data and leave the compressed half empty (or badly skewed).
    cut_idx = len(initial_qa.data) // 2
    reasoning_qa = initial_qa.map(lambda df: df.iloc[:cut_idx]).batch_apply(
        reasoning_evolve_ragas,
        llm=llm,
        lang=lang,
        batch_size=batch_size,
    )
    progress(0.75)
    # Reset the index so downstream positional access starts at 0 again.
    compressed_qa = (
        initial_qa.map(lambda df: df.iloc[cut_idx:])
        .map(lambda df: df.reset_index(drop=True))
        .batch_apply(
            compress_ragas,
            llm=llm,
            lang=lang,
            batch_size=batch_size,
        )
    )
    progress(0.95)
    final_qa = QA(
        pd.concat([reasoning_qa.data, compressed_qa.data], ignore_index=True),
        linked_corpus=corpus_instance,
    )
    return final_qa