import os
import tempfile
from typing import List

from langchain_core.documents import Document as LangchainDocument
from llama_index import Document
from llama_parse import LlamaParse, ResultType

from _utils.langchain_utils.splitter_util import SplitterUtils
from setup.logging import Axiom

# LlamaParse API keys, tried in order until one succeeds (either may be unset)
llama_parser_keys = [
    os.getenv("LLAMA_CLOUD_API_KEY_POPS"),
    os.getenv("LLAMA_CLOUD_API_KEY_PEIXE"),
]

def handle_pdf_files_from_serializer(files, axiom_instance: Axiom):
    listaPDFs = []
    for file in files:
        file_extension = file.name.split(".")[-1]
        file.seek(0)
        # Create a temporary file to save the uploaded PDF
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=f".{file_extension}"
        ) as temp_file:
            # Write the uploaded file content to the temporary file
            for chunk in file.chunks():
                temp_file.write(chunk)
            temp_file_path = temp_file.name  # Get the path of the temporary file
        listaPDFs.append(temp_file_path)
    axiom_instance.send_axiom(f"listaPDFs: {listaPDFs}")
    return listaPDFs
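
# Usage sketch (an assumption, not part of the original module): the .name,
# .seek() and .chunks() calls above imply Django UploadedFile objects, e.g.
# from a DRF serializer. The temp paths returned here are meant to be cleaned
# up with remove_pdf_temp_files below:
#
#     lista_pdfs = handle_pdf_files_from_serializer(files, axiom_instance)
#     try:
#         ...  # parse the PDFs
#     finally:
#         remove_pdf_temp_files(lista_pdfs)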

def remove_pdf_temp_files(listaPDFs):
    print("\nREMOVING TEMPORARY PDF FILES")
    for file in listaPDFs:
        os.remove(file)

async def return_document_list_with_llama_parser(file: str):
    for key in llama_parser_keys:
        documents: List[LangchainDocument] = []
        if key:
            parser = LlamaParse(
                api_key=key,
                result_type=ResultType.JSON,  # Options: 'text', 'markdown', 'json', 'structured'
                language="pt",
                verbose=True,
            )
            try:
                parsed_document = await parser.aget_json(file)
            except Exception:
                print(f"Error with llama parser key ending with {key[-4:]}")
                continue  # Skip to the next key
            if len(parsed_document) == 0:
                continue
            for doc in parsed_document[0].get("pages"):  # type: ignore
                # documents.append(doc.to_langchain_format())
                langchain_document = LangchainDocument(
                    page_content=doc.get("md"),  # type: ignore
                    metadata={
                        "page": doc.get("page"),  # type: ignore
                        # **doc.get("metadata", {}),  # type: ignore
                    },  # Include page number in metadata
                )
                documents.append(langchain_document)
            return documents
    # Only reached if the loop above finished without ever returning a value
    raise ValueError("SOMETHING WENT WRONG IN THE LLAMA PARSE PARSER")
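

# Minimal usage sketch (an assumption, not part of the original module): drive
# the async parser from a script entry point. "example.pdf" is a hypothetical
# path such as one returned by handle_pdf_files_from_serializer.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        docs = await return_document_list_with_llama_parser("example.pdf")  # hypothetical path
        for doc in docs:
            print(doc.metadata["page"], doc.page_content[:80])

    asyncio.run(_demo())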