German is now available: the UI language can be switched, each language uses its own embedding model, resources and prompts can be ingested in both languages, and vector retrieval is now supported for both languages.
db81bb8
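
For context, the per-language routing the commit describes could look like the following minimal sketch. This is an assumption, not the project's actual vector_manager: the model names, the Chroma collections, and the add_multi_documents signature are all hypothetical, inferred only from how the ingestion code below calls it.

# Hypothetical sketch of a language-keyed vector_manager.
# Model names, collection names, and this module layout are assumptions.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

_EMBEDDINGS = {
    "en": HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    "de": HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"),
}

# One vector store per language, each backed by its own embedding model.
_STORES = {
    lang: Chroma(collection_name=f"fedlex_{lang}", embedding_function=emb)
    for lang, emb in _EMBEDDINGS.items()
}

def add_multi_documents(docs, language):
    # Route the documents to the store for the requested language,
    # so retrieval in one language never mixes in the other's embeddings.
    _STORES[language].add_documents(docs)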
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.schema import Document
from app.managers import vector_manager as vm
import os


def ingest_fedlex_pdf(file_path, language):
    """Load a Fedlex PDF, split it into articles, and ingest them into
    the vector store for the given language."""
    loader = PyMuPDFLoader(file_path)
    docs = loader.load()
    full_text = "\n".join(doc.page_content for doc in docs)
    # Fedlex statutes start each article on a new line with "Art.",
    # so splitting on that marker yields one chunk per article.
    chunks = full_text.split("\nArt.")
    print(len(chunks))  # debug: number of raw chunks before filtering
    processed_chunks, prefixes = filter_chunks_fedlex(chunks)
    processed_docs = []
    for i, chunk in enumerate(processed_chunks):
        metadata = {
            "source": os.path.basename(file_path),
            "article_number": prefixes[i],
        }
        processed_docs.append(Document(page_content=chunk, metadata=metadata))
    vm.add_multi_documents(processed_docs, language)
    print(f"✅ Ingested {len(processed_docs)} articles from {file_path}")
def filter_chunks_fedlex(chunks):
    """Clean up the raw "Art." chunks: drop fragments, repair article
    numbers, and return the chunks together with their "Art. N" prefixes."""
    chunks = chunks[1:]  # drop the preamble before the first article
    processed_chunks = []
    prefixes = []
    for i, chunk in enumerate(chunks):
        if len(chunk) < 40:
            # Too short to be a real article (e.g. a table-of-contents entry).
            continue
        prefix, postfix = chunk.split(maxsplit=1)
        # Heuristics for PDF extraction noise: footnote or page digits can be
        # fused onto the article number, and numbers may carry a letter suffix
        # such as "12a". Article numbers are assumed to stay below 200.
        if len(prefix) > 3:
            if prefix[3].isalpha():
                prefix = prefix[:4]   # e.g. "123a..." -> keep "123a"
            elif prefix[2].isalpha():
                prefix = prefix[:3]   # e.g. "12a..." -> keep "12a"
            elif int(prefix[:3]) >= 200:
                prefix = prefix[:2]   # implausibly high: trailing digits are noise
            else:
                prefix = prefix[:3]   # plain three-digit article number
        elif len(prefix) == 3 and not prefix[2].isalpha():
            if int(prefix[:3]) >= 200 and i < 20:
                prefix = prefix[:1]   # early articles: keep the first digit only
        chunk = "Art. " + prefix.strip() + " " + postfix.strip()
        processed_chunks.append(chunk)
        prefixes.append("Art. " + prefix.strip())
    return processed_chunks, prefixes
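
A call site might look like the lines below. The file paths are hypothetical placeholders; only the language codes "de" and "en" follow from the commit description.

# Hypothetical usage: ingest the German and English versions of a statute.
ingest_fedlex_pdf("data/fedlex_or_de.pdf", "de")
ingest_fedlex_pdf("data/fedlex_or_en.pdf", "en")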