nanfangwuyu21's picture
German (Deutsch) is now available: the language can be switched, a different embedding model is used per language, resources are ingested for both languages, prompts exist for both, and vector retrieval is now supported for both languages.
db81bb8
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.schema import Document
from app.managers import vector_manager as vm
import os
def ingest_fedlex_pdf(file_path, language):
    """Load a PDF, split it into article chunks and store them as documents.

    The text of all pages is joined with newlines and split on the literal
    ``"\\nArt."``. The raw pieces are cleaned up by ``filter_chunks_fedlex``,
    wrapped in ``Document`` objects and handed to
    ``vm.add_multi_documents`` together with ``language``.

    Args:
        file_path: Path of the PDF to read; its base name is stored as the
            ``source`` metadata of every document.
        language: Passed through unchanged to ``vm.add_multi_documents``
            (presumably selects the per-language vector store -- confirm
            against the vector manager).
    """
    pages = PyMuPDFLoader(file_path).load()
    joined_text = "\n".join(page.page_content for page in pages)
    raw_pieces = joined_text.split("\nArt.")
    # Progress output: number of raw pieces before filtering.
    print(len(raw_pieces))

    article_texts, article_labels = filter_chunks_fedlex(raw_pieces)

    source_name = os.path.basename(file_path)
    processed_docs = [
        Document(
            page_content=text,
            metadata={"source": source_name, "article_number": label},
        )
        for text, label in zip(article_texts, article_labels)
    ]

    vm.add_multi_documents(processed_docs, language)
    print(f"βœ… Ingested {len(processed_docs)} articles from {file_path}")
def _is_at_least_200(digits):
    """Return True when *digits* parses as an integer that is >= 200.

    Text that ``int()`` rejects counts as "not large" instead of raising, so
    one malformed token cannot abort a whole ingest run.
    """
    try:
        return int(digits) >= 200
    except ValueError:
        return False


def filter_chunks_fedlex(chunks):
    """Turn raw ``"\\nArt."``-split pieces into cleaned article texts.

    The first element of ``chunks`` (the text before the first split marker)
    is always discarded. Each remaining piece is skipped when it is shorter
    than 40 characters or when it does not contain both a leading token and
    following body text. Otherwise the leading whitespace-delimited token is
    taken as the article identifier and trimmed by the heuristics below
    (presumably to drop footnote digits that PDF extraction fuses onto the
    article number -- confirm against real documents).

    Args:
        chunks: List of strings produced by splitting the document text on
            ``"\\nArt."``.

    Returns:
        A tuple ``(processed_chunks, prefixes)`` of two equally long lists:
        the rebuilt texts ``"Art. <id> <body>"`` and the matching labels
        ``"Art. <id>"``.
    """
    processed_chunks = []
    prefixes = []
    # ``i`` counts every piece after the discarded first one, including the
    # pieces that end up being skipped; the ``i < 20`` rule relies on that.
    for i, chunk in enumerate(chunks[1:]):
        if len(chunk) < 40:
            continue

        parts = chunk.split(maxsplit=1)
        if len(parts) < 2:
            # Whitespace-only piece or a single long token: there is no
            # identifier/body pair to build an article from.
            continue
        prefix, postfix = parts

        if len(prefix) > 3:
            if prefix[3].isalpha():
                # Three characters followed by a letter: keep all four.
                prefix = prefix[:4]
            elif prefix[2].isalpha():
                # Two characters followed by a letter: keep those three.
                prefix = prefix[:3]
            elif _is_at_least_200(prefix[:3]):
                # A leading value of 200 or more is read as a two-character
                # identifier with extra digits attached.
                prefix = prefix[:2]
            else:
                prefix = prefix[:3]
        elif len(prefix) == 3 and not prefix[2].isalpha():
            # Among the first 20 pieces a three-digit value >= 200 is read
            # as a one-character identifier with extra digits attached.
            if i < 20 and _is_at_least_200(prefix):
                prefix = prefix[:1]

        label = "Art. " + prefix
        processed_chunks.append(label + " " + postfix.strip())
        prefixes.append(label)
    return processed_chunks, prefixes