Spaces:
Sleeping
Sleeping
| from langchain_core.documents import Document | |
| from typing import Tuple, List | |
| import pandas as pd | |
| import re | |
class Metadata:
    """Resolve table and picture references carried in chunk metadata.

    A reference database is loaded from a CSV file into ``self.df``.  The
    lookups performed by this class read the columns ``source``,
    ``self_ref``, ``chunk_type`` and ``page_content`` from that frame.
    """

    # Metadata fields that may carry references, in the order they are scanned.
    _REF_FIELDS = ("self_ref", "parent_ref", "child_ref")
    # Only references with one of these prefixes are kept.
    _REF_PREFIXES = ("#/tables/", "#/pictures/")
    # References inside one field are separated by commas and/or whitespace.
    _REF_SEPARATOR = re.compile(r"[,\s]+")

    def __init__(self, ref_database_path: str):
        """Load the reference database from the CSV at *ref_database_path*."""
        self.df = pd.read_csv(ref_database_path)

    @classmethod
    def _split_refs(cls, raw: object) -> List[str]:
        """Return the table/picture references contained in one raw field.

        Anything that is not a string (``None``, a NaN float, ...) is treated
        as "no references" instead of raising inside ``re.split``.
        """
        if not isinstance(raw, str):
            return []
        return [
            token
            for token in cls._REF_SEPARATOR.split(raw)
            if token.startswith(cls._REF_PREFIXES)
        ]

    def extract_ref_from_metadata(self, meta_data: dict) -> dict:
        """Extract table and picture references from one metadata dict.

        Args:
            meta_data: Metadata of a single chunk.  The keys ``source``,
                ``self_ref``, ``parent_ref`` and ``child_ref`` are read; all
                of them are optional.

        Returns:
            A dict with the key ``"source"`` (empty string when absent) and,
            only when at least one matching reference was found, the key
            ``"self_ref"`` holding the de-duplicated references from all
            three fields in first-seen order.
        """
        extracted = {"source": meta_data.get("source", "")}

        collected: List[str] = []
        for field in self._REF_FIELDS:
            collected.extend(self._split_refs(meta_data.get(field)))

        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the result is stable across runs (a set would not be).
        unique_refs = list(dict.fromkeys(collected))
        if unique_refs:
            extracted["self_ref"] = unique_refs
        return extracted

    def extract_all_ref_from_retrived_chunks(self, chunks: "List[Document]") -> dict:
        """Extract references for every retrieved chunk.

        Args:
            chunks: Iterable of objects exposing a ``metadata`` dict.

        Returns:
            A dict mapping ``"doc_<index>"`` to the result of
            :meth:`extract_ref_from_metadata` for the chunk at that index.
        """
        all_metadata = {}
        for idx, doc in enumerate(chunks):
            extracted = self.extract_ref_from_metadata(doc.metadata)
            if extracted:
                all_metadata[f"doc_{idx}"] = extracted
        return all_metadata

    def get_data_from_ref(self, chunks: "List[Document]") -> Tuple[dict, dict]:
        """Look up the tables and pictures referenced by the given chunks.

        Each reference is matched against ``self.df`` on both ``source`` and
        ``self_ref``; the first matching row supplies the content.

        Args:
            chunks: Iterable of objects exposing a ``metadata`` dict.

        Returns:
            A ``(tables, images)`` tuple of dicts, each mapping a reference
            string to the ``page_content`` of its row.  Rows whose
            ``chunk_type`` is neither ``"table"`` nor ``"picture"`` are
            ignored.  When the same reference occurs for several chunks, the
            last one processed wins.
        """
        tables = {}
        images = {}

        for meta in self.extract_all_ref_from_retrived_chunks(chunks).values():
            refs = meta.get("self_ref", [])
            if not refs:
                continue

            # Filter by source once per chunk rather than once per reference.
            source_rows = self.df[self.df["source"] == meta.get("source", "")]
            if source_rows.empty:
                continue

            for ref in refs:
                matches = source_rows[source_rows["self_ref"] == ref]
                if matches.empty:
                    continue

                chunk_type = matches["chunk_type"].iloc[0]
                page_content = matches["page_content"].iloc[0]
                if chunk_type == "table":
                    tables[ref] = page_content
                elif chunk_type == "picture":
                    images[ref] = page_content

        return tables, images