Spaces:

Shami96
/

PDF-Data_Extractor

Running

PDF-Data_Extractor / src /adapters /ml /vgt /get_json_annotations.py

Wasim

Sync: robust vehicle parser + full project

2e237ce 3 months ago

2.43 kB

	import json
	from os import makedirs
	from pdf_features import PdfToken
	from domain.PdfImages import PdfImages
	from configuration import DOCLAYNET_TYPE_BY_ID
	from configuration import JSONS_ROOT_PATH, JSON_TEST_FILE_PATH


	def save_annotations_json(annotations: list, width_height: list, images: list):
	    """Write a COCO-style JSON document for the given images and annotations.

	    The document has four top-level keys -- ``info``, ``images``,
	    ``categories`` (one entry per item of ``DOCLAYNET_TYPE_BY_ID``) and
	    ``annotations`` -- and is written to ``JSON_TEST_FILE_PATH``.

	    Args:
	        annotations: Annotation dicts, stored unchanged under ``"annotations"``.
	        width_height: ``(width, height)`` pairs, one per entry of ``images``
	            and in the same order.
	        images: Image identifiers; ``".jpg"`` is appended to build ``file_name``.

	    Raises:
	        IndexError: If ``width_height`` has fewer entries than ``images``.
	    """
	    # Sizes are paired with images by position. Searching for the position with
	    # ``images.index(image_id)`` is quadratic and always finds the first match,
	    # so a repeated identifier would receive the wrong dimensions.
	    # NOTE(review): "id" is the integer position, whereas annotations built by
	    # get_annotation carry the string identifier under "image_id" -- confirm
	    # that the consumer of this file expects that.
	    images_dict = [
	        {
	            "id": i,
	            "file_name": image_id + ".jpg",
	            "width": width_height[i][0],
	            "height": width_height[i][1],
	        }
	        for i, image_id in enumerate(images)
	    ]

	    categories_dict = [{"id": key, "name": value} for key, value in DOCLAYNET_TYPE_BY_ID.items()]

	    info_dict = {
	        "description": "PDF Document Layout Analysis Dataset",
	        "url": "",
	        "version": "1.0",
	        "year": 2025,
	        "contributor": "",
	        "date_created": "2025-01-01",
	    }

	    coco_dict = {
	        "info": info_dict,
	        "images": images_dict,
	        "categories": categories_dict,
	        "annotations": annotations,
	    }

	    JSON_TEST_FILE_PATH.write_text(json.dumps(coco_dict))


	def get_annotation(index: int, image_id: str, token: PdfToken):
	    """Build the annotation dict for a single token.

	    Args:
	        index: Value stored under ``"id"``.
	        image_id: Value stored under ``"image_id"``.
	        token: Token whose ``bounding_box`` supplies ``"bbox"`` as
	            ``[left, top, width, height]`` and whose
	            ``token_type.get_index()`` supplies ``"category_id"``.

	    Returns:
	        A dict with the keys ``area``, ``iscrowd``, ``score``, ``image_id``,
	        ``bbox``, ``category_id`` and ``id``, in that order. ``area`` and
	        ``score`` are always ``1`` and ``iscrowd`` is always ``0``.
	    """
	    box = token.bounding_box
	    bbox = [box.left, box.top, box.width, box.height]
	    category_id = token.token_type.get_index()
	    # Keyword order fixes the key order of the resulting dict.
	    return dict(
	        area=1,
	        iscrowd=0,
	        score=1,
	        image_id=image_id,
	        bbox=bbox,
	        category_id=category_id,
	        id=index,
	    )


	def get_annotations_for_document(annotations, images, index, pdf_images, width_height):
	    """Append one document's page images and token annotations to the given lists.

	    For every page of ``pdf_images.pdf_features`` the identifier
	    ``"<file_name>_<page_number - 1>"`` is appended to ``images``, and the
	    ``(width, height)`` of the entry at the same position in
	    ``pdf_images.pdf_images`` is appended to ``width_height``. Every token on
	    the page then adds one ``get_annotation`` result to ``annotations``;
	    annotation ids start at ``index`` and grow by one per token across the
	    whole document.

	    The three lists are modified in place and nothing is returned. The id
	    counter is local to this function, so the caller has to advance its own.
	    """
	    features = pdf_images.pdf_features
	    annotation_id = index
	    for position, page in enumerate(features.pages):
	        image_id = f"{features.file_name}_{page.page_number - 1}"
	        images.append(image_id)
	        page_image = pdf_images.pdf_images[position]
	        width_height.append((page_image.width, page_image.height))
	        for token in page.tokens:
	            annotations.append(get_annotation(annotation_id, image_id, token))
	            annotation_id += 1


	def get_annotations(pdf_images_list: list[PdfImages]):
	    """Collect the annotations of every document and save them as one JSON file.

	    Creates ``JSONS_ROOT_PATH`` if needed, lets
	    ``get_annotations_for_document`` fill shared lists of annotations, image
	    identifiers and image sizes for each entry of ``pdf_images_list``, and
	    hands the result to ``save_annotations_json``.
	    """
	    makedirs(JSONS_ROOT_PATH, exist_ok=True)
	    all_annotations, image_ids, image_sizes = [], [], []
	    next_annotation_id = 0
	    for document in pdf_images_list:
	        get_annotations_for_document(
	            all_annotations, image_ids, next_annotation_id, document, image_sizes
	        )
	        # The helper does not return its counter, so skip past this document's
	        # tokens here to keep annotation ids unique across documents.
	        next_annotation_id += sum(len(page.tokens) for page in document.pdf_features.pages)
	    save_annotations_json(all_annotations, image_sizes, image_ids)