Spaces:

Dyraa18
/

Rag-dan-Guardrail

Running

App Files Files Community

Rag-dan-Guardrail / Rag-Pipeline /cleans2.py

Dyraa18

Upload 8 files

fb2123c verified 15 days ago

raw

history blame contribute delete

3.35 kB


	import os, re, sys
	from typing import List, Optional, Set
	import fitz
	import pytesseract
	from PIL import Image
	from io import BytesIO

	# --- Run configuration -------------------------------------------------------
	# Input PDF and the text file the cleaned OCR result is written to.
	PDF_PATH = r"D:\Webchatbot\Dataset\Penjas\PJOK_BS_KLS_VI.pdf"
	OUTPUT_TXT = r"D:\Webchatbot\Dataset\Penjas\Clean\Penjas Kelas VI.txt"

	# 1-based page numbers that are not OCR'd: 1-21, 200-210 and 213-225.
	SKIP_PAGES = [*range(1, 22), *range(200, 211), *range(213, 226)]

	# Path of the Tesseract executable; leave empty to keep pytesseract's default.
	TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
	# Language spec handed to pytesseract as `lang=`.
	OCR_LANG = "ind+eng"
	# Render resolution; main() turns it into a zoom factor of DPI / 72.
	DPI = 300

	# Point pytesseract at the configured binary only when one is given.
	if TESSERACT_CMD:
	    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD


	# Text that looks like a web address. Three alternatives, tried in order:
	#   1. anything starting with "http://" or "https://";
	#   2. anything starting with "www.";
	#   3. any whitespace-free token containing ".<suffix>" for one of the
	#      listed suffixes (com, org, net, edu, gov, go, id, co).
	# The alternation bars must be plain "|": an escaped "\|" matches a literal
	# pipe character and turns the whole pattern into a single fixed sequence.
	URL_RE = re.compile(
	    r"(https?://\S+|www\.\S+|\b\S+\.(?:com|org|net|edu|gov|go|id|co)\S*)",
	    flags=re.IGNORECASE,
	)

	# A line that consists only of a chapter heading: "bab"/"BAB", a decimal or
	# upper-case Roman number, and optionally a ":" or dash followed by a title.
	# Whitespace is optional everywhere (\s*) because callers match against
	# lines that are already stripped and whitespace-collapsed.
	BAB_LINE_RE = re.compile(
	    r"^\s*(?:bab|BAB)\s*(?:[0-9]+|[IVXLCDM]+)\s*(?:[:\-–—]\s*.*)?\s*$"
	)

	# The same heading marker at the start of a line that continues with other
	# text; only the marker (and one trailing ":"/dash separator) is matched.
	# NOTE(review): with IGNORECASE and no mandatory space after "bab", ordinary
	# words such as "Babi" also match ("bab" + Roman "i") — confirm this is
	# acceptable for the corpus.
	BAB_PREFIX_RE = re.compile(
	    r"^\s*(?:bab|BAB)\s*(?:[0-9]+|[IVXLCDM]+)\s*(?:[:\-–—]\s*)?",
	    flags=re.IGNORECASE,
	)

	def clean_text(text: str) -> str:
	    """Normalise raw OCR text into clean, non-empty lines.

	    Steps, in order:
	      1. remove everything URL_RE matches;
	      2. fold en/em dashes to "-" and tabs to spaces;
	      3. drop every character outside tab, LF, CR, printable ASCII and
	         U+00C0-U+00FF;
	      4. per line: collapse whitespace, skip empty lines and lines that are
	         only a chapter heading (BAB_LINE_RE), and strip a leading chapter
	         marker (BAB_PREFIX_RE) from the remaining lines.

	    Args:
	        text: Raw text; ``None`` or an empty string is treated as "".

	    Returns:
	        The surviving lines joined with "\\n" ("" when nothing survives).
	    """
	    text = URL_RE.sub("", text or "")

	    # En/em dashes are not in the whitelist below. Deleting them would fuse
	    # ranges such as "10–15" into "1015" and would make the dash alternatives
	    # in the BAB patterns unreachable, so map them to an ASCII hyphen first.
	    text = text.replace("\u2013", "-").replace("\u2014", "-")
	    text = text.replace("\t", " ")
	    text = re.sub(r"[^\x09\x0A\x0D\x20-\x7EÀ-ÿ]", "", text)

	    cleaned_lines: List[str] = []
	    for raw_ln in text.splitlines():
	        # split()/join collapses whitespace runs and trims both ends.
	        ln = " ".join(raw_ln.split())
	        if not ln or BAB_LINE_RE.match(ln):
	            continue

	        ln = BAB_PREFIX_RE.sub("", ln).strip()
	        if ln:
	            cleaned_lines.append(ln)

	    return "\n".join(cleaned_lines)

	def pixmap_to_pil(pix: fitz.Pixmap) -> Image.Image:
	    """Turn a PyMuPDF pixmap into a PIL image via an in-memory PNG.

	    Args:
	        pix: The rendered pixmap.

	    Returns:
	        The image PIL opens from the pixmap's PNG bytes.
	    """
	    # A pixmap with an alpha channel is first re-created with the RGB
	    # colorspace. NOTE(review): confirm what this constructor does with the
	    # alpha channel itself.
	    source = fitz.Pixmap(fitz.csRGB, pix) if pix.alpha else pix
	    png_buffer = BytesIO(source.tobytes("png"))
	    return Image.open(png_buffer)

	def ocr_page(img: Image.Image, lang: str) -> str:
	    """OCR one page image and return the text after clean_text().

	    Args:
	        img: Page image to recognise.
	        lang: Language spec forwarded to pytesseract as ``lang=``.

	    Returns:
	        The cleaned OCR text.
	    """
	    raw_text = pytesseract.image_to_string(img, lang=lang)
	    return clean_text(raw_text)

	def main():
	    """OCR every non-skipped page of PDF_PATH and write the text to OUTPUT_TXT.

	    Each page that is not in SKIP_PAGES is rendered at DPI, OCR'd with
	    OCR_LANG and cleaned. Pages that yield text are written to OUTPUT_TXT,
	    each followed by a blank line. A summary is printed at the end.

	    An OCR failure on a page is reported and the page is treated as empty, so
	    one bad page does not abort the run. Rendering errors are not caught.

	    Raises:
	        SystemExit: With status 1 when PDF_PATH does not exist.
	    """
	    if not os.path.exists(PDF_PATH):
	        print(f"PDF tidak ditemukan: {PDF_PATH}")
	        sys.exit(1)

	    skip: Set[int] = set(SKIP_PAGES or [])

	    # Scale factor relative to 72 — presumably the renderer's base
	    # resolution; confirm against the PyMuPDF documentation.
	    zoom = DPI / 72.0
	    mat = fitz.Matrix(zoom, zoom)

	    results: List[str] = []
	    skipped = 0

	    # The context manager closes the document even if rendering raises.
	    with fitz.open(PDF_PATH) as doc:
	        total = doc.page_count
	        print(f"[*] Total halaman: {total} | DPI render: {DPI}")

	        # Page numbers shown to the user are 1-based; load_page() is 0-based.
	        for page_num in range(1, total + 1):
	            if page_num in skip:
	                skipped += 1
	                print(f"Halaman {page_num} dilewati.")
	                continue

	            page = doc.load_page(page_num - 1)
	            pix = page.get_pixmap(matrix=mat, alpha=False)
	            img = pixmap_to_pil(pix)

	            print(f"Halaman {page_num}: OCR …")
	            try:
	                txt = ocr_page(img, OCR_LANG)
	            except Exception as e:
	                # Deliberate best effort: report and carry on with the next page.
	                print(f"[!] OCR gagal halaman {page_num}: {e}")
	                txt = ""

	            txt = txt.strip()
	            if txt:
	                results.append(txt)
	            else:
	                print(f"Halaman {page_num}: hasil kosong/pendek.")

	    # dirname() is "" for a bare file name, and makedirs("") would raise.
	    out_dir = os.path.dirname(OUTPUT_TXT)
	    if out_dir:
	        os.makedirs(out_dir, exist_ok=True)
	    with open(OUTPUT_TXT, "w", encoding="utf-8") as f:
	        # Only non-empty texts were collected, so no re-check is needed here.
	        f.writelines(f"{t}\n\n" for t in results)

	    print("\nRingkasan:")
	    print(f"- Total halaman : {total}")
	    print(f"- Dilewati (skip) : {skipped}")
	    print(f"- Tersimpan (non-skip): {len(results)}")
	    print(f"[*] Output: {OUTPUT_TXT}")

	# Run the OCR job only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()