Spaces:
Running
Running
| import os, re, sys | |
| from typing import List, Optional, Set | |
| import fitz | |
| import pytesseract | |
| from PIL import Image | |
| from io import BytesIO | |
| PDF_PATH = r"D:\Webchatbot\Dataset\Penjas\PJOK_BS_KLS_VI.pdf" | |
| OUTPUT_TXT = r"D:\Webchatbot\Dataset\Penjas\Clean\Penjas Kelas VI.txt" | |
| SKIP_PAGES = list(range(1, 22)) + list(range(200, 211)) + list(range(213, 226)) | |
| TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe" | |
| OCR_LANG = "ind+eng" | |
| DPI = 300 | |
| if TESSERACT_CMD: | |
| pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD | |
| URL_RE = re.compile( | |
| r"(https?://\S+|www\.\S+|\b\S+\.(?:com|org|net|edu|gov|go|id|co)\S*)", | |
| flags=re.IGNORECASE, | |
| ) | |
| BAB_LINE_RE = re.compile( | |
| r"^\s*(?:bab|BAB)\s*(?:[0-9]+|[IVXLCDM]+)\s*(?:[:\-ββ]\s*.*)?\s*$" | |
| ) | |
| BAB_PREFIX_RE = re.compile( | |
| r"^\s*(?:bab|BAB)\s*(?:[0-9]+|[IVXLCDM]+)\s*(?:[:\-ββ]\s*)?", | |
| flags=re.IGNORECASE, | |
| ) | |
| def clean_text(text: str) -> str: | |
| text = URL_RE.sub("", text or "") | |
| text = text.replace("\t", " ") | |
| text = re.sub(r"[^\x09\x0A\x0D\x20-\x7EΓ-ΓΏ]", "", text) | |
| cleaned_lines: List[str] = [] | |
| for raw_ln in text.splitlines(): | |
| ln = re.sub(r"\s+", " ", raw_ln).strip() | |
| if not ln: | |
| continue | |
| if BAB_LINE_RE.match(ln): | |
| continue | |
| ln = BAB_PREFIX_RE.sub("", ln).strip() | |
| if not ln: | |
| continue | |
| cleaned_lines.append(ln) | |
| text_out = "\n".join(cleaned_lines).strip() | |
| return text_out | |
| def pixmap_to_pil(pix: fitz.Pixmap) -> Image.Image: | |
| if pix.alpha: | |
| pix = fitz.Pixmap(fitz.csRGB, pix) | |
| img_bytes = pix.tobytes("png") | |
| return Image.open(BytesIO(img_bytes)) | |
| def ocr_page(img: Image.Image, lang: str) -> str: | |
| return clean_text(pytesseract.image_to_string(img, lang=lang)) | |
| def main(): | |
| if not os.path.exists(PDF_PATH): | |
| print(f"PDF tidak ditemukan: {PDF_PATH}") | |
| sys.exit(1) | |
| doc = fitz.open(PDF_PATH) | |
| total = doc.page_count | |
| skip: Set[int] = set(SKIP_PAGES or []) | |
| zoom = DPI / 72.0 | |
| mat = fitz.Matrix(zoom, zoom) | |
| results: List[str] = [] | |
| skipped = 0 | |
| kept = 0 | |
| print(f"[*] Total halaman: {total} | DPI render: {DPI}") | |
| for page_num in range(1, total + 1): | |
| if page_num in skip: | |
| skipped += 1 | |
| print(f"Halaman {page_num} dilewati.") | |
| continue | |
| page = doc.load_page(page_num - 1) | |
| pix = page.get_pixmap(matrix=mat, alpha=False) | |
| img = pixmap_to_pil(pix) | |
| print(f"Halaman {page_num}: OCR β¦") | |
| try: | |
| txt = ocr_page(img, OCR_LANG) | |
| except Exception as e: | |
| print(f"[!] OCR gagal halaman {page_num}: {e}") | |
| txt = "" | |
| if txt.strip(): | |
| results.append(txt.strip()) | |
| kept += 1 | |
| else: | |
| print(f"Halaman {page_num}: hasil kosong/pendek.") | |
| doc.close() | |
| os.makedirs(os.path.dirname(OUTPUT_TXT), exist_ok=True) | |
| with open(OUTPUT_TXT, "w", encoding="utf-8") as f: | |
| for t in results: | |
| if not t.strip(): | |
| continue | |
| f.write(t + "\n\n") | |
| print("\nRingkasan:") | |
| print(f"- Total halaman : {total}") | |
| print(f"- Dilewati (skip) : {skipped}") | |
| print(f"- Tersimpan (non-skip): {kept}") | |
| print(f"[*] Output: {OUTPUT_TXT}") | |
| if __name__ == "__main__": | |
| main() | |