# NOTE: header left over from the web file viewer this script was copied from
# (it is not part of the program; the viewer's line-number gutter was dropped).
# Spaces: Running | File size: 3,347 Bytes | commit fb2123c
import os, re, sys
from typing import List, Optional, Set
import fitz
import pytesseract
from PIL import Image
from io import BytesIO
# --- Configuration ----------------------------------------------------------
# Input PDF and output text file (absolute, machine-specific Windows paths).
PDF_PATH = r"D:\Webchatbot\Dataset\Penjas\PJOK_BS_KLS_VI.pdf"
OUTPUT_TXT = r"D:\Webchatbot\Dataset\Penjas\Clean\Penjas Kelas VI.txt"
# 1-based page numbers that are never OCR'd: 1-21, 200-210 and 213-225.
SKIP_PAGES = list(range(1, 22)) + list(range(200, 211)) + list(range(213, 226))
# Explicit path to the Tesseract executable; leave empty to use pytesseract's
# default lookup.
TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Tesseract language packs passed as ``lang`` to pytesseract.
OCR_LANG = "ind+eng"
# Render resolution; main() scales each page by DPI / 72 before OCR.
DPI = 300

# Only override pytesseract's executable when the configured file really
# exists. Forcing a non-existent path would break OCR on machines where
# Tesseract is installed elsewhere but reachable through PATH.
if TESSERACT_CMD and os.path.isfile(TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
# Matches tokens that look like web addresses:
#   1. explicit http:// or https:// URLs,
#   2. hosts starting with "www.",
#   3. bare host names ending in one of the listed TLDs, optionally followed
#      by a port, path, query or fragment.
# In case 3 the TLD must end at a word boundary. Without it, OCR text that
# lacks a space after a full stop (e.g. "permainan.Coba") would be treated as
# a ".co" host name and deleted.
URL_RE = re.compile(
    r"(https?://\S+|www\.\S+"
    r"|\b\S+\.(?:com|org|net|edu|gov|go|id|co)\b(?:[:/?#]\S*)?)",
    flags=re.IGNORECASE,
)
# Matches a line that consists only of a chapter heading: "bab"/"BAB", a
# number or an upper-case roman numeral, and optionally a separator plus
# title. The pattern is case-sensitive, so title-case "Bab" is not matched.
BAB_LINE_RE = re.compile(
    r"^\s*(?:bab|BAB)\s*(?:[0-9]+|[IVXLCDM]+)\s*(?:[:\-–—]\s*.*)?\s*$"
)
# Matches a chapter prefix at the start of a line so it can be stripped while
# keeping the rest of the line. Only the word "bab" is case-insensitive.
# Roman numerals must be upper-case, separated from "bab" by whitespace and
# end at a word boundary; otherwise ordinary sentences such as
# "Bab ini membahas ..." would lose their first letters.
BAB_PREFIX_RE = re.compile(
    r"^\s*(?i:bab)(?:\s*[0-9]+|\s+[IVXLCDM]+)\b\s*(?:[:\-–—]\s*)?",
)
# Typographic dashes and quotes mapped to their ASCII equivalents. They are
# translated before the character whitelist in clean_text() runs; otherwise
# they would be deleted outright (gluing "2020–2021" into "20202021") and the
# dash separators accepted by the chapter-heading patterns could never match.
_TYPOGRAPHIC_TO_ASCII = str.maketrans(
    {
        "\u2010": "-",  # hyphen
        "\u2011": "-",  # non-breaking hyphen
        "\u2012": "-",  # figure dash
        "\u2013": "-",  # en dash
        "\u2014": "-",  # em dash
        "\u2015": "-",  # horizontal bar
        "\u2212": "-",  # minus sign
        "\u2018": "'",  # left single quote
        "\u2019": "'",  # right single quote / apostrophe
        "\u201c": '"',  # left double quote
        "\u201d": '"',  # right double quote
    }
)


def clean_text(text: str) -> str:
    """Normalize raw OCR output into clean, single-spaced lines.

    Processing order:

    1. Remove every token matched by ``URL_RE``.
    2. Convert typographic dashes and quotes to ASCII.
    3. Delete every character that is not tab, line feed, carriage return,
       printable ASCII, or in the range U+00C0-U+00FF.
    4. For each line: collapse whitespace runs to one space and trim; drop
       empty lines; drop lines that are entirely a chapter heading
       (``BAB_LINE_RE``); strip a leading chapter prefix
       (``BAB_PREFIX_RE``) and drop the line if nothing remains.

    Args:
        text: Raw text; ``None`` or an empty string is treated as empty.

    Returns:
        The surviving lines joined with a single newline, without leading
        or trailing whitespace. Empty string if nothing survives.
    """
    text = URL_RE.sub("", text or "")
    text = text.translate(_TYPOGRAPHIC_TO_ASCII)
    text = re.sub(r"[^\x09\x0A\x0D\x20-\x7EÀ-ÿ]", "", text)

    cleaned_lines: List[str] = []
    for raw_ln in text.splitlines():
        # split()/join collapses any whitespace run (tabs included) to a
        # single space and trims both ends in one pass.
        ln = " ".join(raw_ln.split())
        if not ln or BAB_LINE_RE.match(ln):
            continue
        ln = BAB_PREFIX_RE.sub("", ln).strip()
        if ln:
            cleaned_lines.append(ln)

    return "\n".join(cleaned_lines).strip()
def pixmap_to_pil(pix: fitz.Pixmap) -> Image.Image:
    """Turn a PyMuPDF pixmap into a PIL image by way of an in-memory PNG.

    Args:
        pix: Rendered page pixmap.

    Returns:
        A PIL image opened on the PNG bytes produced from the pixmap.
    """
    # Pixmaps that carry an alpha channel are first copied through the RGB
    # colorspace constructor.
    # NOTE(review): confirm against the PyMuPDF docs whether this copy is
    # meant to drop alpha; Pixmap(colorspace, source) is a colorspace
    # conversion.
    source = fitz.Pixmap(fitz.csRGB, pix) if pix.alpha else pix
    png_stream = BytesIO(source.tobytes("png"))
    return Image.open(png_stream)
def ocr_page(img: Image.Image, lang: str) -> str:
    """Run Tesseract on one page image and return the cleaned text.

    Args:
        img: Page image to recognize.
        lang: Tesseract language specification passed through as ``lang``.

    Returns:
        The recognized text after it has been passed through ``clean_text``.
    """
    raw_text = pytesseract.image_to_string(img, lang=lang)
    return clean_text(raw_text)
def main():
    """OCR every non-skipped page of ``PDF_PATH`` and write ``OUTPUT_TXT``.

    Each page whose 1-based number is not in ``SKIP_PAGES`` is rendered at
    ``DPI`` and passed to ``ocr_page``. Pages that fail to render or OCR are
    reported and treated as empty, so one bad page does not discard the work
    already done. Non-empty page texts are written to ``OUTPUT_TXT`` (UTF-8),
    each followed by a blank line, and a summary is printed.

    Exits with status 1 if ``PDF_PATH`` does not exist.
    """
    if not os.path.exists(PDF_PATH):
        print(f"PDF tidak ditemukan: {PDF_PATH}")
        sys.exit(1)

    skip: Set[int] = set(SKIP_PAGES or [])
    zoom = DPI / 72.0  # scale factor relative to the 72-dpi base resolution
    mat = fitz.Matrix(zoom, zoom)
    results: List[str] = []
    skipped = 0
    kept = 0

    doc = fitz.open(PDF_PATH)
    try:
        total = doc.page_count
        print(f"[*] Total halaman: {total} | DPI render: {DPI}")
        for page_num in range(1, total + 1):
            if page_num in skip:
                skipped += 1
                print(f"Halaman {page_num} dilewati.")
                continue

            print(f"Halaman {page_num}: OCR …")
            try:
                # Rendering sits inside the try as well: a page that cannot
                # be rasterized should be skipped, not abort the whole run.
                page = doc.load_page(page_num - 1)  # load_page is 0-based
                pix = page.get_pixmap(matrix=mat, alpha=False)
                txt = ocr_page(pixmap_to_pil(pix), OCR_LANG)
            except Exception as e:
                # Deliberate best-effort: report the page and carry on.
                print(f"[!] OCR gagal halaman {page_num}: {e}")
                txt = ""

            txt = txt.strip()
            if txt:
                results.append(txt)
                kept += 1
            else:
                print(f"Halaman {page_num}: hasil kosong/pendek.")
    finally:
        # Release the document even if something unexpected escapes the loop.
        doc.close()

    out_dir = os.path.dirname(OUTPUT_TXT)
    if out_dir:  # makedirs("") raises, so skip it for bare file names
        os.makedirs(out_dir, exist_ok=True)
    with open(OUTPUT_TXT, "w", encoding="utf-8") as f:
        f.writelines(t + "\n\n" for t in results)

    print("\nRingkasan:")
    print(f"- Total halaman : {total}")
    print(f"- Dilewati (skip) : {skipped}")
    print(f"- Tersimpan (non-skip): {kept}")
    print(f"[*] Output: {OUTPUT_TXT}")
# Run the OCR pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|