Spaces:
Running
Running
| from typing import ( | |
| TYPE_CHECKING, | |
| BinaryIO, | |
| Iterable, | |
| List, | |
| Optional, | |
| Sequence, | |
| Union, | |
| cast, | |
| ) | |
| from pdf2zh import utils | |
| from pdf2zh.pdfcolor import PDFColorSpace | |
| from pdf2zh.pdffont import PDFFont, PDFUnicodeNotDefined | |
| from pdf2zh.pdfpage import PDFPage | |
| from pdf2zh.pdftypes import PDFStream | |
| from pdf2zh.psparser import PSLiteral | |
| from pdf2zh.utils import Matrix, PathSegment, Point, Rect | |
| if TYPE_CHECKING: | |
| from pdf2zh.pdfinterp import ( | |
| PDFGraphicState, | |
| PDFResourceManager, | |
| PDFStackT, | |
| PDFTextState, | |
| ) | |
| PDFTextSeq = Iterable[Union[int, float, bytes]] | |
| class PDFDevice: | |
| """Translate the output of PDFPageInterpreter to the output that is needed""" | |
| def __init__(self, rsrcmgr: "PDFResourceManager") -> None: | |
| self.rsrcmgr = rsrcmgr | |
| self.ctm: Optional[Matrix] = None | |
| def __repr__(self) -> str: | |
| return "<PDFDevice>" | |
| def __enter__(self) -> "PDFDevice": | |
| return self | |
| def __exit__(self, exc_type: object, exc_val: object, exc_tb: object) -> None: | |
| self.close() | |
| def close(self) -> None: | |
| pass | |
| def set_ctm(self, ctm: Matrix) -> None: | |
| self.ctm = ctm | |
| def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None: | |
| pass | |
| def end_tag(self) -> None: | |
| pass | |
| def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None: | |
| pass | |
| def begin_page(self, page: PDFPage, ctm: Matrix) -> None: | |
| pass | |
| def end_page(self, page: PDFPage) -> None: | |
| pass | |
| def begin_figure(self, name: str, bbox: Rect, matrix: Matrix) -> None: | |
| pass | |
| def end_figure(self, name: str) -> None: | |
| pass | |
| def paint_path( | |
| self, | |
| graphicstate: "PDFGraphicState", | |
| stroke: bool, | |
| fill: bool, | |
| evenodd: bool, | |
| path: Sequence[PathSegment], | |
| ) -> None: | |
| pass | |
| def render_image(self, name: str, stream: PDFStream) -> None: | |
| pass | |
| def render_string( | |
| self, | |
| textstate: "PDFTextState", | |
| seq: PDFTextSeq, | |
| ncs: PDFColorSpace, | |
| graphicstate: "PDFGraphicState", | |
| ) -> None: | |
| pass | |
| class PDFTextDevice(PDFDevice): | |
| def render_string( | |
| self, | |
| textstate: "PDFTextState", | |
| seq: PDFTextSeq, | |
| ncs: PDFColorSpace, | |
| graphicstate: "PDFGraphicState", | |
| ) -> None: | |
| assert self.ctm is not None | |
| matrix = utils.mult_matrix(textstate.matrix, self.ctm) | |
| font = textstate.font | |
| fontsize = textstate.fontsize | |
| scaling = textstate.scaling * 0.01 | |
| charspace = textstate.charspace * scaling | |
| wordspace = textstate.wordspace * scaling | |
| rise = textstate.rise | |
| assert font is not None | |
| if font.is_multibyte(): | |
| wordspace = 0 | |
| dxscale = 0.001 * fontsize * scaling | |
| if font.is_vertical(): | |
| textstate.linematrix = self.render_string_vertical( | |
| seq, | |
| matrix, | |
| textstate.linematrix, | |
| font, | |
| fontsize, | |
| scaling, | |
| charspace, | |
| wordspace, | |
| rise, | |
| dxscale, | |
| ncs, | |
| graphicstate, | |
| ) | |
| else: | |
| textstate.linematrix = self.render_string_horizontal( | |
| seq, | |
| matrix, | |
| textstate.linematrix, | |
| font, | |
| fontsize, | |
| scaling, | |
| charspace, | |
| wordspace, | |
| rise, | |
| dxscale, | |
| ncs, | |
| graphicstate, | |
| ) | |
| def render_string_horizontal( | |
| self, | |
| seq: PDFTextSeq, | |
| matrix: Matrix, | |
| pos: Point, | |
| font: PDFFont, | |
| fontsize: float, | |
| scaling: float, | |
| charspace: float, | |
| wordspace: float, | |
| rise: float, | |
| dxscale: float, | |
| ncs: PDFColorSpace, | |
| graphicstate: "PDFGraphicState", | |
| ) -> Point: | |
| (x, y) = pos | |
| needcharspace = False | |
| for obj in seq: | |
| if isinstance(obj, (int, float)): | |
| x -= obj * dxscale | |
| needcharspace = True | |
| else: | |
| for cid in font.decode(obj): | |
| if needcharspace: | |
| x += charspace | |
| x += self.render_char( | |
| utils.translate_matrix(matrix, (x, y)), | |
| font, | |
| fontsize, | |
| scaling, | |
| rise, | |
| cid, | |
| ncs, | |
| graphicstate, | |
| ) | |
| if cid == 32 and wordspace: | |
| x += wordspace | |
| needcharspace = True | |
| return (x, y) | |
| def render_string_vertical( | |
| self, | |
| seq: PDFTextSeq, | |
| matrix: Matrix, | |
| pos: Point, | |
| font: PDFFont, | |
| fontsize: float, | |
| scaling: float, | |
| charspace: float, | |
| wordspace: float, | |
| rise: float, | |
| dxscale: float, | |
| ncs: PDFColorSpace, | |
| graphicstate: "PDFGraphicState", | |
| ) -> Point: | |
| (x, y) = pos | |
| needcharspace = False | |
| for obj in seq: | |
| if isinstance(obj, (int, float)): | |
| y -= obj * dxscale | |
| needcharspace = True | |
| else: | |
| for cid in font.decode(obj): | |
| if needcharspace: | |
| y += charspace | |
| y += self.render_char( | |
| utils.translate_matrix(matrix, (x, y)), | |
| font, | |
| fontsize, | |
| scaling, | |
| rise, | |
| cid, | |
| ncs, | |
| graphicstate, | |
| ) | |
| if cid == 32 and wordspace: | |
| y += wordspace | |
| needcharspace = True | |
| return (x, y) | |
| def render_char( | |
| self, | |
| matrix: Matrix, | |
| font: PDFFont, | |
| fontsize: float, | |
| scaling: float, | |
| rise: float, | |
| cid: int, | |
| ncs: PDFColorSpace, | |
| graphicstate: "PDFGraphicState", | |
| ) -> float: | |
| return 0 | |
| class TagExtractor(PDFDevice): | |
| def __init__( | |
| self, | |
| rsrcmgr: "PDFResourceManager", | |
| outfp: BinaryIO, | |
| codec: str = "utf-8", | |
| ) -> None: | |
| PDFDevice.__init__(self, rsrcmgr) | |
| self.outfp = outfp | |
| self.codec = codec | |
| self.pageno = 0 | |
| self._stack: List[PSLiteral] = [] | |
| def render_string( | |
| self, | |
| textstate: "PDFTextState", | |
| seq: PDFTextSeq, | |
| ncs: PDFColorSpace, | |
| graphicstate: "PDFGraphicState", | |
| ) -> None: | |
| font = textstate.font | |
| assert font is not None | |
| text = "" | |
| for obj in seq: | |
| if isinstance(obj, str): | |
| obj = utils.make_compat_bytes(obj) | |
| if not isinstance(obj, bytes): | |
| continue | |
| chars = font.decode(obj) | |
| for cid in chars: | |
| try: | |
| char = font.to_unichr(cid) | |
| text += char | |
| except PDFUnicodeNotDefined: | |
| pass | |
| self._write(utils.enc(text)) | |
| def begin_page(self, page: PDFPage, ctm: Matrix) -> None: | |
| output = '<page id="%s" bbox="%s" rotate="%d">' % ( | |
| self.pageno, | |
| utils.bbox2str(page.mediabox), | |
| page.rotate, | |
| ) | |
| self._write(output) | |
| def end_page(self, page: PDFPage) -> None: | |
| self._write("</page>\n") | |
| self.pageno += 1 | |
| def begin_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None: | |
| s = "" | |
| if isinstance(props, dict): | |
| s = "".join( | |
| [ | |
| f' {utils.enc(k)}="{utils.make_compat_str(v)}"' | |
| for (k, v) in sorted(props.items()) | |
| ], | |
| ) | |
| out_s = f"<{utils.enc(cast(str, tag.name))}{s}>" | |
| self._write(out_s) | |
| self._stack.append(tag) | |
| def end_tag(self) -> None: | |
| assert self._stack, str(self.pageno) | |
| tag = self._stack.pop(-1) | |
| out_s = "</%s>" % utils.enc(cast(str, tag.name)) | |
| self._write(out_s) | |
| def do_tag(self, tag: PSLiteral, props: Optional["PDFStackT"] = None) -> None: | |
| self.begin_tag(tag, props) | |
| self._stack.pop(-1) | |
| def _write(self, s: str) -> None: | |
| self.outfp.write(s.encode(self.codec)) | |