Spaces:
Running
Running
| import logging | |
| import struct | |
| from io import BytesIO | |
| from typing import ( | |
| TYPE_CHECKING, | |
| Any, | |
| BinaryIO, | |
| Dict, | |
| Iterable, | |
| Iterator, | |
| List, | |
| Mapping, | |
| Optional, | |
| Tuple, | |
| Union, | |
| cast, | |
| ) | |
| from pdf2zh import settings | |
| from pdf2zh.cmapdb import ( | |
| CMap, | |
| CMapBase, | |
| CMapDB, | |
| CMapParser, | |
| FileUnicodeMap, | |
| IdentityUnicodeMap, | |
| UnicodeMap, | |
| ) | |
| from pdf2zh.encodingdb import EncodingDB, name2unicode | |
| from pdf2zh.fontmetrics import FONT_METRICS | |
| from pdf2zh.pdfexceptions import PDFException, PDFKeyError, PDFValueError | |
| from pdf2zh.pdftypes import ( | |
| PDFStream, | |
| dict_value, | |
| int_value, | |
| list_value, | |
| num_value, | |
| resolve1, | |
| resolve_all, | |
| stream_value, | |
| ) | |
| from pdf2zh.psexceptions import PSEOF | |
| from pdf2zh.psparser import ( | |
| KWD, | |
| LIT, | |
| PSKeyword, | |
| PSLiteral, | |
| PSStackParser, | |
| literal_name, | |
| ) | |
| from pdf2zh.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack | |
| if TYPE_CHECKING: | |
| from pdf2zh.pdfinterp import PDFResourceManager | |
| log = logging.getLogger(__name__) | |
| def get_widths(seq: Iterable[object]) -> Dict[int, float]: | |
| """Build a mapping of character widths for horizontal writing.""" | |
| widths: Dict[int, float] = {} | |
| r: List[float] = [] | |
| for v in seq: | |
| if isinstance(v, list): | |
| if r: | |
| char1 = r[-1] | |
| for i, w in enumerate(v): | |
| widths[cast(int, char1) + i] = w | |
| r = [] | |
| elif isinstance(v, (int, float)): # == utils.isnumber(v) | |
| r.append(v) | |
| if len(r) == 3: | |
| (char1, char2, w) = r | |
| for i in range(cast(int, char1), cast(int, char2) + 1): | |
| widths[i] = w | |
| r = [] | |
| return widths | |
| def get_widths2(seq: Iterable[object]) -> Dict[int, Tuple[float, Point]]: | |
| """Build a mapping of character widths for vertical writing.""" | |
| widths: Dict[int, Tuple[float, Point]] = {} | |
| r: List[float] = [] | |
| for v in seq: | |
| if isinstance(v, list): | |
| if r: | |
| char1 = r[-1] | |
| for i, (w, vx, vy) in enumerate(choplist(3, v)): | |
| widths[cast(int, char1) + i] = (w, (vx, vy)) | |
| r = [] | |
| elif isinstance(v, (int, float)): # == utils.isnumber(v) | |
| r.append(v) | |
| if len(r) == 5: | |
| (char1, char2, w, vx, vy) = r | |
| for i in range(cast(int, char1), cast(int, char2) + 1): | |
| widths[i] = (w, (vx, vy)) | |
| r = [] | |
| return widths | |
| class FontMetricsDB: | |
| def get_metrics(cls, fontname: str) -> Tuple[Dict[str, object], Dict[str, int]]: | |
| return FONT_METRICS[fontname] | |
| # int here means that we're not extending PSStackParser with additional types. | |
| class Type1FontHeaderParser(PSStackParser[int]): | |
| KEYWORD_BEGIN = KWD(b"begin") | |
| KEYWORD_END = KWD(b"end") | |
| KEYWORD_DEF = KWD(b"def") | |
| KEYWORD_PUT = KWD(b"put") | |
| KEYWORD_DICT = KWD(b"dict") | |
| KEYWORD_ARRAY = KWD(b"array") | |
| KEYWORD_READONLY = KWD(b"readonly") | |
| KEYWORD_FOR = KWD(b"for") | |
| def __init__(self, data: BinaryIO) -> None: | |
| PSStackParser.__init__(self, data) | |
| self._cid2unicode: Dict[int, str] = {} | |
| def get_encoding(self) -> Dict[int, str]: | |
| """Parse the font encoding. | |
| The Type1 font encoding maps character codes to character names. These | |
| character names could either be standard Adobe glyph names, or | |
| character names associated with custom CharStrings for this font. A | |
| CharString is a sequence of operations that describe how the character | |
| should be drawn. Currently, this function returns '' (empty string) | |
| for character names that are associated with a CharStrings. | |
| Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format | |
| :returns mapping of character identifiers (cid's) to unicode characters | |
| """ | |
| while 1: | |
| try: | |
| _, (cid, name) = self.nextobject() | |
| except PSEOF: | |
| break | |
| try: | |
| self._cid2unicode[cid] = name2unicode(cast(str, name)) | |
| except KeyError: | |
| # log.debug(str(e)) | |
| pass | |
| return self._cid2unicode | |
| def do_keyword(self, pos: int, token: PSKeyword) -> None: | |
| if token is self.KEYWORD_PUT: | |
| ((_, key), (_, value)) = self.pop(2) | |
| if isinstance(key, int) and isinstance(value, PSLiteral): | |
| self.add_results((key, literal_name(value))) | |
| NIBBLES = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "e-", None, "-") | |
| # Mapping of cmap names. Original cmap name is kept if not in the mapping. | |
| # (missing reference for why DLIdent is mapped to Identity) | |
| IDENTITY_ENCODER = { | |
| "DLIdent-H": "Identity-H", | |
| "DLIdent-V": "Identity-V", | |
| } | |
| def getdict(data: bytes) -> Dict[int, List[Union[float, int]]]: | |
| d: Dict[int, List[Union[float, int]]] = {} | |
| fp = BytesIO(data) | |
| stack: List[Union[float, int]] = [] | |
| while 1: | |
| c = fp.read(1) | |
| if not c: | |
| break | |
| b0 = ord(c) | |
| if b0 <= 21: | |
| d[b0] = stack | |
| stack = [] | |
| continue | |
| if b0 == 30: | |
| s = "" | |
| loop = True | |
| while loop: | |
| b = ord(fp.read(1)) | |
| for n in (b >> 4, b & 15): | |
| if n == 15: | |
| loop = False | |
| else: | |
| nibble = NIBBLES[n] | |
| assert nibble is not None | |
| s += nibble | |
| value = float(s) | |
| elif b0 >= 32 and b0 <= 246: | |
| value = b0 - 139 | |
| else: | |
| b1 = ord(fp.read(1)) | |
| if b0 >= 247 and b0 <= 250: | |
| value = ((b0 - 247) << 8) + b1 + 108 | |
| elif b0 >= 251 and b0 <= 254: | |
| value = -((b0 - 251) << 8) - b1 - 108 | |
| else: | |
| b2 = ord(fp.read(1)) | |
| if b1 >= 128: | |
| b1 -= 256 | |
| if b0 == 28: | |
| value = b1 << 8 | b2 | |
| else: | |
| value = b1 << 24 | b2 << 16 | struct.unpack(">H", fp.read(2))[0] | |
| stack.append(value) | |
| return d | |
| class CFFFont: | |
| STANDARD_STRINGS = ( | |
| ".notdef", | |
| "space", | |
| "exclam", | |
| "quotedbl", | |
| "numbersign", | |
| "dollar", | |
| "percent", | |
| "ampersand", | |
| "quoteright", | |
| "parenleft", | |
| "parenright", | |
| "asterisk", | |
| "plus", | |
| "comma", | |
| "hyphen", | |
| "period", | |
| "slash", | |
| "zero", | |
| "one", | |
| "two", | |
| "three", | |
| "four", | |
| "five", | |
| "six", | |
| "seven", | |
| "eight", | |
| "nine", | |
| "colon", | |
| "semicolon", | |
| "less", | |
| "equal", | |
| "greater", | |
| "question", | |
| "at", | |
| "A", | |
| "B", | |
| "C", | |
| "D", | |
| "E", | |
| "F", | |
| "G", | |
| "H", | |
| "I", | |
| "J", | |
| "K", | |
| "L", | |
| "M", | |
| "N", | |
| "O", | |
| "P", | |
| "Q", | |
| "R", | |
| "S", | |
| "T", | |
| "U", | |
| "V", | |
| "W", | |
| "X", | |
| "Y", | |
| "Z", | |
| "bracketleft", | |
| "backslash", | |
| "bracketright", | |
| "asciicircum", | |
| "underscore", | |
| "quoteleft", | |
| "a", | |
| "b", | |
| "c", | |
| "d", | |
| "e", | |
| "f", | |
| "g", | |
| "h", | |
| "i", | |
| "j", | |
| "k", | |
| "l", | |
| "m", | |
| "n", | |
| "o", | |
| "p", | |
| "q", | |
| "r", | |
| "s", | |
| "t", | |
| "u", | |
| "v", | |
| "w", | |
| "x", | |
| "y", | |
| "z", | |
| "braceleft", | |
| "bar", | |
| "braceright", | |
| "asciitilde", | |
| "exclamdown", | |
| "cent", | |
| "sterling", | |
| "fraction", | |
| "yen", | |
| "florin", | |
| "section", | |
| "currency", | |
| "quotesingle", | |
| "quotedblleft", | |
| "guillemotleft", | |
| "guilsinglleft", | |
| "guilsinglright", | |
| "fi", | |
| "fl", | |
| "endash", | |
| "dagger", | |
| "daggerdbl", | |
| "periodcentered", | |
| "paragraph", | |
| "bullet", | |
| "quotesinglbase", | |
| "quotedblbase", | |
| "quotedblright", | |
| "guillemotright", | |
| "ellipsis", | |
| "perthousand", | |
| "questiondown", | |
| "grave", | |
| "acute", | |
| "circumflex", | |
| "tilde", | |
| "macron", | |
| "breve", | |
| "dotaccent", | |
| "dieresis", | |
| "ring", | |
| "cedilla", | |
| "hungarumlaut", | |
| "ogonek", | |
| "caron", | |
| "emdash", | |
| "AE", | |
| "ordfeminine", | |
| "Lslash", | |
| "Oslash", | |
| "OE", | |
| "ordmasculine", | |
| "ae", | |
| "dotlessi", | |
| "lslash", | |
| "oslash", | |
| "oe", | |
| "germandbls", | |
| "onesuperior", | |
| "logicalnot", | |
| "mu", | |
| "trademark", | |
| "Eth", | |
| "onehalf", | |
| "plusminus", | |
| "Thorn", | |
| "onequarter", | |
| "divide", | |
| "brokenbar", | |
| "degree", | |
| "thorn", | |
| "threequarters", | |
| "twosuperior", | |
| "registered", | |
| "minus", | |
| "eth", | |
| "multiply", | |
| "threesuperior", | |
| "copyright", | |
| "Aacute", | |
| "Acircumflex", | |
| "Adieresis", | |
| "Agrave", | |
| "Aring", | |
| "Atilde", | |
| "Ccedilla", | |
| "Eacute", | |
| "Ecircumflex", | |
| "Edieresis", | |
| "Egrave", | |
| "Iacute", | |
| "Icircumflex", | |
| "Idieresis", | |
| "Igrave", | |
| "Ntilde", | |
| "Oacute", | |
| "Ocircumflex", | |
| "Odieresis", | |
| "Ograve", | |
| "Otilde", | |
| "Scaron", | |
| "Uacute", | |
| "Ucircumflex", | |
| "Udieresis", | |
| "Ugrave", | |
| "Yacute", | |
| "Ydieresis", | |
| "Zcaron", | |
| "aacute", | |
| "acircumflex", | |
| "adieresis", | |
| "agrave", | |
| "aring", | |
| "atilde", | |
| "ccedilla", | |
| "eacute", | |
| "ecircumflex", | |
| "edieresis", | |
| "egrave", | |
| "iacute", | |
| "icircumflex", | |
| "idieresis", | |
| "igrave", | |
| "ntilde", | |
| "oacute", | |
| "ocircumflex", | |
| "odieresis", | |
| "ograve", | |
| "otilde", | |
| "scaron", | |
| "uacute", | |
| "ucircumflex", | |
| "udieresis", | |
| "ugrave", | |
| "yacute", | |
| "ydieresis", | |
| "zcaron", | |
| "exclamsmall", | |
| "Hungarumlautsmall", | |
| "dollaroldstyle", | |
| "dollarsuperior", | |
| "ampersandsmall", | |
| "Acutesmall", | |
| "parenleftsuperior", | |
| "parenrightsuperior", | |
| "twodotenleader", | |
| "onedotenleader", | |
| "zerooldstyle", | |
| "oneoldstyle", | |
| "twooldstyle", | |
| "threeoldstyle", | |
| "fouroldstyle", | |
| "fiveoldstyle", | |
| "sixoldstyle", | |
| "sevenoldstyle", | |
| "eightoldstyle", | |
| "nineoldstyle", | |
| "commasuperior", | |
| "threequartersemdash", | |
| "periodsuperior", | |
| "questionsmall", | |
| "asuperior", | |
| "bsuperior", | |
| "centsuperior", | |
| "dsuperior", | |
| "esuperior", | |
| "isuperior", | |
| "lsuperior", | |
| "msuperior", | |
| "nsuperior", | |
| "osuperior", | |
| "rsuperior", | |
| "ssuperior", | |
| "tsuperior", | |
| "ff", | |
| "ffi", | |
| "ffl", | |
| "parenleftinferior", | |
| "parenrightinferior", | |
| "Circumflexsmall", | |
| "hyphensuperior", | |
| "Gravesmall", | |
| "Asmall", | |
| "Bsmall", | |
| "Csmall", | |
| "Dsmall", | |
| "Esmall", | |
| "Fsmall", | |
| "Gsmall", | |
| "Hsmall", | |
| "Ismall", | |
| "Jsmall", | |
| "Ksmall", | |
| "Lsmall", | |
| "Msmall", | |
| "Nsmall", | |
| "Osmall", | |
| "Psmall", | |
| "Qsmall", | |
| "Rsmall", | |
| "Ssmall", | |
| "Tsmall", | |
| "Usmall", | |
| "Vsmall", | |
| "Wsmall", | |
| "Xsmall", | |
| "Ysmall", | |
| "Zsmall", | |
| "colonmonetary", | |
| "onefitted", | |
| "rupiah", | |
| "Tildesmall", | |
| "exclamdownsmall", | |
| "centoldstyle", | |
| "Lslashsmall", | |
| "Scaronsmall", | |
| "Zcaronsmall", | |
| "Dieresissmall", | |
| "Brevesmall", | |
| "Caronsmall", | |
| "Dotaccentsmall", | |
| "Macronsmall", | |
| "figuredash", | |
| "hypheninferior", | |
| "Ogoneksmall", | |
| "Ringsmall", | |
| "Cedillasmall", | |
| "questiondownsmall", | |
| "oneeighth", | |
| "threeeighths", | |
| "fiveeighths", | |
| "seveneighths", | |
| "onethird", | |
| "twothirds", | |
| "zerosuperior", | |
| "foursuperior", | |
| "fivesuperior", | |
| "sixsuperior", | |
| "sevensuperior", | |
| "eightsuperior", | |
| "ninesuperior", | |
| "zeroinferior", | |
| "oneinferior", | |
| "twoinferior", | |
| "threeinferior", | |
| "fourinferior", | |
| "fiveinferior", | |
| "sixinferior", | |
| "seveninferior", | |
| "eightinferior", | |
| "nineinferior", | |
| "centinferior", | |
| "dollarinferior", | |
| "periodinferior", | |
| "commainferior", | |
| "Agravesmall", | |
| "Aacutesmall", | |
| "Acircumflexsmall", | |
| "Atildesmall", | |
| "Adieresissmall", | |
| "Aringsmall", | |
| "AEsmall", | |
| "Ccedillasmall", | |
| "Egravesmall", | |
| "Eacutesmall", | |
| "Ecircumflexsmall", | |
| "Edieresissmall", | |
| "Igravesmall", | |
| "Iacutesmall", | |
| "Icircumflexsmall", | |
| "Idieresissmall", | |
| "Ethsmall", | |
| "Ntildesmall", | |
| "Ogravesmall", | |
| "Oacutesmall", | |
| "Ocircumflexsmall", | |
| "Otildesmall", | |
| "Odieresissmall", | |
| "OEsmall", | |
| "Oslashsmall", | |
| "Ugravesmall", | |
| "Uacutesmall", | |
| "Ucircumflexsmall", | |
| "Udieresissmall", | |
| "Yacutesmall", | |
| "Thornsmall", | |
| "Ydieresissmall", | |
| "001.000", | |
| "001.001", | |
| "001.002", | |
| "001.003", | |
| "Black", | |
| "Bold", | |
| "Book", | |
| "Light", | |
| "Medium", | |
| "Regular", | |
| "Roman", | |
| "Semibold", | |
| ) | |
| class INDEX: | |
| def __init__(self, fp: BinaryIO) -> None: | |
| self.fp = fp | |
| self.offsets: List[int] = [] | |
| (count, offsize) = struct.unpack(">HB", self.fp.read(3)) | |
| for i in range(count + 1): | |
| self.offsets.append(nunpack(self.fp.read(offsize))) | |
| self.base = self.fp.tell() - 1 | |
| self.fp.seek(self.base + self.offsets[-1]) | |
| def __repr__(self) -> str: | |
| return "<INDEX: size=%d>" % len(self) | |
| def __len__(self) -> int: | |
| return len(self.offsets) - 1 | |
| def __getitem__(self, i: int) -> bytes: | |
| self.fp.seek(self.base + self.offsets[i]) | |
| return self.fp.read(self.offsets[i + 1] - self.offsets[i]) | |
| def __iter__(self) -> Iterator[bytes]: | |
| return iter(self[i] for i in range(len(self))) | |
| def __init__(self, name: str, fp: BinaryIO) -> None: | |
| self.name = name | |
| self.fp = fp | |
| # Header | |
| (_major, _minor, hdrsize, offsize) = struct.unpack("BBBB", self.fp.read(4)) | |
| self.fp.read(hdrsize - 4) | |
| # Name INDEX | |
| self.name_index = self.INDEX(self.fp) | |
| # Top DICT INDEX | |
| self.dict_index = self.INDEX(self.fp) | |
| # String INDEX | |
| self.string_index = self.INDEX(self.fp) | |
| # Global Subr INDEX | |
| self.subr_index = self.INDEX(self.fp) | |
| # Top DICT DATA | |
| self.top_dict = getdict(self.dict_index[0]) | |
| (charset_pos,) = self.top_dict.get(15, [0]) | |
| (encoding_pos,) = self.top_dict.get(16, [0]) | |
| (charstring_pos,) = self.top_dict.get(17, [0]) | |
| # CharStrings | |
| self.fp.seek(cast(int, charstring_pos)) | |
| self.charstring = self.INDEX(self.fp) | |
| self.nglyphs = len(self.charstring) | |
| # Encodings | |
| self.code2gid = {} | |
| self.gid2code = {} | |
| self.fp.seek(cast(int, encoding_pos)) | |
| format = self.fp.read(1) | |
| if format == b"\x00": | |
| # Format 0 | |
| (n,) = struct.unpack("B", self.fp.read(1)) | |
| for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))): | |
| self.code2gid[code] = gid | |
| self.gid2code[gid] = code | |
| elif format == b"\x01": | |
| # Format 1 | |
| (n,) = struct.unpack("B", self.fp.read(1)) | |
| code = 0 | |
| for i in range(n): | |
| (first, nleft) = struct.unpack("BB", self.fp.read(2)) | |
| for gid in range(first, first + nleft + 1): | |
| self.code2gid[code] = gid | |
| self.gid2code[gid] = code | |
| code += 1 | |
| else: | |
| raise PDFValueError("unsupported encoding format: %r" % format) | |
| # Charsets | |
| self.name2gid = {} | |
| self.gid2name = {} | |
| self.fp.seek(cast(int, charset_pos)) | |
| format = self.fp.read(1) | |
| if format == b"\x00": | |
| # Format 0 | |
| n = self.nglyphs - 1 | |
| for gid, sid in enumerate( | |
| cast( | |
| Tuple[int, ...], struct.unpack(">" + "H" * n, self.fp.read(2 * n)) | |
| ), | |
| ): | |
| gid += 1 | |
| sidname = self.getstr(sid) | |
| self.name2gid[sidname] = gid | |
| self.gid2name[gid] = sidname | |
| elif format == b"\x01": | |
| # Format 1 | |
| (n,) = struct.unpack("B", self.fp.read(1)) | |
| sid = 0 | |
| for i in range(n): | |
| (first, nleft) = struct.unpack("BB", self.fp.read(2)) | |
| for gid in range(first, first + nleft + 1): | |
| sidname = self.getstr(sid) | |
| self.name2gid[sidname] = gid | |
| self.gid2name[gid] = sidname | |
| sid += 1 | |
| elif format == b"\x02": | |
| # Format 2 | |
| assert False, str(("Unhandled", format)) | |
| else: | |
| raise PDFValueError("unsupported charset format: %r" % format) | |
| def getstr(self, sid: int) -> Union[str, bytes]: | |
| # This returns str for one of the STANDARD_STRINGS but bytes otherwise, | |
| # and appears to be a needless source of type complexity. | |
| if sid < len(self.STANDARD_STRINGS): | |
| return self.STANDARD_STRINGS[sid] | |
| return self.string_index[sid - len(self.STANDARD_STRINGS)] | |
| class TrueTypeFont: | |
| class CMapNotFound(PDFException): | |
| pass | |
| def __init__(self, name: str, fp: BinaryIO) -> None: | |
| self.name = name | |
| self.fp = fp | |
| self.tables: Dict[bytes, Tuple[int, int]] = {} | |
| self.fonttype = fp.read(4) | |
| try: | |
| (ntables, _1, _2, _3) = cast( | |
| Tuple[int, int, int, int], | |
| struct.unpack(">HHHH", fp.read(8)), | |
| ) | |
| for _ in range(ntables): | |
| (name_bytes, tsum, offset, length) = cast( | |
| Tuple[bytes, int, int, int], | |
| struct.unpack(">4sLLL", fp.read(16)), | |
| ) | |
| self.tables[name_bytes] = (offset, length) | |
| except struct.error: | |
| # Do not fail if there are not enough bytes to read. Even for | |
| # corrupted PDFs we would like to get as much information as | |
| # possible, so continue. | |
| pass | |
| def create_unicode_map(self) -> FileUnicodeMap: | |
| if b"cmap" not in self.tables: | |
| raise TrueTypeFont.CMapNotFound | |
| (base_offset, length) = self.tables[b"cmap"] | |
| fp = self.fp | |
| fp.seek(base_offset) | |
| (version, nsubtables) = cast(Tuple[int, int], struct.unpack(">HH", fp.read(4))) | |
| subtables: List[Tuple[int, int, int]] = [] | |
| for i in range(nsubtables): | |
| subtables.append( | |
| cast(Tuple[int, int, int], struct.unpack(">HHL", fp.read(8))), | |
| ) | |
| char2gid: Dict[int, int] = {} | |
| # Only supports subtable type 0, 2 and 4. | |
| for platform_id, encoding_id, st_offset in subtables: | |
| # Skip non-Unicode cmaps. | |
| # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap | |
| if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])): | |
| continue | |
| fp.seek(base_offset + st_offset) | |
| (fmttype, fmtlen, fmtlang) = cast( | |
| Tuple[int, int, int], | |
| struct.unpack(">HHH", fp.read(6)), | |
| ) | |
| if fmttype == 0: | |
| char2gid.update( | |
| enumerate( | |
| cast(Tuple[int, ...], struct.unpack(">256B", fp.read(256))), | |
| ), | |
| ) | |
| elif fmttype == 2: | |
| subheaderkeys = cast( | |
| Tuple[int, ...], | |
| struct.unpack(">256H", fp.read(512)), | |
| ) | |
| firstbytes = [0] * 8192 | |
| for i, k in enumerate(subheaderkeys): | |
| firstbytes[k // 8] = i | |
| nhdrs = max(subheaderkeys) // 8 + 1 | |
| hdrs: List[Tuple[int, int, int, int, int]] = [] | |
| for i in range(nhdrs): | |
| (firstcode, entcount, delta, offset) = cast( | |
| Tuple[int, int, int, int], | |
| struct.unpack(">HHhH", fp.read(8)), | |
| ) | |
| hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset)) | |
| for i, firstcode, entcount, delta, pos in hdrs: | |
| if not entcount: | |
| continue | |
| first = firstcode + (firstbytes[i] << 8) | |
| fp.seek(pos) | |
| for c in range(entcount): | |
| gid = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0] | |
| if gid: | |
| gid += delta | |
| char2gid[first + c] = gid | |
| elif fmttype == 4: | |
| (segcount, _1, _2, _3) = cast( | |
| Tuple[int, int, int, int], | |
| struct.unpack(">HHHH", fp.read(8)), | |
| ) | |
| segcount //= 2 | |
| ecs = cast( | |
| Tuple[int, ...], | |
| struct.unpack(">%dH" % segcount, fp.read(2 * segcount)), | |
| ) | |
| fp.read(2) | |
| scs = cast( | |
| Tuple[int, ...], | |
| struct.unpack(">%dH" % segcount, fp.read(2 * segcount)), | |
| ) | |
| idds = cast( | |
| Tuple[int, ...], | |
| struct.unpack(">%dh" % segcount, fp.read(2 * segcount)), | |
| ) | |
| pos = fp.tell() | |
| idrs = cast( | |
| Tuple[int, ...], | |
| struct.unpack(">%dH" % segcount, fp.read(2 * segcount)), | |
| ) | |
| for ec, sc, idd, idr in zip(ecs, scs, idds, idrs): | |
| if idr: | |
| fp.seek(pos + idr) | |
| for c in range(sc, ec + 1): | |
| b = cast(Tuple[int], struct.unpack(">H", fp.read(2)))[0] | |
| char2gid[c] = (b + idd) & 0xFFFF | |
| else: | |
| for c in range(sc, ec + 1): | |
| char2gid[c] = (c + idd) & 0xFFFF | |
| else: | |
| assert False, str(("Unhandled", fmttype)) | |
| if not char2gid: | |
| raise TrueTypeFont.CMapNotFound | |
| # create unicode map | |
| unicode_map = FileUnicodeMap() | |
| for char, gid in char2gid.items(): | |
| unicode_map.add_cid2unichr(gid, char) | |
| return unicode_map | |
| class PDFFontError(PDFException): | |
| pass | |
| class PDFUnicodeNotDefined(PDFFontError): | |
| pass | |
| LITERAL_STANDARD_ENCODING = LIT("StandardEncoding") | |
| LITERAL_TYPE1C = LIT("Type1C") | |
| # Font widths are maintained in a dict type that maps from *either* unicode | |
| # chars or integer character IDs. | |
| FontWidthDict = Union[Dict[int, float], Dict[str, float]] | |
| class PDFFont: | |
| def __init__( | |
| self, | |
| descriptor: Mapping[str, Any], | |
| widths: FontWidthDict, | |
| default_width: Optional[float] = None, | |
| ) -> None: | |
| self.descriptor = descriptor | |
| self.widths: FontWidthDict = resolve_all(widths) | |
| self.fontname = resolve1(descriptor.get("FontName", "unknown")) | |
| if isinstance(self.fontname, PSLiteral): | |
| self.fontname = literal_name(self.fontname) | |
| self.flags = int_value(descriptor.get("Flags", 0)) | |
| self.ascent = num_value(descriptor.get("Ascent", 0)) | |
| self.descent = num_value(descriptor.get("Descent", 0)) | |
| self.italic_angle = num_value(descriptor.get("ItalicAngle", 0)) | |
| if default_width is None: | |
| self.default_width = num_value(descriptor.get("MissingWidth", 0)) | |
| else: | |
| self.default_width = default_width | |
| self.default_width = resolve1(self.default_width) | |
| self.leading = num_value(descriptor.get("Leading", 0)) | |
| self.bbox = cast( | |
| Rect, | |
| list_value(resolve_all(descriptor.get("FontBBox", (0, 0, 0, 0)))), | |
| ) | |
| self.hscale = self.vscale = 0.001 | |
| # PDF RM 9.8.1 specifies /Descent should always be a negative number. | |
| # PScript5.dll seems to produce Descent with a positive number, but | |
| # text analysis will be wrong if this is taken as correct. So force | |
| # descent to negative. | |
| if self.descent > 0: | |
| self.descent = -self.descent | |
| def __repr__(self) -> str: | |
| return "<PDFFont>" | |
| def is_vertical(self) -> bool: | |
| return False | |
| def is_multibyte(self) -> bool: | |
| return False | |
| def decode(self, bytes: bytes) -> Iterable[int]: | |
| return bytearray(bytes) # map(ord, bytes) | |
| def get_ascent(self) -> float: | |
| """Ascent above the baseline, in text space units""" | |
| return self.ascent * self.vscale | |
| def get_descent(self) -> float: | |
| """Descent below the baseline, in text space units; always negative""" | |
| return self.descent * self.vscale | |
| def get_width(self) -> float: | |
| w = self.bbox[2] - self.bbox[0] | |
| if w == 0: | |
| w = -self.default_width | |
| return w * self.hscale | |
| def get_height(self) -> float: | |
| h = self.bbox[3] - self.bbox[1] | |
| if h == 0: | |
| h = self.ascent - self.descent | |
| return h * self.vscale | |
| def char_width(self, cid: int) -> float: | |
| # Because character widths may be mapping either IDs or strings, | |
| # we try to lookup the character ID first, then its str equivalent. | |
| try: | |
| return cast(Dict[int, float], self.widths)[cid] * self.hscale | |
| except KeyError: | |
| str_widths = cast(Dict[str, float], self.widths) | |
| try: | |
| return str_widths[self.to_unichr(cid)] * self.hscale | |
| except (KeyError, PDFUnicodeNotDefined): | |
| return self.default_width * self.hscale | |
| def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]: | |
| """Returns an integer for horizontal fonts, a tuple for vertical fonts.""" | |
| return 0 | |
| def string_width(self, s: bytes) -> float: | |
| return sum(self.char_width(cid) for cid in self.decode(s)) | |
| def to_unichr(self, cid: int) -> str: | |
| raise NotImplementedError | |
| class PDFSimpleFont(PDFFont): | |
| def __init__( | |
| self, | |
| descriptor: Mapping[str, Any], | |
| widths: FontWidthDict, | |
| spec: Mapping[str, Any], | |
| ) -> None: | |
| # Font encoding is specified either by a name of | |
| # built-in encoding or a dictionary that describes | |
| # the differences. | |
| if "Encoding" in spec: | |
| encoding = resolve1(spec["Encoding"]) | |
| else: | |
| encoding = LITERAL_STANDARD_ENCODING | |
| if isinstance(encoding, dict): | |
| name = literal_name(encoding.get("BaseEncoding", LITERAL_STANDARD_ENCODING)) | |
| diff = list_value(encoding.get("Differences", [])) | |
| self.cid2unicode = EncodingDB.get_encoding(name, diff) | |
| else: | |
| self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding)) | |
| self.unicode_map: Optional[UnicodeMap] = None | |
| if "ToUnicode" in spec: | |
| strm = stream_value(spec["ToUnicode"]) | |
| self.unicode_map = FileUnicodeMap() | |
| CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() | |
| PDFFont.__init__(self, descriptor, widths) | |
| def to_unichr(self, cid: int) -> str: | |
| if self.unicode_map: | |
| try: | |
| return self.unicode_map.get_unichr(cid) | |
| except KeyError: | |
| pass | |
| try: | |
| return self.cid2unicode[cid] | |
| except KeyError: | |
| raise PDFUnicodeNotDefined(None, cid) | |
| class PDFType1Font(PDFSimpleFont): | |
| def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None: | |
| try: | |
| self.basefont = literal_name(spec["BaseFont"]) | |
| except KeyError: | |
| if settings.STRICT: | |
| raise PDFFontError("BaseFont is missing") | |
| self.basefont = "unknown" | |
| widths: FontWidthDict | |
| try: | |
| (descriptor, int_widths) = FontMetricsDB.get_metrics(self.basefont) | |
| widths = cast(Dict[str, float], int_widths) # implicit int->float | |
| except KeyError: | |
| descriptor = dict_value(spec.get("FontDescriptor", {})) | |
| firstchar = int_value(spec.get("FirstChar", 0)) | |
| # lastchar = int_value(spec.get('LastChar', 255)) | |
| width_list = list_value(spec.get("Widths", [0] * 256)) | |
| widths = {i + firstchar: resolve1(w) for (i, w) in enumerate(width_list)} | |
| PDFSimpleFont.__init__(self, descriptor, widths, spec) | |
| if "Encoding" not in spec and "FontFile" in descriptor: | |
| # try to recover the missing encoding info from the font file. | |
| self.fontfile = stream_value(descriptor.get("FontFile")) | |
| length1 = int_value(self.fontfile["Length1"]) | |
| data = self.fontfile.get_data()[:length1] | |
| parser = Type1FontHeaderParser(BytesIO(data)) | |
| self.cid2unicode = parser.get_encoding() | |
| def __repr__(self) -> str: | |
| return "<PDFType1Font: basefont=%r>" % self.basefont | |
| class PDFTrueTypeFont(PDFType1Font): | |
| def __repr__(self) -> str: | |
| return "<PDFTrueTypeFont: basefont=%r>" % self.basefont | |
| class PDFType3Font(PDFSimpleFont): | |
| def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None: | |
| firstchar = int_value(spec.get("FirstChar", 0)) | |
| # lastchar = int_value(spec.get('LastChar', 0)) | |
| width_list = list_value(spec.get("Widths", [0] * 256)) | |
| widths = {i + firstchar: w for (i, w) in enumerate(width_list)} | |
| if "FontDescriptor" in spec: | |
| descriptor = dict_value(spec["FontDescriptor"]) | |
| else: | |
| descriptor = {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]} | |
| PDFSimpleFont.__init__(self, descriptor, widths, spec) | |
| self.matrix = cast(Matrix, tuple(list_value(spec.get("FontMatrix")))) | |
| (_, self.descent, _, self.ascent) = self.bbox | |
| (self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1)) | |
| def __repr__(self) -> str: | |
| return "<PDFType3Font>" | |
| class PDFCIDFont(PDFFont): | |
| default_disp: Union[float, Tuple[Optional[float], float]] | |
| def __init__( | |
| self, | |
| rsrcmgr: "PDFResourceManager", | |
| spec: Mapping[str, Any], | |
| strict: bool = settings.STRICT, | |
| ) -> None: | |
| try: | |
| self.basefont = literal_name(spec["BaseFont"]) | |
| except KeyError: | |
| if strict: | |
| raise PDFFontError("BaseFont is missing") | |
| self.basefont = "unknown" | |
| self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {})) | |
| cid_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown")).decode( | |
| "latin1", | |
| ) | |
| cid_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown")).decode( | |
| "latin1", | |
| ) | |
| self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}" | |
| self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict) | |
| try: | |
| descriptor = dict_value(spec["FontDescriptor"]) | |
| except KeyError: | |
| if strict: | |
| raise PDFFontError("FontDescriptor is missing") | |
| descriptor = {} | |
| ttf = None | |
| if "FontFile2" in descriptor: | |
| self.fontfile = stream_value(descriptor.get("FontFile2")) | |
| ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data())) | |
| self.unicode_map: Optional[UnicodeMap] = None | |
| if "ToUnicode" in spec: | |
| if isinstance(spec["ToUnicode"], PDFStream): | |
| strm = stream_value(spec["ToUnicode"]) | |
| self.unicode_map = FileUnicodeMap() | |
| CMapParser(self.unicode_map, BytesIO(strm.get_data())).run() | |
| else: | |
| cmap_name = literal_name(spec["ToUnicode"]) | |
| encoding = literal_name(spec["Encoding"]) | |
| if ( | |
| "Identity" in cid_ordering | |
| or "Identity" in cmap_name | |
| or "Identity" in encoding | |
| ): | |
| self.unicode_map = IdentityUnicodeMap() | |
| elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"): | |
| if ttf: | |
| try: | |
| self.unicode_map = ttf.create_unicode_map() | |
| except TrueTypeFont.CMapNotFound: | |
| pass | |
| else: | |
| try: | |
| self.unicode_map = CMapDB.get_unicode_map( | |
| self.cidcoding, | |
| self.cmap.is_vertical(), | |
| ) | |
| except CMapDB.CMapNotFound: | |
| pass | |
| self.vertical = self.cmap.is_vertical() | |
| if self.vertical: | |
| # writing mode: vertical | |
| widths2 = get_widths2(list_value(spec.get("W2", []))) | |
| self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()} | |
| (vy, w) = resolve1(spec.get("DW2", [880, -1000])) | |
| self.default_disp = (None, vy) | |
| widths = {cid: w for (cid, (w, _)) in widths2.items()} | |
| default_width = w | |
| else: | |
| # writing mode: horizontal | |
| self.disps = {} | |
| self.default_disp = 0 | |
| widths = get_widths(list_value(spec.get("W", []))) | |
| default_width = spec.get("DW", 1000) | |
| PDFFont.__init__(self, descriptor, widths, default_width=default_width) | |
| def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase: | |
| """Get cmap from font specification | |
| For certain PDFs, Encoding Type isn't mentioned as an attribute of | |
| Encoding but as an attribute of CMapName, where CMapName is an | |
| attribute of spec['Encoding']. | |
| The horizontal/vertical modes are mentioned with different name | |
| such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'. | |
| """ | |
| cmap_name = self._get_cmap_name(spec, strict) | |
| try: | |
| return CMapDB.get_cmap(cmap_name) | |
| except CMapDB.CMapNotFound as e: | |
| if strict: | |
| raise PDFFontError(e) | |
| return CMap() | |
| def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str: | |
| """Get cmap name from font specification""" | |
| cmap_name = "unknown" # default value | |
| try: | |
| spec_encoding = spec["Encoding"] | |
| if hasattr(spec_encoding, "name"): | |
| cmap_name = literal_name(spec["Encoding"]) | |
| else: | |
| cmap_name = literal_name(spec_encoding["CMapName"]) | |
| except KeyError: | |
| if strict: | |
| raise PDFFontError("Encoding is unspecified") | |
| if type(cmap_name) is PDFStream: # type: ignore[comparison-overlap] | |
| cmap_name_stream: PDFStream = cast(PDFStream, cmap_name) | |
| if "CMapName" in cmap_name_stream: | |
| cmap_name = cmap_name_stream.get("CMapName").name | |
| elif strict: | |
| raise PDFFontError("CMapName unspecified for encoding") | |
| return IDENTITY_ENCODER.get(cmap_name, cmap_name) | |
| def __repr__(self) -> str: | |
| return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>" | |
| def is_vertical(self) -> bool: | |
| return self.vertical | |
| def is_multibyte(self) -> bool: | |
| return True | |
| def decode(self, bytes: bytes) -> Iterable[int]: | |
| return self.cmap.decode(bytes) | |
| def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]: | |
| """Returns an integer for horizontal fonts, a tuple for vertical fonts.""" | |
| return self.disps.get(cid, self.default_disp) | |
| def to_unichr(self, cid: int) -> str: | |
| try: | |
| if not self.unicode_map: | |
| raise PDFKeyError(cid) | |
| return self.unicode_map.get_unichr(cid) | |
| except KeyError: | |
| raise PDFUnicodeNotDefined(self.cidcoding, cid) | |