Spaces:
Running
Running
| import logging | |
| import re | |
| from typing import Dict, Iterable, Optional, cast | |
| from pdf2zh.glyphlist import glyphname2unicode | |
| from pdf2zh.latin_enc import ENCODING | |
| from pdf2zh.pdfexceptions import PDFKeyError | |
| from pdf2zh.psparser import PSLiteral | |
# Pattern for a run of one or more hexadecimal digits (upper or lower case).
# It carries no anchors of its own, so how much of a string has to match
# depends on the matching method the caller uses.
| HEXADECIMAL = re.compile(r"[0-9a-fA-F]+") | |
# Module-level logger, named after this module.
| log = logging.getLogger(__name__) | |
def name2unicode(name: str) -> str:
    """Convert an Adobe glyph name to the Unicode string it denotes.

    In contrast to the specification, this raises a KeyError instead of
    returning an empty string when the name is unknown, so the caller must
    decide explicitly what to do when there is no match.

    Resolution order (after dropping everything from the first "." onwards):

    1. A name containing "_" is split and each component is converted
       recursively; the results are concatenated.
    2. A name present in ``glyphname2unicode`` maps to that entry.
    3. ``uni`` followed by groups of exactly four hex digits maps to one
       character per group.
    4. ``u`` followed by four to six hex digits maps to a single character.

    Reference:
    https://github.com/adobe-type-tools/agl-specification#2-the-mapping

    :param name: the glyph name to convert
    :returns: the Unicode string for ``name``
    :raises PDFKeyError: if ``name`` is not a ``str``, does not match any of
        the forms above, or encodes a value that is not a valid Unicode
        scalar value (surrogates are rejected by
        ``raise_key_error_for_invalid_unicode``)
    """
    if not isinstance(name, str):
        raise PDFKeyError(
            'Could not convert unicode name "%s" to character because '
            "it should be of type str but is of type %s" % (name, type(name)),
        )

    # Only the part before the first period takes part in the mapping.
    name = name.split(".")[0]
    components = name.split("_")

    if len(components) > 1:
        return "".join(map(name2unicode, components))

    if name in glyphname2unicode:
        return glyphname2unicode[name]

    if name.startswith("uni"):
        # Slice the prefix off. str.strip("uni") would remove any of the
        # characters u/n/i from BOTH ends and so accept malformed names.
        hex_digits = name[len("uni"):]
        # fullmatch: every remaining character must be a hex digit, otherwise
        # int() below would raise ValueError rather than the documented
        # KeyError.
        if HEXADECIMAL.fullmatch(hex_digits) and len(hex_digits) % 4 == 0:
            code_points = [
                int(hex_digits[start:start + 4], base=16)
                for start in range(0, len(hex_digits), 4)
            ]
            for code_point in code_points:
                raise_key_error_for_invalid_unicode(code_point)
            return "".join(map(chr, code_points))
    elif name.startswith("u"):
        hex_digits = name[len("u"):]
        if HEXADECIMAL.fullmatch(hex_digits) and 4 <= len(hex_digits) <= 6:
            code_point = int(hex_digits, base=16)
            raise_key_error_for_invalid_unicode(code_point)
            # Six hex digits can exceed the last code point chr() accepts;
            # fall through to the KeyError below instead of a ValueError.
            if code_point <= 0x10FFFF:
                return chr(code_point)

    raise PDFKeyError(
        'Could not convert unicode name "%s" to character because '
        "it does not match specification" % name,
    )
def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
    """Reject values that lie in the UTF-16 surrogate range.

    Values D800 through DFFF are reserved for surrogate pairs in UTF-16 and
    are therefore not accepted as Unicode values here.

    :param unicode_digit: the numeric value to check
    :raises KeyError: (as PDFKeyError) if the value is within D800..DFFF
    """
    # Guard clause: anything outside the open interval (0xD7FF, 0xE000) is fine.
    if not (0xD7FF < unicode_digit < 0xE000):
        return

    raise PDFKeyError(
        "Unicode digit %d is invalid because "
        "it is in the range D800 through DFFF" % unicode_digit,
    )
class EncodingDB:
    """Character-code to Unicode tables for the predefined encodings.

    The four tables are filled once, when the class body executes, from the
    rows of ``ENCODING``. ``encodings`` maps an encoding name to its table.
    """

    std2unicode: Dict[int, str] = {}
    mac2unicode: Dict[int, str] = {}
    win2unicode: Dict[int, str] = {}
    pdf2unicode: Dict[int, str] = {}

    # Each row is unpacked as (glyph name, std, mac, win, pdf). A falsy code
    # means the glyph gets no entry in that particular table.
    for name, std, mac, win, pdf in ENCODING:
        c = name2unicode(name)
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    encodings = {
        "StandardEncoding": std2unicode,
        "MacRomanEncoding": mac2unicode,
        "WinAnsiEncoding": win2unicode,
        "PDFDocEncoding": pdf2unicode,
    }

    # The method takes ``cls`` and reads class-level tables, so it has to be
    # a classmethod to be callable as ``EncodingDB.get_encoding(name, diff)``.
    @classmethod
    def get_encoding(
        cls,
        name: str,
        diff: Optional[Iterable[object]] = None,
    ) -> Dict[int, str]:
        """Return the code-to-Unicode table for ``name``, patched by ``diff``.

        :param name: key into ``encodings``; an unknown name falls back to
            ``std2unicode``
        :param diff: optional sequence of overrides. An ``int`` entry sets
            the current character code; each ``PSLiteral`` entry assigns the
            glyph it names to the current code and then advances the code by
            one. Entries of any other type are ignored.
        :returns: the resulting table. When ``diff`` is empty or ``None``
            this is the shared class-level dict itself, not a copy.
        """
        cid2unicode = cls.encodings.get(name, cls.std2unicode)
        if diff:
            # Copy before patching so the shared class-level table is untouched.
            cid2unicode = cid2unicode.copy()
            cid = 0
            for x in diff:
                if isinstance(x, int):
                    cid = x
                elif isinstance(x, PSLiteral):
                    try:
                        cid2unicode[cid] = name2unicode(cast(str, x.name))
                    except (KeyError, ValueError) as exc:
                        # Best effort: an unconvertible glyph name keeps the
                        # base mapping for this code. Record it at debug
                        # level rather than dropping it silently.
                        log.debug(
                            "Ignoring glyph name %r for code %d: %s",
                            x.name,
                            cid,
                            exc,
                        )
                    # The code advances even when the conversion failed.
                    cid += 1
        return cid2unicode