Spaces:

sanbo1200
/

PDFTranslate

Running

sanbo

update sth. at 2024-11-26 16:15:47

9b0f4a0 12 months ago

4.01 kB

	import logging
	import re
	from typing import Dict, Iterable, Optional, cast

	from pdf2zh.glyphlist import glyphname2unicode
	from pdf2zh.latin_enc import ENCODING
	from pdf2zh.pdfexceptions import PDFKeyError
	from pdf2zh.psparser import PSLiteral

	# Matches a run of one or more hexadecimal digits, upper or lower case.
	# The pattern carries no anchors, so whether the whole string must be
	# hexadecimal depends on the method the caller uses (match vs. fullmatch).
	HEXADECIMAL = re.compile(r"[0-9a-fA-F]+")

	# Module-level logger, named after this module.
	log = logging.getLogger(__name__)


	def name2unicode(name: str) -> str:
	    """Convert an Adobe glyph name to the Unicode text it denotes.

	    In contrast to the specification, this raises an error instead of
	    returning an empty string when the name is unknown. This way the caller
	    must explicitly define what to do when there is not a match.

	    Resolution order, applied after dropping everything from the first
	    period onwards:

	    1. A name containing underscores is split on them and every component
	       is converted recursively; the results are concatenated.
	    2. A name present in ``glyphname2unicode`` maps to its table entry.
	    3. ``uni`` followed by one or more groups of four hexadecimal digits
	       maps to one character per group.
	    4. ``u`` followed by four to six hexadecimal digits maps to that single
	       code point.

	    Hexadecimal digits are accepted in either case, as allowed by the
	    module's ``HEXADECIMAL`` pattern.

	    Reference:
	    https://github.com/adobe-type-tools/agl-specification#2-the-mapping

	    :param name: the glyph name to convert
	    :returns: the Unicode string for the glyph name
	    :raises PDFKeyError: if ``name`` is not a ``str``, does not follow any of
	        the forms above, or denotes a surrogate (D800 through DFFF) or a
	        value above 10FFFF
	    """
	    if not isinstance(name, str):
	        raise PDFKeyError(
	            'Could not convert unicode name "%s" to character because '
	            "it should be of type str but is of type %s" % (name, type(name)),
	        )

	    # Only the part before the first period takes part in the mapping.
	    name = name.split(".")[0]
	    components = name.split("_")

	    if len(components) > 1:
	        return "".join(map(name2unicode, components))

	    if name in glyphname2unicode:
	        return glyphname2unicode[name]

	    if name.startswith("uni"):
	        # Slice the prefix off. str.strip("uni") would remove any of the
	        # characters u/n/i from BOTH ends and so accept names such as
	        # "uni0041u".
	        hex_digits = name[len("uni") :]

	        # fullmatch: every remaining character must be a hex digit, otherwise
	        # int() below would fail with ValueError instead of a key error.
	        if HEXADECIMAL.fullmatch(hex_digits) and len(hex_digits) % 4 == 0:
	            code_units = [
	                int(hex_digits[i : i + 4], base=16)
	                for i in range(0, len(hex_digits), 4)
	            ]
	            for code_unit in code_units:
	                raise_key_error_for_invalid_unicode(code_unit)
	            return "".join(map(chr, code_units))

	    elif name.startswith("u"):
	        hex_digits = name[len("u") :]

	        if HEXADECIMAL.fullmatch(hex_digits) and 4 <= len(hex_digits) <= 6:
	            code_point = int(hex_digits, base=16)
	            # Six hex digits can exceed the highest code point chr() accepts;
	            # treat that as "does not match" rather than leaking ValueError.
	            if code_point <= 0x10FFFF:
	                raise_key_error_for_invalid_unicode(code_point)
	                return chr(code_point)

	    raise PDFKeyError(
	        'Could not convert unicode name "%s" to character because '
	        "it does not match specification" % name,
	    )


	def raise_key_error_for_invalid_unicode(unicode_digit: int) -> None:
	    """Reject values that fall inside the UTF-16 surrogate block.

	    Unicode values D800 through DFFF are reserved for surrogate pairs in
	    UTF-16 and therefore must not be used as characters of their own.

	    :param unicode_digit: the numeric value to check
	    :raises PDFKeyError: if the value lies in D800 through DFFF
	    """
	    # 0xD7FF is the last value below the surrogate block, 0xE000 the first
	    # value above it; anything at or beyond those bounds is acceptable.
	    outside_surrogates = unicode_digit <= 0xD7FF or unicode_digit >= 0xE000
	    if outside_surrogates:
	        return

	    raise PDFKeyError(
	        "Unicode digit %d is invalid because "
	        "it is in the range D800 through DFFF" % unicode_digit,
	    )


	class EncodingDB:
	    """Character-code to Unicode tables for the predefined encodings.

	    Four class-level dictionaries are filled while the class body executes,
	    one per column of ``ENCODING``. ``encodings`` exposes them under the names
	    "StandardEncoding", "MacRomanEncoding", "WinAnsiEncoding" and
	    "PDFDocEncoding".
	    """

	    std2unicode: Dict[int, str] = {}
	    mac2unicode: Dict[int, str] = {}
	    win2unicode: Dict[int, str] = {}
	    pdf2unicode: Dict[int, str] = {}

	    # Each ENCODING row is (glyph name, std code, mac code, win code, pdf code).
	    # A falsy code means the glyph has no slot in that encoding and is skipped.
	    # Note: the loop variables remain behind as class attributes.
	    for name, std, mac, win, pdf in ENCODING:
	        c = name2unicode(name)
	        if std:
	            std2unicode[std] = c
	        if mac:
	            mac2unicode[mac] = c
	        if win:
	            win2unicode[win] = c
	        if pdf:
	            pdf2unicode[pdf] = c

	    encodings = {
	        "StandardEncoding": std2unicode,
	        "MacRomanEncoding": mac2unicode,
	        "WinAnsiEncoding": win2unicode,
	        "PDFDocEncoding": pdf2unicode,
	    }

	    @classmethod
	    def get_encoding(
	        cls,
	        name: str,
	        diff: Optional[Iterable[object]] = None,
	    ) -> Dict[int, str]:
	        """Return the code-to-Unicode mapping for an encoding name.

	        :param name: key into ``encodings``; an unknown name falls back to
	            the ``std2unicode`` table
	        :param diff: optional sequence of overrides. An ``int`` item sets the
	            current character code; each ``PSLiteral`` item assigns the
	            Unicode text of its glyph name to the current code and then
	            advances the code by one. Items of any other type are ignored.
	        :returns: when ``diff`` is empty or ``None``, the shared class-level
	            table itself (callers should not mutate it); otherwise a copy
	            with the overrides applied
	        """
	        cid2unicode = cls.encodings.get(name, cls.std2unicode)
	        if diff:
	            # Work on a copy so the shared class-level table stays untouched.
	            cid2unicode = cid2unicode.copy()
	            cid = 0
	            for x in diff:
	                if isinstance(x, int):
	                    cid = x
	                elif isinstance(x, PSLiteral):
	                    try:
	                        cid2unicode[cid] = name2unicode(cast(str, x.name))
	                    except (KeyError, ValueError) as e:
	                        # Best effort: an unconvertible glyph name leaves the
	                        # base mapping for this code in place, but is recorded
	                        # instead of being silently discarded.
	                        log.debug(
	                            "Skipping glyph name %r for code %d: %s",
	                            x.name,
	                            cid,
	                            e,
	                        )
	                    # The code advances even when the conversion failed.
	                    cid += 1
	        return cid2unicode