Spaces:

sanbo1200
/

PDFTranslate

Running

sanbo

update sth. at 2024-11-26 16:15:47

9b0f4a0 12 months ago

9.05 kB

	#!/usr/bin/env python3
	"""A command line tool for extracting text and images from PDF and
	output it to plain text, html, xml or tags.
	"""

	from __future__ import annotations

	import argparse
	import logging
	import os
	import sys
	from pathlib import Path
	from typing import TYPE_CHECKING, Any, Container, Iterable, List, Optional

	import pymupdf
	import requests

	from pdf2zh import __version__
	from pdf2zh.pdfexceptions import PDFValueError

	if TYPE_CHECKING:
	from pdf2zh.layout import LAParams
	from pdf2zh.utils import AnyIO

	# (filename suffix, output type) pairs. extract_text() scans these to replace
	# the default "text" output type when `outfile` ends with one of the suffixes.
	OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag"))


	def setup_log() -> None:
	    """Initialise logging and quieten the ``doclayout_yolo`` logger if present.

	    ``logging.basicConfig()`` is always called. When ``doclayout_yolo`` can be
	    imported, its ``utils.LOGGER`` is set to ``WARNING``; when the import
	    fails the function simply returns.
	    """
	    logging.basicConfig()

	    try:
	        import doclayout_yolo as layout_pkg

	        layout_logger = layout_pkg.utils.LOGGER
	        layout_logger.setLevel(logging.WARNING)
	    except ImportError:
	        # The package is treated as optional: nothing more to configure.
	        return


	def check_files(files: List[str]) -> List[str]:
	    """Return the entries of *files* that are local paths missing on disk.

	    Entries starting with ``http://`` or ``https://`` are treated as online
	    files and are never reported as missing. Input order is preserved.
	    """
	    remote_prefixes = ("http://", "https://")
	    return [
	        path
	        for path in files
	        if not path.startswith(remote_prefixes) and not os.path.exists(path)
	    ]


	def float_or_disabled(x: str) -> Optional[float]:
	    """Convert an argparse string to a float, or ``None`` for "disabled".

	    The comparison with ``"disabled"`` ignores case and surrounding
	    whitespace. Any other value is passed to ``float()`` unchanged.

	    Raises:
	        argparse.ArgumentTypeError: if *x* is neither "disabled" nor a value
	            accepted by ``float()``.
	    """
	    keyword = x.lower().strip()
	    if keyword != "disabled":
	        try:
	            number = float(x)
	        except ValueError:
	            raise argparse.ArgumentTypeError(f"invalid float value: {x}")
	        return number
	    return None


	def extract_text(
	    files: Iterable[str] = [],
	    outfile: str = "-",
	    laparams: Optional[LAParams] = None,
	    output_type: str = "text",
	    codec: str = "utf-8",
	    strip_control: bool = False,
	    maxpages: int = 0,
	    pages: Optional[Container[int]] = None,
	    password: str = "",
	    scale: float = 1.0,
	    rotation: int = 0,
	    layoutmode: str = "normal",
	    output_dir: Optional[str] = None,
	    debug: bool = False,
	    disable_caching: bool = False,
	    vfont: str = "",
	    vchar: str = "",
	    thread: int = 0,
	    lang_in: str = "",
	    lang_out: str = "",
	    service: str = "",
	    callback: object = None,
	    output: str = "",
	    **kwargs: Any,
	) -> AnyIO:
	    """Process every PDF in *files* and write ``-zh`` and ``-dual`` PDFs.

	    For each entry in *files*:

	    1. If it starts with ``http://`` or ``https://`` it is downloaded to
	       ``./pdf2zh_files/tmp_download.pdf`` and that local copy is used.
	       Every download reuses this one path, so outputs for URL inputs are
	       all named ``tmp_download-*.pdf``.
	    2. The file is opened with PyMuPDF, the ``china-ss`` and ``tiro`` fonts
	       are inserted on every page, and a reference to each font is added to
	       every xref ``Font`` dictionary that does not have one yet.
	    3. The document is saved as ``<name>-en.pdf`` under *output* and passed
	       to ``pdf2zh.high_level.extract_text_to_fp``, which returns a mapping
	       of object id to replacement stream text.
	    4. Those streams are written back into the document, which is saved as
	       ``<name>-zh.pdf``. A second document alternating the pages of
	       ``<name>-en.pdf`` with the patched pages is saved as
	       ``<name>-dual.pdf``. The temporary ``<name>-en.pdf`` is then removed.

	    Args:
	        files: Local paths and/or http(s) URLs of the PDFs to process.
	        outfile: Only inspected here to derive *output_type* from its suffix.
	        output_type: Replaced by the matching ``OUTPUT_TYPES`` entry when it
	            is ``"text"`` and *outfile* is not ``"-"``.
	        output: Directory in which the result PDFs are written.
	        Other parameters and ``**kwargs`` are not used directly in this
	        function; they reach ``extract_text_to_fp`` through ``**locals()``.

	    Returns:
	        ``None``. The ``AnyIO`` annotation is kept for interface
	        compatibility only.

	    Raises:
	        PDFValueError: if *files* is empty or a download fails.
	    """
	    import pdf2zh.high_level
	    from pdf2zh.doclayout import DocLayoutModel

	    if not files:
	        raise PDFValueError("Must provide files to work upon!")

	    if output_type == "text" and outfile != "-":
	        for override, alttype in OUTPUT_TYPES:
	            if outfile.endswith(override):
	                output_type = alttype

	    # Not used directly below; both names are forwarded through **locals().
	    outfp: AnyIO = sys.stdout
	    model = DocLayoutModel.load_available()

	    for file in files:
	        if file.startswith("http://") or file.startswith("https://"):
	            print("Online files detected, downloading...")
	            try:
	                r = requests.get(file, allow_redirects=True)
	                if r.status_code == 200:
	                    if not os.path.exists("./pdf2zh_files"):
	                        print("Making a temporary dir for downloading PDF files...")
	                        # The previous os.mkdir(os.path.dirname(...)) resolved
	                        # to os.mkdir("."), which always raised and made every
	                        # first download fail. Create the real directory.
	                        os.makedirs("./pdf2zh_files", exist_ok=True)
	                    with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
	                        print(f"Writing the file: {file}...")
	                        f.write(r.content)
	                    file = "./pdf2zh_files/tmp_download.pdf"
	                else:
	                    r.raise_for_status()
	            except Exception as e:
	                raise PDFValueError(
	                    f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
	                ) from e
	        filename = os.path.splitext(os.path.basename(file))[0]

	        doc_en = pymupdf.open(file)
	        page_count = doc_en.page_count
	        font_list = ["china-ss", "tiro"]
	        font_id = {}
	        for page in doc_en:
	            for font in font_list:
	                font_id[font] = page.insert_font(font)
	        xreflen = doc_en.xref_length()
	        for xref in range(1, xreflen):
	            # The font resources may hang off an XObject's "Resources/" key
	            # or sit directly on the object.
	            for label in ["Resources/", ""]:
	                try:
	                    font_res = doc_en.xref_get_key(xref, f"{label}Font")
	                    if font_res[0] == "dict":
	                        for font in font_list:
	                            font_exist = doc_en.xref_get_key(
	                                xref, f"{label}Font/{font}"
	                            )
	                            if font_exist[0] == "null":
	                                doc_en.xref_set_key(
	                                    xref, f"{label}Font/{font}", f"{font_id[font]} 0 R"
	                                )
	                except Exception:
	                    # Best effort: reading or writing an xref key may fail
	                    # for some objects; skip those.
	                    pass
	        doc_en.save(Path(output) / f"{filename}-en.pdf")

	        # NOTE: **locals() forwards every local name defined so far (including
	        # names left over from earlier loop iterations) as keyword arguments.
	        # Renaming, adding or removing locals in this function changes what
	        # the callee receives.
	        with open(Path(output) / f"{filename}-en.pdf", "rb") as fp:
	            obj_patch: dict = pdf2zh.high_level.extract_text_to_fp(fp, **locals())

	        for obj_id, ops_new in obj_patch.items():
	            doc_en.update_stream(obj_id, ops_new.encode())

	        # doc_zh is the same object as doc_en, now carrying the patched streams.
	        doc_zh = doc_en
	        doc_dual = pymupdf.open(Path(output) / f"{filename}-en.pdf")
	        doc_dual.insert_file(doc_zh)
	        # Move each appended page so it directly follows its counterpart from
	        # the first half, giving an alternating page order.
	        for id in range(page_count):
	            doc_dual.move_page(page_count + id, id * 2 + 1)
	        doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
	        doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
	        doc_zh.close()
	        doc_dual.close()
	        os.remove(Path(output) / f"{filename}-en.pdf")

	    return


	def create_parser() -> argparse.ArgumentParser:
	    """Build the command-line parser for the pdf2zh script.

	    The parser takes zero or more positional ``files`` plus ``--version`` and
	    ``--debug`` at top level. All remaining options live in a "Parser"
	    argument group.

	    Returns:
	        The configured ``argparse.ArgumentParser``; the module docstring is
	        used as its description.
	    """
	    parser = argparse.ArgumentParser(description=__doc__, add_help=True)
	    parser.add_argument(
	        "files",
	        type=str,
	        default=None,
	        nargs="*",
	        help="One or more paths to PDF files.",
	    )
	    parser.add_argument(
	        "--version",
	        "-v",
	        action="version",
	        version=f"pdf2zh v{__version__}",
	    )
	    parser.add_argument(
	        "--debug",
	        "-d",
	        default=False,
	        action="store_true",
	        help="Use debug logging level.",
	    )
	    parse_params = parser.add_argument_group(
	        "Parser",
	        description="Used during PDF parsing",
	    )
	    parse_params.add_argument(
	        "--pages",
	        "-p",
	        type=str,
	        help="The list of page numbers to parse.",
	    )
	    parse_params.add_argument(
	        "--password",
	        "-P",
	        type=str,
	        default="",
	        help="The password to use for decrypting PDF file.",
	    )
	    # Help text previously read "regex to math ..."; corrected to "match".
	    parse_params.add_argument(
	        "--vfont",
	        "-f",
	        type=str,
	        default="",
	        help="The regex to match font name of formula.",
	    )
	    parse_params.add_argument(
	        "--vchar",
	        "-c",
	        type=str,
	        default="",
	        help="The regex to match character of formula.",
	    )
	    parse_params.add_argument(
	        "--lang-in",
	        "-li",
	        type=str,
	        default="auto",
	        help="The code of source language.",
	    )
	    parse_params.add_argument(
	        "--lang-out",
	        "-lo",
	        type=str,
	        default="auto",
	        help="The code of target language.",
	    )
	    parse_params.add_argument(
	        "--service",
	        "-s",
	        type=str,
	        default="google",
	        help="The service to use for translation.",
	    )
	    parse_params.add_argument(
	        "--output",
	        "-o",
	        type=str,
	        default="",
	        help="Output directory for files.",
	    )
	    parse_params.add_argument(
	        "--thread",
	        "-t",
	        type=int,
	        default=4,
	        help="The number of threads to execute translation.",
	    )
	    parse_params.add_argument(
	        "--interactive",
	        "-i",
	        action="store_true",
	        help="Interact with GUI.",
	    )
	    parse_params.add_argument(
	        "--share",
	        action="store_true",
	        help="Enable Gradio Share",
	    )

	    return parser


	def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
	    """Parse *args* with ``create_parser()`` and expand the ``--pages`` value.

	    A non-empty ``--pages`` string is split on commas. Each item is either a
	    single 1-based page number ``N`` or an inclusive range ``A-B``. The
	    attribute is replaced by a list of the corresponding 0-based indices.

	    Returns:
	        The parsed namespace, with ``pages`` rewritten as described when it
	        was supplied.
	    """
	    namespace = create_parser().parse_args(args=args)

	    page_spec = namespace.pages
	    if page_spec:
	        zero_based: List[int] = []
	        for token in page_spec.split(","):
	            if "-" not in token:
	                zero_based.append(int(token) - 1)
	                continue
	            first, last = token.split("-")
	            zero_based.extend(range(int(first) - 1, int(last)))
	        namespace.pages = zero_based

	    return namespace


	def main(args: Optional[List[str]] = None) -> int:
	    """Command-line entry point.

	    Parses *args*, then:

	    * returns ``-1`` after listing on stderr any local input files that do
	      not exist;
	    * otherwise, with ``--interactive``, starts the GUI via
	      ``pdf2zh.gui.setup_gui`` and returns ``0``;
	    * otherwise configures logging, runs ``extract_text`` with the parsed
	      options as keyword arguments, and returns ``0``.
	    """
	    options = parse_args(args)

	    absent = check_files(options.files)
	    if absent:
	        print("The following files do not exist:", file=sys.stderr)
	        for path in absent:
	            print(f" {path}", file=sys.stderr)
	        return -1

	    if not options.interactive:
	        setup_log()
	        extract_text(**vars(options))
	        return 0

	    # Imported here so the GUI module is only loaded when requested.
	    from pdf2zh.gui import setup_gui

	    setup_gui(options.share)
	    return 0


	# Run the CLI only when executed as a script; main()'s return value becomes
	# the process exit status. The duplicated sys.exit(main()) line was removed.
	if __name__ == "__main__":
	    sys.exit(main())