Spaces:

lsottani
/

RAG_file_preprocessing

Sleeping

App Files Files Community

RAG_file_preprocessing / venv /lib /python3.9 /site-packages /chardet /cli /chardetect.py

lsottani

Upload folder using huggingface_hub

d9f69e5 verified about 2 months ago

raw

history blame contribute delete

3.24 kB

	"""
	Script which takes one or more file paths and reports on their detected
	encodings

	Example::

	% chardetect somefile someotherfile
	somefile: windows-1252 with confidence 0.5
	someotherfile: ascii with confidence 1.0

	If no paths are provided, it takes its input from stdin.

	"""


	import argparse
	import sys
	from typing import Iterable, List, Optional

	from .. import __version__
	from ..universaldetector import UniversalDetector


	def description_of(
	    lines: Iterable[bytes],
	    name: str = "stdin",
	    minimal: bool = False,
	    should_rename_legacy: bool = False,
	) -> Optional[str]:
	    """
	    Describe the most probable encoding of a file or sequence of byte lines.

	    Each line is fed to a ``UniversalDetector`` until the input is exhausted
	    or the detector reports that it is done, then the detector's result is
	    formatted for display.

	    :param lines: The lines to get the encoding of.
	    :type lines: Iterable of bytes
	    :param name: Label placed at the start of the report (for example the
	                 file name).
	    :type name: str
	    :param minimal: If true, return only the detected encoding value rather
	                    than a full ``name: encoding with confidence`` sentence.
	    :type minimal: ``bool``
	    :param should_rename_legacy: Passed through to ``UniversalDetector``;
	                                 requests that legacy encodings be renamed
	                                 to their more modern equivalents.
	    :type should_rename_legacy: ``bool``
	    :returns: With ``minimal`` set, the ``"encoding"`` entry of the
	              detector result as-is (the annotation allows ``None``).
	              Otherwise ``"<name>: <encoding> with confidence <confidence>"``,
	              or ``"<name>: no result"`` when the encoding entry is falsy.
	    """
	    detector = UniversalDetector(should_rename_legacy=should_rename_legacy)
	    for chunk in lines:
	        detector.feed(bytearray(chunk))
	        # Stop early once the detector has made up its mind, so we avoid
	        # reading the rest of the input - particularly useful after a BOM.
	        if detector.done:
	            break
	    detector.close()

	    outcome = detector.result
	    encoding = outcome["encoding"]
	    if minimal:
	        return encoding
	    if not encoding:
	        return f"{name}: no result"
	    return f'{name}: {encoding} with confidence {outcome["confidence"]}'


	def main(argv: Optional[List[str]] = None) -> None:
	    """
	    Handles command line arguments and gets things started.

	    Parses the arguments, prints one encoding report per input to standard
	    output, and closes every input file that argparse opened on our behalf.

	    :param argv: List of arguments, as if specified on the command-line.
	                 If None, ``sys.argv[1:]`` is used instead.
	    :type argv: list of str
	    """
	    # Get command line arguments
	    parser = argparse.ArgumentParser(
	        description=(
	            "Takes one or more file paths and reports their detected encodings"
	        )
	    )
	    parser.add_argument(
	        "input",
	        help="File whose encoding we would like to determine. (default: stdin)",
	        type=argparse.FileType("rb"),
	        nargs="*",
	        default=[sys.stdin.buffer],
	    )
	    parser.add_argument(
	        "--minimal",
	        help="Print only the encoding to standard output",
	        action="store_true",
	    )
	    parser.add_argument(
	        "-l",
	        "--legacy",
	        help="Rename legacy encodings to more modern ones.",
	        action="store_true",
	    )
	    parser.add_argument(
	        "--version", action="version", version=f"%(prog)s {__version__}"
	    )
	    args = parser.parse_args(argv)

	    try:
	        for f in args.input:
	            if f.isatty():
	                print(
	                    "You are running chardetect interactively. Press "
	                    "CTRL-D twice at the start of a blank line to signal the "
	                    "end of your input. If you want help, run chardetect "
	                    "--help\n",
	                    file=sys.stderr,
	                )
	            print(
	                description_of(
	                    f, f.name, minimal=args.minimal, should_rename_legacy=args.legacy
	                )
	            )
	    finally:
	        # argparse.FileType opens every input during parsing and never
	        # closes it, so release the handles here even if detection raised.
	        # stdin is not ours to close: other code in the process may still
	        # want to read from it.
	        for f in args.input:
	            if f is not sys.stdin.buffer:
	                f.close()


	# Run the command-line interface only when this file is executed as a
	# script; importing the module must not trigger argument parsing.
	if __name__ == "__main__":
	main()