Spaces:

dwb2023
/

hf_extractor

Runtime error

App Files Files Community

hf_extractor / app.py

dwb2023

Update app.py

6be117b verified over 1 year ago

raw

history blame

3.09 kB

	import os
	import shutil
	import subprocess

	import gradio as gr

	def clone_repo(url, repo_dir):
	    """Clone the Git repository at ``url`` into ``repo_dir``.

	    ``GIT_LFS_SKIP_SMUDGE=1`` is set for the child process so that Git LFS
	    does not download LFS-tracked file contents during checkout.

	    Args:
	        url: Repository URL (or local path) to clone. Treated as untrusted
	            user input.
	        repo_dir: Destination directory for the clone.

	    Returns:
	        ``(True, None)`` when ``git clone`` exits with status 0, otherwise
	        ``(False, message)`` where ``message`` is git's stderr output, or a
	        description of the error when the ``git`` process could not be
	        started at all.
	    """
	    env = os.environ.copy()
	    env["GIT_LFS_SKIP_SMUDGE"] = "1"
	    # Fail fast instead of blocking on an interactive credential prompt
	    # (e.g. for private or misspelled repositories).
	    env["GIT_TERMINAL_PROMPT"] = "0"

	    # "--" ends option parsing, so a URL that begins with "-" is always
	    # treated as a repository location and never as a git command-line option.
	    command = ["git", "clone", "--", url, repo_dir]
	    try:
	        result = subprocess.run(command, env=env, capture_output=True, text=True)
	    except (OSError, ValueError) as exc:
	        # OSError: git is missing or not executable.
	        # ValueError: the arguments contain an embedded NUL byte.
	        return False, f"Failed to run git: {exc}"

	    if result.returncode != 0:
	        return False, result.stderr
	    return True, None

	def get_file_summary(file_path):
	    """Return basic metadata for the file at ``file_path``.

	    A file is classified as ``"binary"`` when it is larger than 1 MiB or when
	    a NUL byte appears in its first 8 KiB; everything else is ``"text"``.
	    The NUL check is a heuristic: size alone would label every small image
	    or compiled artifact as text.

	    Args:
	        file_path: Path to an existing file.

	    Returns:
	        A dict with ``"name"`` (the path relative to the current working
	        directory), ``"type"`` (``"text"`` or ``"binary"``) and ``"size"``
	        (size in bytes).

	    Raises:
	        OSError: If the file size cannot be determined.
	    """
	    size = os.path.getsize(file_path)
	    if size > 1024 * 1024:
	        file_type = "binary"
	    else:
	        try:
	            with open(file_path, "rb") as handle:
	                sample = handle.read(8192)
	        except OSError:
	            # Unreadable file: keep the size-based classification and let
	            # the caller report the read failure when it tries the content.
	            sample = b""
	        file_type = "binary" if b"\0" in sample else "text"

	    return {
	        "name": os.path.relpath(file_path),
	        "type": file_type,
	        "size": size,
	    }

	def read_file_content(file_path):
	    """Return the full contents of ``file_path`` decoded as UTF-8 text.

	    Bytes that are not valid UTF-8 are dropped instead of raising a
	    decoding error.
	    """
	    with open(file_path, encoding="utf-8", errors="ignore") as source:
	        text = source.read()
	    return text

	def extract_repo_content(url):
	    """Clone the repository at ``url`` and collect every file it contains.

	    Any existing ``./temp_repo`` directory is removed first, then the
	    repository is cloned into it and walked. Git's own ``.git`` metadata
	    directory is excluded. Directories and files are visited in sorted
	    order so the result is deterministic.

	    Args:
	        url: Repository URL to clone. Treated as untrusted user input.

	    Returns:
	        A list of dicts, each with a ``"header"`` dict (``name``, ``type``,
	        ``size``) and a ``"content"`` string. When cloning fails the list
	        holds a single entry whose header type is ``"error"`` and whose
	        content is the error reported by ``clone_repo``. Files that are not
	        classified as text, or that exceed 1 MiB, carry a placeholder
	        message instead of their content; so do symbolic links.
	    """
	    repo_dir = "./temp_repo"
	    if os.path.exists(repo_dir):
	        # Portable replacement for shelling out to "rm -rf".
	        shutil.rmtree(repo_dir, ignore_errors=True)

	    success, error = clone_repo(url, repo_dir)
	    if not success:
	        return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": error}]

	    extracted_content = []
	    for root, dirs, files in os.walk(repo_dir):
	        # Prune by exact directory name. A substring test on the path would
	        # also drop ".github", "foo.git", etc., and would still descend into
	        # everything below ".git".
	        dirs[:] = sorted(name for name in dirs if name != ".git")

	        for file_name in sorted(files):
	            file_path = os.path.join(root, file_name)

	            # Never follow symlinks from a cloned (untrusted) repository:
	            # they may point at arbitrary files on the host, and a dangling
	            # link would make the size lookup fail.
	            if os.path.islink(file_path):
	                extracted_content.append({
	                    "header": {
	                        "name": os.path.relpath(file_path),
	                        "type": "symlink",
	                        "size": 0,
	                    },
	                    "content": "Symbolic link, content not captured.",
	                })
	                continue

	            file_summary = get_file_summary(file_path)
	            content = {"header": file_summary}

	            if file_summary["type"] == "text" and file_summary["size"] <= 1024 * 1024:
	                try:
	                    content["content"] = read_file_content(file_path)
	                except Exception as e:
	                    # Best effort: report the failure for this file and keep going.
	                    content["content"] = f"Failed to read file content: {str(e)}"
	            else:
	                content["content"] = "File too large or binary, content not captured."

	            extracted_content.append(content)

	    return extracted_content

	def format_output(extracted_content):
	    """Render extracted file entries as one Markdown-style report string.

	    Args:
	        extracted_content: Iterable of entries. A well-formed entry is a
	            dict with a ``"content"`` value and a ``"header"`` dict holding
	            ``"name"``, ``"type"`` and ``"size"``.

	    Returns:
	        The concatenated report. Each well-formed entry contributes a
	        heading, its type and size, and its content inside a fenced block.
	        Any malformed entry contributes the line
	        ``"Error in file data format."`` instead of raising.
	    """
	    required_header_keys = ("name", "type", "size")
	    sections = []
	    for file_data in extracted_content:
	        header = file_data.get("header") if isinstance(file_data, dict) else None
	        well_formed = (
	            isinstance(header, dict)
	            and "content" in file_data
	            and all(key in header for key in required_header_keys)
	        )
	        if not well_formed:
	            # Covers non-dict entries as well as dicts with missing fields,
	            # so one bad entry cannot abort the whole report.
	            sections.append("Error in file data format.\n")
	            continue

	        sections.append(
	            f"### File: {header['name']}\n"
	            f"Type: {header['type']}\n"
	            f"Size: {header['size']} bytes\n"
	            "#### Content:\n"
	            f"```\n{file_data['content']}\n```\n\n"
	        )
	    # Join once instead of growing a string with repeated "+=".
	    return "".join(sections)

	def extract_and_display(url):
	    """Extract the repository at ``url`` and return its formatted report.

	    Passes ``url`` to ``extract_repo_content`` and renders the resulting
	    entries with ``format_output``.
	    """
	    return format_output(extract_repo_content(url))

	# Build the UI: a URL input, a read-only style output box and a trigger button.
	with gr.Blocks() as app:
	    gr.Markdown("# Gradio Space/Model Content Extractor")
	    url_input = gr.Textbox(label="Hugging Face Space/Model URL")
	    output_display = gr.Textbox(
	        show_copy_button=True,
	        lines=20,
	        placeholder="Output will be displayed here...",
	    )
	    extract_button = gr.Button("Extract Content")

	    # Clicking the button runs the extraction on the entered URL and shows
	    # the formatted report in the output box.
	    extract_button.click(
	        fn=extract_and_display,
	        inputs=url_input,
	        outputs=output_display,
	    )

	app.launch()