Spaces:

neuralworm
/

video_transcription

Sleeping

App Files Files Community

video_transcription / app.py

neuralworm

Update app.py

6c588c4 verified 23 days ago

raw

history blame

6.9 kB

	#!/usr/bin/env python3
	# coding: utf-8
	"""
	Hugging Face Space (Gradio) app: video -> audio -> Whisper transcript (+ SRT/TXT/VTT/JSON downloads)

	Note: Use this app only for your own content or content you are permitted to use.
	"""
	import os
	import subprocess
	import tempfile
	import json
	from pathlib import Path
	from datetime import timedelta
	import gradio as gr

	try:
	import whisper
	except Exception:
	whisper = None

	def run_capture(cmd):
	    """Run *cmd* (an argument list) and return its captured stdout as text.

	    Raises:
	        RuntimeError: if the process exits with a non-zero status. The
	            message contains the space-joined command line followed by
	            the last 1000 characters of stderr (empty if there is none).
	    """
	    proc = subprocess.run(cmd, capture_output=True, text=True)
	    if proc.returncode == 0:
	        return proc.stdout
	    # Only the tail of stderr is kept so the error message stays bounded.
	    stderr_tail = proc.stderr[-1000:] if proc.stderr else ""
	    command_line = " ".join(cmd)
	    raise RuntimeError(f"Command failed: {command_line}\n{stderr_tail}")

	def download_video_with_ytdlp(url, out_dir, cookies_path=None, format_selector=None):
	    """Download *url* into *out_dir* with yt-dlp and return the newest file's path.

	    Args:
	        url: Address handed to yt-dlp as its single positional argument.
	        out_dir: Directory that receives the download; the output template
	            is ``%(title)s.%(ext)s`` inside it.
	        cookies_path: Optional cookies file, forwarded via ``--cookies``.
	        format_selector: Optional format expression, forwarded via ``-f``.

	    Returns:
	        Path (as ``str``) of the most recently modified regular file in
	        *out_dir* after yt-dlp has finished.

	    Raises:
	        RuntimeError: propagated from ``run_capture`` when yt-dlp fails.
	        FileNotFoundError: if *out_dir* contains no regular file afterwards.
	    """
	    out_template = str(Path(out_dir) / "%(title)s.%(ext)s")
	    cmd = ["yt-dlp", "-o", out_template]
	    if format_selector:
	        cmd += ["-f", format_selector]
	    if cookies_path:
	        cmd += ["--cookies", cookies_path]
	    # "--" ends option parsing, so a user-supplied URL that starts with "-"
	    # is always treated as a URL and can never be read as a yt-dlp option.
	    cmd += ["--", url]
	    run_capture(cmd)
	    # Only regular files qualify: a sub-directory must not be returned as
	    # the downloaded video.
	    candidates = [p for p in Path(out_dir).glob("*") if p.is_file()]
	    if not candidates:
	        raise FileNotFoundError("Download fehlgeschlagen — keine Datei gefunden.")
	    newest = max(candidates, key=lambda p: p.stat().st_mtime)
	    return str(newest)

	def extract_audio_ffmpeg(video_path, out_wav):
	    """Run ffmpeg to write the audio of *video_path* to *out_wav*; return *out_wav*.

	    The command requests no video stream (``-vn``), one audio channel
	    (``-ac 1``), a 16000 Hz sample rate (``-ar 16000``) and the WAV container
	    (``-f wav``); ``-y`` overwrites an existing output file. ffmpeg's own
	    output is discarded.

	    Raises:
	        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
	    """
	    ffmpeg_args = [
	        "ffmpeg", "-y",
	        "-i", video_path,
	        "-vn",
	        "-ac", "1",
	        "-ar", "16000",
	        "-f", "wav",
	        out_wav,
	    ]
	    discard = subprocess.DEVNULL
	    subprocess.run(ffmpeg_args, stdout=discard, stderr=discard, check=True)
	    return out_wav

	def seconds_to_timestamp(s):
	    """Format *s* seconds as an SRT timestamp ``HH:MM:SS,mmm``.

	    The value is rounded to whole milliseconds once, then split into
	    fields. Rounding the fractional part on its own could yield 1000 ms
	    (e.g. 1.9996 s became ``00:00:01,1000``); rounding the total lets the
	    carry propagate into seconds, minutes and hours. Negative inputs are
	    clamped to zero.
	    """
	    total_ms = max(0, int(round(s * 1000)))
	    hours, remainder = divmod(total_ms, 3_600_000)
	    minutes, remainder = divmod(remainder, 60_000)
	    seconds, ms = divmod(remainder, 1000)
	    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"

	def format_timestamp_vtt(s):
	    """Format *s* seconds as a WebVTT timestamp ``HH:MM:SS.mmm``.

	    The value is rounded to whole milliseconds once, then split into
	    fields. Rounding the fractional part on its own could yield 1000 ms
	    (e.g. 1.9996 s became ``00:00:01.1000``); rounding the total lets the
	    carry propagate into seconds, minutes and hours. Negative inputs are
	    clamped to zero.
	    """
	    total_ms = max(0, int(round(s * 1000)))
	    hours, remainder = divmod(total_ms, 3_600_000)
	    minutes, remainder = divmod(remainder, 60_000)
	    seconds, ms = divmod(remainder, 1000)
	    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{ms:03d}"

	def segments_to_srt(segments):
	    """Render *segments* as SubRip text.

	    Each segment is a mapping with ``start``, ``end`` and ``text`` keys.
	    Cues are numbered from 1, and each cue is followed by a blank line.
	    An empty input gives an empty string.
	    """
	    def render_cue(number, segment):
	        # Fields are evaluated in start, end, text order.
	        return (
	            f"{number}\n"
	            f"{seconds_to_timestamp(segment['start'])} --> {seconds_to_timestamp(segment['end'])}\n"
	            f"{segment['text'].strip()}\n"
	        )
	    return "\n".join(render_cue(number, segment) for number, segment in enumerate(segments, 1))

	def segments_to_vtt(segments):
	    """Render *segments* as WebVTT text.

	    The output starts with the ``WEBVTT`` header line. Each segment (a
	    mapping with ``start``, ``end`` and ``text`` keys) becomes one
	    un-numbered cue followed by a blank line.
	    """
	    blocks = ["WEBVTT\n"]
	    blocks.extend(
	        f"{format_timestamp_vtt(segment['start'])} --> {format_timestamp_vtt(segment['end'])}\n"
	        f"{segment['text'].strip()}\n"
	        for segment in segments
	    )
	    return "\n".join(blocks)

	def segments_to_txt(segments):
	    """Render *segments* as plain text, one ``[HH:MM:SS,mmm] text`` line each.

	    The bracketed stamp is the segment's ``start`` time, and the text is
	    stripped of surrounding whitespace.
	    """
	    lines = []
	    for segment in segments:
	        stamp = seconds_to_timestamp(segment['start'])
	        spoken = segment['text'].strip()
	        lines.append(f"[{stamp}] {spoken}")
	    return "\n".join(lines)

	def segments_to_json(segments, language=None, metadata=None):
	    """Serialize *segments* and *language* to indented JSON text.

	    The document has ``language`` and ``segments`` keys; a ``metadata`` key
	    is added only when *metadata* is truthy. Non-ASCII characters are
	    written as-is rather than escaped.
	    """
	    payload = dict(language=language, segments=segments)
	    if metadata:
	        payload.update(metadata=metadata)
	    return json.dumps(payload, indent=2, ensure_ascii=False)

	def transcribe_pipeline(file_obj, url, model_size, keep_video=False, cookies_file=None, format_selector=None):
	    """Turn a video (URL or upload) into transcript files in four formats.

	    Steps: obtain the video (yt-dlp download when *url* is given, otherwise
	    the uploaded *file_obj*), extract audio with ffmpeg, transcribe it with
	    the Whisper model named *model_size*, and write SRT/VTT/TXT/JSON files
	    into a fresh temporary directory.

	    Args:
	        file_obj: Uploaded video, either an existing path string or an
	            object with ``read()`` (and optionally ``name``). Ignored when
	            *url* is given.
	        url: Video URL; takes precedence over *file_obj*. Surrounding
	            whitespace is stripped.
	        model_size: Name passed to ``whisper.load_model``.
	        keep_video: When false, a video downloaded from *url* is deleted
	            after transcription. Uploaded files are never deleted.
	        cookies_file: Optional cookies file path for yt-dlp; used only if
	            the path exists.
	        format_selector: Optional yt-dlp ``-f`` expression.

	    Returns:
	        A 6-tuple ``(text, srt_path, vtt_path, txt_path, json_path, status)``.
	        On any failure the first element is an error message and the other
	        five are ``None``.
	    """
	    import shutil  # local import: only needed to clean up after a failure
	    if whisper is None:
	        return "Fehler: whisper ist nicht installiert.", None, None, None, None, None
	    if isinstance(url, str):
	        url = url.strip()  # a whitespace-only textbox value is "no URL"
	    # Validate before creating the directory so nothing is left behind.
	    if not url and not file_obj:
	        return "Kein Video angegeben.", None, None, None, None, None
	    tmpdir = tempfile.mkdtemp(prefix="whisper_space_")
	    try:
	        if url:
	            cookies_path = cookies_file if cookies_file and os.path.exists(cookies_file) else None
	            video_path = download_video_with_ytdlp(url, tmpdir, cookies_path=cookies_path, format_selector=format_selector)
	        elif isinstance(file_obj, str) and os.path.exists(file_obj):
	            video_path = file_obj
	        else:
	            uploaded_path = Path(tmpdir) / Path(getattr(file_obj, "name", "upload")).name
	            with open(uploaded_path, "wb") as f:
	                f.write(file_obj.read())
	            video_path = str(uploaded_path)
	        # A unique name keeps the intermediate WAV from colliding with an
	        # input that is itself called "audio.wav" (ffmpeg rejects in == out).
	        fd, audio_wav = tempfile.mkstemp(prefix="audio_", suffix=".wav", dir=tmpdir)
	        os.close(fd)
	        extract_audio_ffmpeg(video_path, audio_wav)
	        model = whisper.load_model(model_size)
	        result = model.transcribe(audio_wav, verbose=False)
	        try:
	            os.remove(audio_wav)  # intermediate only; not part of the outputs
	        except OSError:
	            pass
	        segments = result.get("segments", [])
	        language = result.get("language", "unknown")
	        txt_text = segments_to_txt(segments)
	        rendered = {
	            "srt": segments_to_srt(segments),
	            "vtt": segments_to_vtt(segments),
	            "txt": txt_text,
	            "json": segments_to_json(segments, language, {"model": model_size}),
	        }
	        base = Path(video_path).stem
	        files = {}
	        for ext, content in rendered.items():
	            p = Path(tmpdir) / f"{base}.{ext}"
	            p.write_text(content, encoding="utf-8")
	            files[ext] = str(p)
	        if not keep_video and url:
	            try:
	                os.remove(video_path)
	            except OSError:
	                pass  # best-effort cleanup; the transcript is already written
	        return txt_text, files["srt"], files["vtt"], files["txt"], files["json"], f"Model: {model_size}, Sprache: {language}"
	    except Exception as e:
	        # No paths are handed back on failure, so the directory is garbage.
	        shutil.rmtree(tmpdir, ignore_errors=True)
	        return f"Fehler: {e}", None, None, None, None, None

	# UI definition: inputs and status in the left column, transcript and
	# download slots in the right column.
	with gr.Blocks() as demo:
	    gr.Markdown("# Video → Whisper Transkript (SRT/TXT/VTT/JSON)")
	    with gr.Row():
	        with gr.Column():
	            url_in = gr.Textbox(label="Video URL", placeholder="https://...")
	            file_in = gr.File(label="Oder Videodatei hochladen")
	            cookies_in = gr.File(label="Cookies.txt (optional)")
	            fmt_in = gr.Textbox(label="Format (optional, yt-dlp -f)")
	            model_sel = gr.Radio(["tiny", "base", "small", "medium", "large"], value="small", label="Whisper-Modell")
	            keep_chk = gr.Checkbox(label="Video behalten", value=False)
	            btn = gr.Button("Transkribieren")
	            status = gr.Textbox(label="Status")
	        with gr.Column():
	            transcript = gr.Textbox(label="Transkript", lines=20)
	            # Download slots stay hidden until a file exists for them.
	            srt_dl = gr.File(label="SRT", visible=False)
	            vtt_dl = gr.File(label="VTT", visible=False)
	            txt_dl = gr.File(label="TXT", visible=False)
	            json_dl = gr.File(label="JSON", visible=False)
	    def run_transcribe(f, u, m, k, c, fmt):
	        """Click handler: run the pipeline and map its result onto the outputs.

	        Arguments follow the input list passed to ``btn.click``: video file,
	        URL, model name, keep-video flag, cookies file, yt-dlp format.
	        Returns the transcript text, four updates for the download slots
	        (each visible only when its file path is set) and the status text.
	        """
	        # The cookies upload may arrive as a path string or as an object
	        # exposing ``.name``; accept both, mirroring how transcribe_pipeline
	        # treats the video upload.
	        candidate = c if isinstance(c, str) else getattr(c, "name", None)
	        cookies_path = candidate if isinstance(candidate, str) and os.path.exists(candidate) else None
	        display, srtf, vttf, txtf, jsonf, meta = transcribe_pipeline(f, u, m, k, cookies_file=cookies_path, format_selector=fmt)
	        return display, gr.update(value=srtf, visible=bool(srtf)), gr.update(value=vttf, visible=bool(vttf)), gr.update(value=txtf, visible=bool(txtf)), gr.update(value=jsonf, visible=bool(jsonf)), meta
	    btn.click(run_transcribe, [file_in, url_in, model_sel, keep_chk, cookies_in, fmt_in], [transcript, srt_dl, vtt_dl, txt_dl, json_dl, status])

	if __name__ == "__main__":
	    # Listen on all interfaces; the port comes from $PORT, defaulting to 7860.
	    listen_port = int(os.environ.get("PORT", 7860))
	    demo.launch(server_name="0.0.0.0", server_port=listen_port)