neuralworm's picture
Update app.py
6c588c4 verified
raw
history blame
6.9 kB
#!/usr/bin/env python3
# coding: utf-8
"""
Hugging Face Space (Gradio) App: Video -> Audio -> Whisper Transkript (+ Downloads SRT/TXT/VTT/JSON)
Hinweis: Verwende diese App nur für eigene oder freigegebene Inhalte.
"""
import os
import subprocess
import tempfile
import json
from pathlib import Path
from datetime import timedelta
import gradio as gr
try:
import whisper
except Exception:
whisper = None
def run_capture(cmd):
    """Run *cmd* (an argument list) and return its captured standard output.

    Raises:
        RuntimeError: if the process exits with a non-zero status. The message
            holds the space-joined command followed by the last 1000
            characters of the process's standard error output.
    """
    proc = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if proc.returncode == 0:
        return proc.stdout
    # Keep only the tail of stderr so the error message stays bounded.
    stderr_tail = proc.stderr[-1000:] if proc.stderr else ""
    raise RuntimeError(f"Command failed: {' '.join(cmd)}\n{stderr_tail}")
def download_video_with_ytdlp(url, out_dir, cookies_path=None, format_selector=None):
    """Download *url* into *out_dir* with yt-dlp and return the resulting file path.

    Args:
        url: Address of the video to fetch. Treated as untrusted input.
        out_dir: Directory that receives the download; the file name follows
            the yt-dlp template ``%(title)s.%(ext)s``.
        cookies_path: Optional cookies file forwarded via ``--cookies``.
        format_selector: Optional yt-dlp format expression forwarded via ``-f``.

    Returns:
        Path (as ``str``) of the most recently modified downloaded file.

    Raises:
        RuntimeError: if yt-dlp exits with a non-zero status (from ``run_capture``).
        FileNotFoundError: if no file is present in *out_dir* afterwards.
    """
    target_dir = Path(out_dir)
    out_template = str(target_dir / "%(title)s.%(ext)s")
    # Snapshot the directory so files that were already there can be told
    # apart from what this call downloads.
    already_present = set(target_dir.glob("*"))

    cmd = ["yt-dlp", "-o", out_template]
    if format_selector:
        cmd += ["-f", format_selector]
    if cookies_path:
        cmd += ["--cookies", cookies_path]
    # "--" ends option parsing: a user-supplied URL that starts with "-"
    # (e.g. "--exec=...") must never be interpreted as a yt-dlp option.
    cmd += ["--", url]
    run_capture(cmd)

    # Only regular files qualify as a download result (not sub-directories).
    regular_files = [p for p in target_dir.glob("*") if p.is_file()]
    new_files = [p for p in regular_files if p not in already_present]
    # Fall back to every regular file when nothing new appeared, e.g. when
    # the target file already existed before the call.
    candidates = new_files or regular_files
    if not candidates:
        raise FileNotFoundError("Download fehlgeschlagen — keine Datei gefunden.")
    return str(max(candidates, key=lambda p: p.stat().st_mtime))
def extract_audio_ffmpeg(video_path, out_wav):
    """Extract the audio track of *video_path* into *out_wav* using ffmpeg.

    The ffmpeg call drops the video stream (``-vn``), downmixes to one channel
    (``-ac 1``), resamples to 16000 Hz (``-ar 16000``), writes WAV
    (``-f wav``) and overwrites an existing output (``-y``). ffmpeg's own
    output is discarded.

    Returns:
        *out_wav*, unchanged.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    ffmpeg_args = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-vn",
        "-ac", "1",
        "-ar", "16000",
        "-f", "wav",
        out_wav,
    ]
    subprocess.run(
        ffmpeg_args,
        check=True,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    return out_wav
def seconds_to_timestamp(s):
    """Format a time offset in seconds as ``HH:MM:SS,mmm`` (SRT style, comma separator).

    The offset is rounded to whole milliseconds *before* it is split into
    fields. Rounding the fractional part on its own could produce a
    four-digit millisecond field (1.9996 -> "00:00:01,1000"); here the carry
    propagates into seconds, minutes and hours instead.

    Negative offsets are clamped to zero.
    """
    total_ms = max(0, int(round(s * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
def format_timestamp_vtt(s):
    """Format a time offset in seconds as ``HH:MM:SS.mmm`` (WebVTT style, dot separator).

    The offset is rounded to whole milliseconds *before* it is split into
    fields. Rounding the fractional part on its own could produce a
    four-digit millisecond field (1.9996 -> "00:00:01.1000"); here the carry
    propagates into seconds, minutes and hours instead.

    Negative offsets are clamped to zero.
    """
    total_ms = max(0, int(round(s * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{ms:03d}"
def segments_to_srt(segments):
    """Render *segments* as SRT text.

    Each segment is a mapping with ``start``, ``end`` and ``text`` keys.
    Cues are numbered from 1 and separated by a blank line; an empty
    sequence yields an empty string.
    """
    def render_cue(number, segment):
        begin = seconds_to_timestamp(segment['start'])
        finish = seconds_to_timestamp(segment['end'])
        caption = segment['text'].strip()
        return f"{number}\n{begin} --> {finish}\n{caption}\n"

    return "\n".join(
        render_cue(number, segment)
        for number, segment in enumerate(segments, start=1)
    )
def segments_to_vtt(segments):
    """Render *segments* as WebVTT text.

    The output starts with the ``WEBVTT`` header followed by one unnumbered
    cue per segment (mappings with ``start``, ``end`` and ``text`` keys),
    each separated by a blank line.
    """
    cues = []
    for segment in segments:
        begin = format_timestamp_vtt(segment['start'])
        finish = format_timestamp_vtt(segment['end'])
        caption = segment['text'].strip()
        cues.append(f"{begin} --> {finish}\n{caption}\n")
    return "\n".join(["WEBVTT\n", *cues])
def segments_to_txt(segments):
    """Render *segments* as plain text, one ``[HH:MM:SS,mmm] text`` line per segment.

    Only the start time of each segment is shown; the segment text is
    stripped of surrounding whitespace.
    """
    lines = []
    for segment in segments:
        stamp = seconds_to_timestamp(segment['start'])
        lines.append(f"[{stamp}] {segment['text'].strip()}")
    return "\n".join(lines)
def segments_to_json(segments, language=None, metadata=None):
    """Serialize *segments* and *language* (plus optional *metadata*) to a JSON string.

    The top-level object has the keys ``language`` and ``segments``; a
    ``metadata`` key is appended only when *metadata* is truthy. Output is
    indented by two spaces and keeps non-ASCII characters unescaped.
    """
    optional_part = {"metadata": metadata} if metadata else {}
    payload = {"language": language, "segments": segments, **optional_part}
    return json.dumps(payload, ensure_ascii=False, indent=2)
def transcribe_pipeline(file_obj, url, model_size, keep_video=False, cookies_file=None, format_selector=None):
    """Turn a video (URL or upload) into a Whisper transcript plus export files.

    Args:
        file_obj: Uploaded video, either a path string to an existing file or
            a file-like object with ``read()`` (and optionally ``name``).
            Only used when *url* is empty.
        url: Video address to download with yt-dlp; takes precedence over
            *file_obj*. Surrounding whitespace is ignored.
        model_size: Whisper model name passed to ``whisper.load_model``.
        keep_video: When false, a video downloaded from *url* is deleted
            after transcription.
        cookies_file: Optional cookies file path; ignored if it does not exist.
        format_selector: Optional yt-dlp format expression.

    Returns:
        A 6-tuple ``(text, srt_path, vtt_path, txt_path, json_path, status)``.
        On failure the first element carries the error message and the
        remaining five are ``None``.
    """
    # Local import: only needed to clean up the working directory on failure.
    import shutil

    no_result = (None, None, None, None, None)
    if whisper is None:
        return ("Fehler: whisper ist nicht installiert.", *no_result)

    if isinstance(url, str):
        # A whitespace-only textbox value must not be treated as a URL.
        url = url.strip()
    if not url and not file_obj:
        # Checked before the temp directory exists so nothing is left behind.
        return ("Kein Video angegeben.", *no_result)

    tmpdir = tempfile.mkdtemp(prefix="whisper_space_")
    try:
        if url:
            cookies_path = cookies_file if cookies_file and os.path.exists(cookies_file) else None
            video_path = download_video_with_ytdlp(
                url, tmpdir, cookies_path=cookies_path, format_selector=format_selector
            )
        elif isinstance(file_obj, str) and os.path.exists(file_obj):
            video_path = file_obj
        else:
            # File-like upload: copy its bytes into the working directory.
            uploaded_path = Path(tmpdir) / Path(getattr(file_obj, "name", "upload")).name
            with open(uploaded_path, "wb") as f:
                f.write(file_obj.read())
            video_path = str(uploaded_path)

        audio_wav = str(Path(tmpdir) / "audio.wav")
        extract_audio_ffmpeg(video_path, audio_wav)

        model = whisper.load_model(model_size)
        result = model.transcribe(audio_wav, verbose=False)
        segments = result.get("segments", [])
        language = result.get("language", "unknown")

        exports = {
            "srt": segments_to_srt(segments),
            "vtt": segments_to_vtt(segments),
            "txt": segments_to_txt(segments),
            "json": segments_to_json(segments, language, {"model": model_size}),
        }
        base = Path(video_path).stem
        files = {}
        for ext, content in exports.items():
            p = Path(tmpdir) / f"{base}.{ext}"
            p.write_text(content, encoding="utf-8")
            files[ext] = str(p)

        # Best-effort removal of large intermediates. The export files must
        # stay on disk because their paths are returned to the caller.
        leftovers = [audio_wav]
        if not keep_video and url:
            leftovers.append(video_path)
        for leftover in leftovers:
            try:
                os.remove(leftover)
            except OSError:
                pass

        return (
            exports["txt"],
            files["srt"],
            files["vtt"],
            files["txt"],
            files["json"],
            f"Model: {model_size}, Sprache: {language}",
        )
    except Exception as e:
        # Nothing in the working directory is handed out on failure, so drop
        # it entirely instead of leaking a temp dir per failed request.
        shutil.rmtree(tmpdir, ignore_errors=True)
        return (f"Fehler: {e}", *no_result)
# Gradio UI: inputs and status in the first column, transcript and download
# slots in the second. Download slots stay hidden until a file is available.
with gr.Blocks() as demo:
    gr.Markdown("# Video → Whisper Transkript (SRT/TXT/VTT/JSON)")
    with gr.Row():
        with gr.Column():
            url_in = gr.Textbox(label="Video URL", placeholder="https://...")
            file_in = gr.File(label="Oder Videodatei hochladen")
            cookies_in = gr.File(label="Cookies.txt (optional)")
            fmt_in = gr.Textbox(label="Format (optional, yt-dlp -f)")
            model_sel = gr.Radio(["tiny", "base", "small", "medium", "large"], value="small", label="Whisper-Modell")
            keep_chk = gr.Checkbox(label="Video behalten", value=False)
            btn = gr.Button("Transkribieren")
            status = gr.Textbox(label="Status")
        with gr.Column():
            transcript = gr.Textbox(label="Transkript", lines=20)
            srt_dl = gr.File(label="SRT", visible=False)
            vtt_dl = gr.File(label="VTT", visible=False)
            txt_dl = gr.File(label="TXT", visible=False)
            json_dl = gr.File(label="JSON", visible=False)

    def run_transcribe(f, u, m, k, c, fmt):
        """Click handler: run the pipeline and map its result onto the UI outputs.

        Parameters follow the order of the inputs wired in ``btn.click``:
        uploaded file, URL, model name, keep-video flag, cookies upload and
        yt-dlp format string.
        """
        # The cookies upload may arrive as a path string or as a file-like
        # object exposing ``.name``; accept both so it is not silently dropped.
        cookie_candidate = c if isinstance(c, str) else getattr(c, "name", None)
        cookies_path = cookie_candidate if cookie_candidate and os.path.exists(cookie_candidate) else None

        display, srtf, vttf, txtf, jsonf, meta = transcribe_pipeline(
            f, u, m, k, cookies_file=cookies_path, format_selector=fmt
        )

        def download_slot(path):
            # Show a download component only when there is a file to offer.
            return gr.update(value=path, visible=bool(path))

        return (
            display,
            download_slot(srtf),
            download_slot(vttf),
            download_slot(txtf),
            download_slot(jsonf),
            meta,
        )

    btn.click(
        run_transcribe,
        [file_in, url_in, model_sel, keep_chk, cookies_in, fmt_in],
        [transcript, srt_dl, vtt_dl, txt_dl, json_dl, status],
    )
if __name__ == "__main__":
    # Listen on all interfaces; the port comes from the PORT environment
    # variable and falls back to 7860 when it is unset.
    listen_port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=listen_port)