# NOTE(review): the six lines below are Hugging Face blob-page chrome that was
# accidentally captured together with the file; commented out so that the
# module parses as valid Python. Preserved verbatim for provenance.
# neuralworm's picture
# Update app.py
# 1f568e6 verified
# raw
# history blame
# 6.17 kB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Hugging Face Space (Gradio) App: Video -> Audio -> Whisper Transkript (+ Downloads SRT/TXT/VTT/JSON)
"""
import os
import subprocess
import tempfile
import json
from pathlib import Path
from datetime import timedelta
import gradio as gr
try:
import whisper
except Exception:
whisper = None
def run(cmd, hide_output=False):
    """Execute *cmd* (an argument list) and raise CalledProcessError on failure.

    When *hide_output* is true, the child's stdout and stderr are discarded;
    otherwise they are inherited from the parent process.
    """
    # stdout/stderr = None means "inherit", exactly like omitting the kwargs.
    sink = subprocess.DEVNULL if hide_output else None
    subprocess.run(cmd, check=True, stdout=sink, stderr=sink)
def download_video_with_ytdlp(url: str, out_dir: str) -> str:
    """Download *url* with yt-dlp into *out_dir* and return the newest file's path.

    yt-dlp names the file after the video title, so the result is located by
    picking the most recently modified entry in *out_dir*.
    """
    template = str(Path(out_dir) / "%(title)s.%(ext)s")
    run(["yt-dlp", "-f", "best", "-o", template, url])
    candidates = sorted(
        Path(out_dir).glob("*"),
        key=lambda entry: entry.stat().st_mtime,
        reverse=True,
    )
    if not candidates:
        raise FileNotFoundError("Download fehlgeschlagen β€” keine Datei gefunden.")
    return str(candidates[0])
def extract_audio_ffmpeg(video_path: str, out_wav: str):
    """Extract the audio track of *video_path* as mono 16 kHz WAV into *out_wav*.

    16 kHz mono is the sample format Whisper expects. Returns *out_wav*.
    """
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",               # overwrite an existing output file
        "-i", video_path,
        "-vn",              # drop the video stream
        "-ac", "1",         # downmix to mono
        "-ar", "16000",     # resample to 16 kHz
        "-f", "wav",
        out_wav,
    ]
    run(ffmpeg_cmd, hide_output=True)
    return out_wav
def seconds_to_timestamp(s: float) -> str:
    """Convert a duration in seconds to an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond *first* and then split
    into fields. The previous implementation rounded the seconds and the
    milliseconds independently, so an input like 1.9996 produced the invalid
    timestamp ``00:00:01,1000`` (millisecond field of 1000) instead of
    ``00:00:02,000``.
    """
    total_ms = int(round(s * 1000))
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    seconds, ms = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
def format_timestamp_vtt(s: float) -> str:
    """Convert a duration in seconds to a WebVTT timestamp ``HH:MM:SS.mmm``.

    Rounds to the nearest millisecond before splitting into fields, avoiding
    the carry bug of the previous version where e.g. 1.9996 yielded the
    invalid ``00:00:01.1000`` (independent rounding of seconds and ms).
    Identical to the SRT formatter except for the ``.`` separator VTT uses.
    """
    total_ms = int(round(s * 1000))
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    seconds, ms = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{ms:03d}"
def segments_to_srt(segments):
    """Render Whisper segments as an SRT subtitle document (string)."""
    cues = []
    index = 1
    for seg in segments:
        cue_start = seconds_to_timestamp(seg['start'])
        cue_end = seconds_to_timestamp(seg['end'])
        body = seg['text'].strip()
        # SRT cue: sequence number, time range, text, trailing blank line.
        cues.append(f"{index}\n{cue_start} --> {cue_end}\n{body}\n")
        index += 1
    return "\n".join(cues)
def segments_to_vtt(segments):
    """Render Whisper segments as a WebVTT document (string)."""
    # A VTT file must open with the "WEBVTT" header line.
    parts = ["WEBVTT\n"]
    parts.extend(
        f"{format_timestamp_vtt(seg['start'])} --> {format_timestamp_vtt(seg['end'])}\n"
        f"{seg['text'].strip()}\n"
        for seg in segments
    )
    return "\n".join(parts)
def segments_to_txt(segments):
    """Render segments as plain text, one "[HH:MM:SS,mmm] text" line each."""
    lines = []
    for seg in segments:
        stamp = seconds_to_timestamp(seg['start'])
        lines.append(f"[{stamp}] {seg['text'].strip()}")
    return "\n".join(lines)
def segments_to_json(segments, language=None, metadata=None):
    """Serialize segments (with optional language and metadata) to pretty JSON.

    The "metadata" key is only included when *metadata* is truthy, matching
    the original behavior (an empty dict is omitted).
    """
    payload = {
        "language": language,
        "segments": segments,
    }
    if metadata:
        payload["metadata"] = metadata
    return json.dumps(payload, ensure_ascii=False, indent=2)
def transcribe_pipeline(file_obj, url, model_size, keep_video=False):
    """Fetch/accept a video, extract its audio, transcribe with Whisper, and
    write subtitle files in four formats.

    Parameters:
        file_obj: an uploaded file β€” either a path string or a file-like
            object with ``.read()`` β€” or None.
        url: optional video URL for yt-dlp; when set it takes precedence
            over *file_obj*.
        model_size: Whisper model name ("tiny" ... "large").
        keep_video: when False and the video was downloaded from *url*,
            the video file is deleted after transcription.

    Returns a 6-tuple ``(display_text, srt_path, vtt_path, txt_path,
    json_path, status)``. On any error the first element is a German
    error message and the remaining elements are None.
    """
    # Graceful degradation: whisper may have failed to import at module load.
    if whisper is None:
        return "Fehler: whisper ist nicht installiert.", None, None, None, None, None
    tmpdir = tempfile.mkdtemp(prefix="whisper_space_")
    try:
        if url:
            video_path = download_video_with_ytdlp(url, tmpdir)
        elif file_obj:
            # Gradio may hand us a temp-file path string instead of a file object.
            if isinstance(file_obj, str) and os.path.exists(file_obj):
                video_path = file_obj
            else:
                # File-like upload: copy its bytes into our working directory.
                uploaded_path = Path(tmpdir) / Path(getattr(file_obj, "name", "upload")).name
                with open(uploaded_path, "wb") as f:
                    f.write(file_obj.read())
                video_path = str(uploaded_path)
        else:
            return "Kein Video angegeben.", None, None, None, None, None
        audio_wav = str(Path(tmpdir) / "audio.wav")
        extract_audio_ffmpeg(video_path, audio_wav)
        # load_model downloads weights on first use; subsequent calls hit the cache.
        model = whisper.load_model(model_size)
        result = model.transcribe(audio_wav, verbose=False)
        segments = result.get("segments", [])
        language = result.get("language", "unknown")
        srt_text = segments_to_srt(segments)
        vtt_text = segments_to_vtt(segments)
        txt_text = segments_to_txt(segments)
        json_text = segments_to_json(segments, language, {"model": model_size})
        # Write one output file per format, named after the source video.
        out_files = {}
        base = Path(video_path).stem
        for ext, content in {"srt": srt_text, "vtt": vtt_text, "txt": txt_text, "json": json_text}.items():
            p = Path(tmpdir) / f"{base}.{ext}"
            p.write_text(content, encoding="utf-8")
            out_files[ext] = str(p)
        display_text = txt_text
        # Only downloaded videos are removed; user uploads are left untouched.
        if not keep_video and url:
            try:
                os.remove(video_path)
            except Exception:
                pass
        return display_text, out_files["srt"], out_files["vtt"], out_files["txt"], out_files["json"], f"Model: {model_size}, Sprache: {language}"
    except Exception as e:
        # Surface any failure as a user-visible message rather than crashing the UI.
        return f"Fehler: {e}", None, None, None, None, None
    finally:
        # NOTE(review): tmpdir is deliberately NOT removed here β€” the subtitle
        # files returned above live inside it and Gradio serves them for
        # download after this function returns. Temp dirs accumulate over the
        # Space's lifetime; a periodic cleanup would be needed to reclaim them.
        pass
# --- Gradio UI: inputs (URL or file, model choice) on the left, transcript
# --- and per-format download widgets on the right. ---
with gr.Blocks() as demo:
    gr.Markdown("# Video β†’ Whisper Transkript (SRT/TXT/VTT/JSON)")
    with gr.Row():
        with gr.Column():
            url_in = gr.Textbox(label="Video URL", placeholder="https://...")
            file_in = gr.File(label="Oder Videodatei hochladen")
            model_sel = gr.Radio(["tiny", "base", "small", "medium", "large"], value="small", label="Whisper-Modell")
            keep_chk = gr.Checkbox(label="Video behalten", value=False)
            btn = gr.Button("Transkribieren")
            status = gr.Textbox(label="Status")
        with gr.Column():
            transcript = gr.Textbox(label="Transkript mit Zeitmarken", lines=20)
            # Download widgets start hidden and are revealed once a file exists.
            srt_dl = gr.File(label="SRT", visible=False)
            vtt_dl = gr.File(label="VTT", visible=False)
            txt_dl = gr.File(label="TXT", visible=False)
            json_dl = gr.File(label="JSON", visible=False)
    def run_transcribe(f, u, m, k):
        """Click handler: run the pipeline and toggle each download's visibility
        based on whether its file path was produced."""
        display, srtf, vttf, txtf, jsonf, meta = transcribe_pipeline(f, u, m, k)
        return display, gr.update(value=srtf, visible=bool(srtf)), gr.update(value=vttf, visible=bool(vttf)), gr.update(value=txtf, visible=bool(txtf)), gr.update(value=jsonf, visible=bool(jsonf)), meta
    btn.click(run_transcribe, [file_in, url_in, model_sel, keep_chk], [transcript, srt_dl, vtt_dl, txt_dl, json_dl, status])
# Bind to all interfaces on the port Spaces provides (default 7860).
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))