Spaces:

neuralworm
/

video_transcription

Running

App Files Files Community

neuralworm committed 22 days ago

Commit

a9d392f

verified ·

1 Parent(s): 185f270

Create app.py

Browse files

Files changed (1) hide show

app.py +204 -0

app.py ADDED Viewed

	@@ -0,0 +1,204 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Hugging Face Space (Gradio) App: Video -> Audio -> Whisper Transkript (+ Downloads SRT/TXT/VTT/JSON)
+Rechtlicher Hinweis:
+- Verwende diese App nur für eigene Inhalte oder Inhalte, für die du explizit die Erlaubnis hast.
+- Respektiere Urheberrecht und die Terms of Service der jeweiligen Plattformen.
+Benötigt:
+- ffmpeg (systemweit)
+- Python-Pakete siehe requirements.txt
+"""
+import os
+import subprocess
+import tempfile
+import json
+from pathlib import Path
+from datetime import timedelta
+import gradio as gr
+# Try to import whisper (installed via requirements.txt as a git+repo dependency)
+try:
+    import whisper
+except Exception as e:
+    whisper = None
+# Hilfsfunktionen ----------------------------------------------------------
+def run(cmd, hide_output=False):
+    """Run shell command, raise on error."""
+    if hide_output:
+        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    else:
+        subprocess.run(cmd, check=True)
+def download_video_with_ytdlp(url: str, out_dir: str) -> str:
+    """Download best video using yt-dlp into out_dir, return filepath"""
+    out_template = str(Path(out_dir) / "%(title)s.%(ext)s")
+    cmd = ["yt-dlp", "-f", "best", "-o", out_template, url]
+    run(cmd)
+    # pick most recently modified file
+    files = sorted(Path(out_dir).glob("*"), key=lambda p: p.stat().st_mtime, reverse=True)
+    if not files:
+        raise FileNotFoundError("Download erfolglos — keine Datei gefunden.")
+    return str(files[0])
+def extract_audio_ffmpeg(video_path: str, out_wav: str):
+    """Extract mono 16k WAV for Whisper"""
+    cmd = [
+        "ffmpeg",
+        "-y",
+        "-i", video_path,
+        "-vn",
+        "-ac", "1",
+        "-ar", "16000",
+        "-f", "wav",
+        out_wav
+    ]
+    run(cmd, hide_output=True)
+    return out_wav
+def seconds_to_timestamp(s: float, always_ms: bool = True) -> str:
+    """Convert seconds (float) to SRT/VTT time format HH:MM:SS,mmm"""
+    td = timedelta(seconds=float(s))
+    total_seconds = int(td.total_seconds())
+    hours = total_seconds // 3600
+    minutes = (total_seconds % 3600) // 60
+    seconds = total_seconds % 60
+    milliseconds = int(td.microseconds / 1000 + (td.seconds - int(td.seconds)) * 1000)
+    # Better approach using fractional part:
+    frac = s - int(s)
+    ms = int(round((s - int(s)) * 1000)) if s >= 0 else 0
+    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
+def format_timestamp_vtt(s: float) -> str:
+    td = timedelta(seconds=float(s))
+    total_seconds = int(td.total_seconds())
+    hours = total_seconds // 3600
+    minutes = (total_seconds % 3600) // 60
+    seconds = total_seconds % 60
+    ms = int(round((s - int(s)) * 1000))
+    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{ms:03d}"
+def segments_to_srt(segments):
+    """Create SRT string from whisper segments"""
+    parts = []
+    for i, seg in enumerate(segments, start=1):
+        start = seconds_to_timestamp(seg['start'])
+        end = seconds_to_timestamp(seg['end'])
+        text = seg['text'].strip()
+        parts.append(f"{i}\n{start} --> {end}\n{text}\n")
+    return "\n".join(parts)
+def segments_to_vtt(segments):
+    """Create VTT string from whisper segments"""
+    parts = ["WEBVTT\n"]
+    for seg in segments:
+        start = format_timestamp_vtt(seg['start'])
+        end = format_timestamp_vtt(seg['end'])
+        text = seg['text'].strip()
+        parts.append(f"{start} --> {end}\n{text}\n")
+    return "\n".join(parts)
+def segments_to_txt(segments):
+    """Create plain TXT with timestamps per segment"""
+    lines = []
+    for seg in segments:
+        start = seconds_to_timestamp(seg['start'])
+        text = seg['text'].strip()
+        lines.append(f"[{start}] {text}")
+    return "\n".join(lines)
+def segments_to_json(segments, language=None, metadata=None):
+    obj = {
+        "language": language,
+        "segments": segments
+    }
+    if metadata:
+        obj["metadata"] = metadata
+    return json.dumps(obj, ensure_ascii=False, indent=2)
+# Haupt-Workflow ----------------------------------------------------------
+def transcribe_pipeline(file_obj, url, model_size, keep_video=False):
+    """
+    file_obj: uploaded file (temp path) or None
+    url: optional URL to download via yt-dlp
+    model_size: whisper model size
+    """
+    if whisper is None:
+        return "Fehler: lokales whisper nicht verfügbar. Stelle sicher, dass das Repo installiert ist.", None, None, None, None, None
+    tmpdir = tempfile.mkdtemp(prefix="whisper_space_")
+    try:
+        # 1) Get video path either from uploaded file or by downloading URL
+        if url:
+            video_path = download_video_with_ytdlp(url, tmpdir)
+        elif file_obj:
+            # file_obj is a tuple (name, file-like) or a path depending on Gradio version.
+            # Gradio typically supplies a filesystem path.
+            if isinstance(file_obj, str) and os.path.exists(file_obj):
+                video_path = file_obj
+            else:
+                # try to write content to temp file
+                uploaded_path = Path(tmpdir) / Path(getattr(file_obj, "name", "upload")).name
+                with open(uploaded_path, "wb") as f:
+                    # file_obj may be a SpooledTemporaryFile or similar with .read()
+                    f.write(file_obj.read())
+                video_path = str(uploaded_path)
+        else:
+            return "Kein Video angegeben (weder Datei noch URL).", None, None, None, None, None
+        # 2) Extract audio
+        audio_wav = str(Path(tmpdir) / "audio.wav")
+        extract_audio_ffmpeg(video_path, audio_wav)
+        # 3) Load whisper model and transcribe
+        model = whisper.load_model(model_size)
+        # transcribe: get segments to generate SRT/VTT etc.
+        result = model.transcribe(audio_wav, verbose=False)
+        segments = result.get("segments", [])
+        language = result.get("language", None)
+        # 4) Create output strings
+        srt_text = segments_to_srt(segments)
+        vtt_text = segments_to_vtt(segments)
+        txt_text = segments_to_txt(segments)
+        json_text = segments_to_json(segments, language=language, metadata={"model": model_size})
+        # 5) Save files to tmpdir for download via Gradio
+        out_files = {}
+        base_name = Path(video_path).stem
+        files_map = {
+            f"{base_name}.srt": srt_text,
+            f"{base_name}.vtt": vtt_text,
+            f"{base_name}.txt": txt_text,
+            f"{base_name}.json": json_text
+        }
+        for fname, content in files_map.items():
+            path = Path(tmpdir) / fname
+            path.write_text(content, encoding="utf-8")
+            out_files[fname] = str(path)
+        # 6) prepare display text with timestamps for UI (simple combined view)
+        display_lines = []
+        for seg in segments:
+            start = seconds_to_timestamp(seg['start'])
+            display_lines.append(f"[{start}] {seg['text'].strip()}")
+        display_text = "\n".join(display_lines)
+        # Optionally remove video to save space
+        if not keep_video and url:
+            try:
+                os.remove(video_path)
+            except Exception:
+                pass
+        return display_text, out_files[f"{base_name}.srt"], out_files[f"{base_name}.vtt"], out_files[f"{base_name}.txt"], out_files[f"{base_name}.json"], f"Model: {model_size}, Language: {language}"
+    except Exception as e:
+        return f"Fehler während Verarbeitung: {e}", None, None, None, None, None
+    finally:
+        # Do not delete tmpdir immediately if the user wants to download