File size: 7,240 Bytes
a9d392f
6c588c4
 
 
 
 
 
 
 
 
 
 
 
a9d392f
 
d926f18
6c588c4
 
 
 
 
 
 
 
 
 
 
8278e48
6c588c4
 
 
 
 
 
 
8278e48
 
 
 
 
 
 
6c588c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a9d392f
6c588c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env python3
# coding: utf-8
"""
Hugging Face Space (Gradio) App: Video -> Audio -> Whisper Transkript (+ Downloads SRT/TXT/VTT/JSON)

Hinweis: Verwende diese App nur für eigene oder freigegebene Inhalte.
"""
import json
import os
import shutil
import subprocess
import tempfile
from datetime import timedelta
from pathlib import Path

import gradio as gr

try:
    import whisper
except Exception:
    whisper = None

def run_capture(cmd):
    """Run *cmd* (an argument list) and return its captured standard output.

    Both stdout and stderr are captured as text.

    Raises:
        RuntimeError: if the process exits with a non-zero status. The message
            contains the joined command and the last 1000 characters of stderr.
    """
    proc = subprocess.run(cmd, capture_output=True, text=True)
    if proc.returncode == 0:
        return proc.stdout
    # Only the tail of stderr is kept so the message stays readable.
    stderr_tail = (proc.stderr or "")[-1000:]
    joined = " ".join(cmd)
    raise RuntimeError(f"Command failed: {joined}\n{stderr_tail}")

def download_video_with_ytdlp(url: str, out_dir: str, cookies_path=None, format_selector=None) -> str:
    """Download *url* into *out_dir* with the ``yt-dlp`` CLI and return the file path.

    Args:
        url: Address of the video to download.
        out_dir: Directory the download is written to (``%(title)s.%(ext)s``).
        cookies_path: Optional path passed to ``yt-dlp --cookies``.
        format_selector: Optional value passed to ``yt-dlp -f``.

    Returns:
        Path of the downloaded file as a string.

    Raises:
        RuntimeError: if yt-dlp exits with an error; name-resolution failures
            are re-raised with a hint that the environment has no internet.
        FileNotFoundError: if no file exists in *out_dir* after the download.
    """
    target_dir = Path(out_dir)
    # Remember what was already there so the new download can be told apart
    # from older files regardless of the modification time yt-dlp assigns.
    preexisting = set(target_dir.glob("*"))

    cmd = ["yt-dlp", "-o", str(target_dir / "%(title)s.%(ext)s")]
    if format_selector:
        cmd += ["-f", format_selector]
    if cookies_path:
        cmd += ["--cookies", cookies_path]
    # "--" ends option parsing: a URL beginning with "-" must never be
    # interpreted as a yt-dlp option.
    cmd += ["--", url]

    try:
        run_capture(cmd)
    except RuntimeError as e:
        msg = str(e)
        if "Failed to resolve" in msg or "Name or service not known" in msg:
            raise RuntimeError("Kein DNS/Internet im Space: URL-Download nicht möglich. Bitte Videodatei hochladen oder in einer Umgebung mit Internet ausführen.") from e
        raise

    files = [p for p in target_dir.glob("*") if p.is_file()]
    new_files = [p for p in files if p not in preexisting]
    # Fall back to every file when nothing new appeared (e.g. the target
    # already existed and yt-dlp skipped the download).
    candidates = new_files or files
    if not candidates:
        raise FileNotFoundError("Download fehlgeschlagen — keine Datei gefunden.")
    return str(max(candidates, key=lambda p: p.stat().st_mtime))

def extract_audio_ffmpeg(video_path, out_wav):
    """Extract the audio track of *video_path* into *out_wav* using ``ffmpeg``.

    The output is written as WAV with one channel at 16000 Hz; an existing
    *out_wav* is overwritten (``-y``).

    Returns:
        *out_wav*, unchanged.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
            The exception's ``stderr`` attribute holds the last 1000
            characters of ffmpeg's error output.
    """
    cmd = [
        "ffmpeg", "-nostdin", "-y", "-loglevel", "error",
        "-i", video_path,
        "-vn", "-ac", "1", "-ar", "16000", "-f", "wav", out_wav,
    ]
    # stdin is detached so ffmpeg can neither block on nor consume the parent
    # process's input; stderr is captured so failures remain diagnosable.
    proc = subprocess.run(
        cmd,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        text=True,
        errors="replace",
    )
    if proc.returncode != 0:
        raise subprocess.CalledProcessError(
            proc.returncode, cmd, stderr=(proc.stderr or "")[-1000:]
        )
    return out_wav

def seconds_to_timestamp(s):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds first and then split into
    fields, so a fraction that rounds up carries into the seconds (and
    minutes/hours) instead of producing a four-digit millisecond field.
    Negative inputs are clamped to zero.
    """
    total_ms = max(0, int(round(s * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"

def format_timestamp_vtt(s):
    """Format a duration in seconds as a WebVTT timestamp ``HH:MM:SS.mmm``.

    The value is rounded to whole milliseconds first and then split into
    fields, so a fraction that rounds up carries into the seconds (and
    minutes/hours) instead of producing a four-digit millisecond field.
    Negative inputs are clamped to zero.
    """
    total_ms = max(0, int(round(s * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{ms:03d}"

def segments_to_srt(segments):
    """Render transcript segments as SubRip (SRT) text.

    Each segment is a mapping with ``start``, ``end`` (seconds) and ``text``.
    Cues are numbered from 1 and separated by a blank line.
    """
    def render_cue(number, segment):
        # One cue: index line, time range line, stripped text line.
        begin = seconds_to_timestamp(segment['start'])
        finish = seconds_to_timestamp(segment['end'])
        caption = segment['text'].strip()
        return f"{number}\n{begin} --> {finish}\n{caption}\n"

    return "\n".join(
        render_cue(number, segment)
        for number, segment in enumerate(segments, start=1)
    )

def segments_to_vtt(segments):
    """Render transcript segments as WebVTT text.

    The output starts with the ``WEBVTT`` header; each segment (a mapping
    with ``start``, ``end`` in seconds and ``text``) becomes one cue, with
    blank lines between the header and the cues.
    """
    cues = (
        f"{format_timestamp_vtt(segment['start'])} --> "
        f"{format_timestamp_vtt(segment['end'])}\n"
        f"{segment['text'].strip()}\n"
        for segment in segments
    )
    return "\n".join(["WEBVTT\n", *cues])

def segments_to_txt(segments):
    """Render transcript segments as plain text, one line per segment.

    Every line has the form ``[HH:MM:SS,mmm] text`` where the timestamp is
    the segment's start time and the text is stripped of outer whitespace.
    """
    lines = []
    for segment in segments:
        stamp = seconds_to_timestamp(segment['start'])
        lines.append(f"[{stamp}] {segment['text'].strip()}")
    return "\n".join(lines)

def segments_to_json(segments, language=None, metadata=None):
    """Serialize segments (plus language and optional metadata) as JSON text.

    The document has the keys ``language`` and ``segments``; ``metadata`` is
    added only when a truthy value is given. Output is indented by two spaces
    and keeps non-ASCII characters unescaped.
    """
    payload = dict(language=language, segments=segments)
    if metadata:
        payload["metadata"] = metadata
    return json.dumps(payload, indent=2, ensure_ascii=False)

def _resolve_uploaded_video(file_obj, workdir):
    """Return a filesystem path for an upload, copying streams into *workdir*."""
    if isinstance(file_obj, str):
        if not os.path.exists(file_obj):
            raise FileNotFoundError(f"Datei nicht gefunden: {file_obj}")
        return file_obj
    name = getattr(file_obj, "name", None)
    if not hasattr(file_obj, "read"):
        # Some upload wrappers only expose the path of an on-disk file.
        if isinstance(name, str) and os.path.exists(name):
            return name
        raise TypeError("Ungültiges Upload-Objekt: weder Pfad noch lesbare Datei.")
    filename = Path(name).name if isinstance(name, str) else ""
    target = Path(workdir) / (filename or "upload")
    with open(target, "wb") as dst:
        # Stream in chunks so large videos are not held in memory at once.
        shutil.copyfileobj(file_obj, dst)
    return str(target)


def transcribe_pipeline(file_obj, url, model_size, keep_video=False, cookies_file=None, format_selector=None):
    """Transcribe a video (downloaded from *url* or uploaded) with Whisper.

    Args:
        file_obj: Uploaded video, either a path string or a readable binary
            file object; only used when *url* is empty.
        url: Video URL to download with yt-dlp; takes priority over *file_obj*.
        model_size: Name handed to ``whisper.load_model``.
        keep_video: When false, a video downloaded from *url* is deleted after
            transcription.
        cookies_file: Optional cookies file path for the download; ignored if
            the path does not exist.
        format_selector: Optional yt-dlp format selector.

    Returns:
        A 6-tuple ``(text, srt_path, vtt_path, txt_path, json_path, info)``.
        On any failure the first element is an error message and the other
        five are ``None``; exceptions are not propagated.
    """
    if whisper is None:
        return "Fehler: whisper ist nicht installiert.", None, None, None, None, None
    # Validate before creating the working directory so nothing is left behind.
    if not url and not file_obj:
        return "Kein Video angegeben.", None, None, None, None, None

    tmpdir = tempfile.mkdtemp(prefix="whisper_space_")
    try:
        if url:
            cookies_path = cookies_file if cookies_file and os.path.exists(cookies_file) else None
            video_path = download_video_with_ytdlp(url, tmpdir, cookies_path=cookies_path, format_selector=format_selector)
        else:
            video_path = _resolve_uploaded_video(file_obj, tmpdir)

        audio_wav = str(Path(tmpdir) / "audio.wav")
        extract_audio_ffmpeg(video_path, audio_wav)
        try:
            model = whisper.load_model(model_size)
            result = model.transcribe(audio_wav, verbose=False)
        finally:
            # The extracted audio is only an intermediate artifact.
            try:
                os.remove(audio_wav)
            except OSError:
                pass

        segments = result.get("segments", [])
        language = result.get("language", "unknown")
        outputs = {
            "srt": segments_to_srt(segments),
            "vtt": segments_to_vtt(segments),
            "txt": segments_to_txt(segments),
            "json": segments_to_json(segments, language, {"model": model_size}),
        }
        base = Path(video_path).stem
        files = {}
        for ext, content in outputs.items():
            p = Path(tmpdir) / f"{base}.{ext}"
            p.write_text(content, encoding="utf-8")
            files[ext] = str(p)

        if not keep_video and url:
            # Best effort: a leftover video must not fail a finished transcript.
            try:
                os.remove(video_path)
            except OSError:
                pass
        return outputs["txt"], files["srt"], files["vtt"], files["txt"], files["json"], f"Model: {model_size}, Sprache: {language}"
    except Exception as e:
        # Nothing in the working directory is handed out on failure, so
        # remove it instead of leaking one directory per failed request.
        shutil.rmtree(tmpdir, ignore_errors=True)
        return f"Fehler: {e}", None, None, None, None, None

# Gradio UI: inputs on the left, transcript and download files on the right.
with gr.Blocks() as demo:
    gr.Markdown("# Video → Whisper Transkript (SRT/TXT/VTT/JSON)")
    with gr.Row():
        with gr.Column():
            url_in = gr.Textbox(label="Video URL", placeholder="https://...")
            file_in = gr.File(label="Oder Videodatei hochladen")
            cookies_in = gr.File(label="Cookies.txt (optional)")
            fmt_in = gr.Textbox(label="Format (optional, yt-dlp -f)")
            model_sel = gr.Radio(["tiny", "base", "small", "medium", "large"], value="small", label="Whisper-Modell")
            keep_chk = gr.Checkbox(label="Video behalten", value=False)
            btn = gr.Button("Transkribieren")
            status = gr.Textbox(label="Status")
        with gr.Column():
            transcript = gr.Textbox(label="Transkript", lines=20)
            # Download slots stay hidden until a file is available.
            srt_dl = gr.File(label="SRT", visible=False)
            vtt_dl = gr.File(label="VTT", visible=False)
            txt_dl = gr.File(label="TXT", visible=False)
            json_dl = gr.File(label="JSON", visible=False)

    def run_transcribe(f, u, m, k, c, fmt):
        """Click handler: normalize the inputs, run the pipeline, update the UI.

        Parameters follow the order of the inputs wired to the button: video
        file, URL, model name, keep-video flag, cookies file, format selector.
        Returns the transcript text, one update per download slot (shown only
        when a file path was produced) and the status text.
        """
        # Pasted URLs often carry surrounding whitespace; a blank field must
        # not take priority over an uploaded file.
        url = (u or "").strip()
        selector = (fmt or "").strip() or None
        # The cookies upload may arrive as a path string or as a file wrapper
        # exposing the path via ``.name``.
        cookie_candidate = c if isinstance(c, str) else getattr(c, "name", None)
        cookies_path = cookie_candidate if isinstance(cookie_candidate, str) and os.path.exists(cookie_candidate) else None
        display, srtf, vttf, txtf, jsonf, meta = transcribe_pipeline(f, url, m, k, cookies_file=cookies_path, format_selector=selector)
        return display, gr.update(value=srtf, visible=bool(srtf)), gr.update(value=vttf, visible=bool(vttf)), gr.update(value=txtf, visible=bool(txtf)), gr.update(value=jsonf, visible=bool(jsonf)), meta

    btn.click(run_transcribe, [file_in, url_in, model_sel, keep_chk, cookies_in, fmt_in], [transcript, srt_dl, vtt_dl, txt_dl, json_dl, status])

if __name__ == "__main__":
    # Listen on all interfaces; the port comes from PORT, defaulting to 7860.
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))