neuralworm committed on
Commit
6c588c4
·
verified ·
1 Parent(s): d926f18

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +153 -84
app.py CHANGED
@@ -1,89 +1,158 @@
1
  #!/usr/bin/env python3
2
-
3
- -- coding: utf-8 --
4
-
5
- """ Hugging Face Space (Gradio) App: Video -> Audio -> Whisper Transkript (+ Downloads SRT/TXT/VTT/JSON) """ import os import subprocess import tempfile import json from pathlib import Path from datetime import timedelta
6
-
 
 
 
 
 
 
 
7
  import gradio as gr
8
 
9
- try: import whisper except Exception: whisper = None
10
-
11
- def run(cmd, hide_output=False): if hide_output: subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) else: subprocess.run(cmd, check=True)
12
-
13
- def download_video_with_ytdlp(url: str, out_dir: str) -> str: out_template = str(Path(out_dir) / "%(title)s.%(ext)s") cmd = ["yt-dlp", "-f", "best", "-o", out_template, url] run(cmd) files = sorted(Path(out_dir).glob("*"), key=lambda p: p.stat().st_mtime, reverse=True) if not files: raise FileNotFoundError("Download fehlgeschlagen — keine Datei gefunden.") return str(files[0])
14
-
15
- def extract_audio_ffmpeg(video_path: str, out_wav: str): cmd = [ "ffmpeg", "-y", "-i", video_path, "-vn", "-ac", "1", "-ar", "16000", "-f", "wav", out_wav ] run(cmd, hide_output=True) return out_wav
16
-
17
- def seconds_to_timestamp(s: float) -> str: hours = int(s // 3600) minutes = int((s % 3600) // 60) seconds = int(s % 60) ms = int(round((s - int(s)) * 1000)) return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
18
-
19
- def format_timestamp_vtt(s: float) -> str: hours = int(s // 3600) minutes = int((s % 3600) // 60) seconds = int(s % 60) ms = int(round((s - int(s)) * 1000)) return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{ms:03d}"
20
-
21
- def segments_to_srt(segments): out = [] for i, seg in enumerate(segments, start=1): start = seconds_to_timestamp(seg['start']) end = seconds_to_timestamp(seg['end']) text = seg['text'].strip() out.append(f"{i}\n{start} --> {end}\n{text}\n") return "\n".join(out)
22
-
23
- def segments_to_vtt(segments): out = ["WEBVTT\n"] for seg in segments: start = format_timestamp_vtt(seg['start']) end = format_timestamp_vtt(seg['end']) text = seg['text'].strip() out.append(f"{start} --> {end}\n{text}\n") return "\n".join(out)
24
-
25
- def segments_to_txt(segments): return "\n".join([f"[{seconds_to_timestamp(seg['start'])}] {seg['text'].strip()}" for seg in segments])
26
-
27
- def segments_to_json(segments, language=None, metadata=None): data = {"language": language, "segments": segments} if metadata: data["metadata"] = metadata return json.dumps(data, ensure_ascii=False, indent=2)
28
-
29
- def transcribe_pipeline(file_obj, url, model_size, keep_video=False): if whisper is None: return "Fehler: whisper ist nicht installiert.", None, None, None, None, None
30
-
31
- tmpdir = tempfile.mkdtemp(prefix="whisper_space_")
32
  try:
33
- if url:
34
- video_path = download_video_with_ytdlp(url, tmpdir)
35
- elif file_obj:
36
- if isinstance(file_obj, str) and os.path.exists(file_obj):
37
- video_path = file_obj
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  else:
39
- uploaded_path = Path(tmpdir) / Path(getattr(file_obj, "name", "upload")).name
40
- with open(uploaded_path, "wb") as f:
41
- f.write(file_obj.read())
42
- video_path = str(uploaded_path)
43
- else:
44
- return "Kein Video angegeben.", None, None, None, None, None
45
-
46
- audio_wav = str(Path(tmpdir) / "audio.wav")
47
- extract_audio_ffmpeg(video_path, audio_wav)
48
-
49
- model = whisper.load_model(model_size)
50
- result = model.transcribe(audio_wav, verbose=False)
51
- segments = result.get("segments", [])
52
- language = result.get("language", "unknown")
53
-
54
- srt_text = segments_to_srt(segments)
55
- vtt_text = segments_to_vtt(segments)
56
- txt_text = segments_to_txt(segments)
57
- json_text = segments_to_json(segments, language, {"model": model_size})
58
-
59
- out_files = {}
60
- base = Path(video_path).stem
61
- for ext, content in {"srt": srt_text, "vtt": vtt_text, "txt": txt_text, "json": json_text}.items():
62
- p = Path(tmpdir) / f"{base}.{ext}"
63
- p.write_text(content, encoding="utf-8")
64
- out_files[ext] = str(p)
65
-
66
- display_text = txt_text
67
-
68
- if not keep_video and url:
69
- try:
70
- os.remove(video_path)
71
- except Exception:
72
- pass
73
-
74
- return display_text, out_files["srt"], out_files["vtt"], out_files["txt"], out_files["json"], f"Model: {model_size}, Sprache: {language}"
75
- except Exception as e:
76
- return f"Fehler: {e}", None, None, None, None, None
77
- finally:
78
- pass
79
-
80
- with gr.Blocks() as demo: gr.Markdown("# Video → Whisper Transkript (SRT/TXT/VTT/JSON)") with gr.Row(): with gr.Column(): url_in = gr.Textbox(label="Video URL", placeholder="https://...") file_in = gr.File(label="Oder Videodatei hochladen") model_sel = gr.Radio(["tiny", "base", "small", "medium", "large"], value="small", label="Whisper-Modell") keep_chk = gr.Checkbox(label="Video behalten", value=False) btn = gr.Button("Transkribieren") status = gr.Textbox(label="Status") with gr.Column(): transcript = gr.Textbox(label="Transkript mit Zeitmarken", lines=20) srt_dl = gr.File(label="SRT", visible=False) vtt_dl = gr.File(label="VTT", visible=False) txt_dl = gr.File(label="TXT", visible=False) json_dl = gr.File(label="JSON", visible=False)
81
-
82
- def run_transcribe(f, u, m, k):
83
- display, srtf, vttf, txtf, jsonf, meta = transcribe_pipeline(f, u, m, k)
84
- return display, gr.update(value=srtf, visible=bool(srtf)), gr.update(value=vttf, visible=bool(vttf)), gr.update(value=txtf, visible=bool(txtf)), gr.update(value=jsonf, visible=bool(jsonf)), meta
85
-
86
- btn.click(run_transcribe, [file_in, url_in, model_sel, keep_chk], [transcript, srt_dl, vtt_dl, txt_dl, json_dl, status])
87
-
88
- if name == "main": demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
89
-
 
 
 
 
1
  #!/usr/bin/env python3
2
+ # coding: utf-8
3
+ """
4
+ Hugging Face Space (Gradio) App: Video -> Audio -> Whisper Transkript (+ Downloads SRT/TXT/VTT/JSON)
5
+
6
+ Hinweis: Verwende diese App nur für eigene oder freigegebene Inhalte.
7
+ """
8
+ import os
9
+ import subprocess
10
+ import tempfile
11
+ import json
12
+ from pathlib import Path
13
+ from datetime import timedelta
14
  import gradio as gr
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  try:
17
+ import whisper
18
+ except Exception:
19
+ whisper = None
20
+
21
+ def run_capture(cmd):
22
+ result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
23
+ if result.returncode != 0:
24
+ err_tail = result.stderr[-1000:] if result.stderr else ""
25
+ raise RuntimeError(f"Command failed: {' '.join(cmd)}\n{err_tail}")
26
+ return result.stdout
27
+
28
+ def download_video_with_ytdlp(url, out_dir, cookies_path=None, format_selector=None):
29
+ out_template = str(Path(out_dir) / "%(title)s.%(ext)s")
30
+ cmd = ["yt-dlp", "-o", out_template]
31
+ if format_selector:
32
+ cmd += ["-f", format_selector]
33
+ if cookies_path:
34
+ cmd += ["--cookies", cookies_path]
35
+ cmd.append(url)
36
+ run_capture(cmd)
37
+ files = sorted(Path(out_dir).glob("*"), key=lambda p: p.stat().st_mtime, reverse=True)
38
+ if not files:
39
+ raise FileNotFoundError("Download fehlgeschlagen — keine Datei gefunden.")
40
+ return str(files[0])
41
+
42
+ def extract_audio_ffmpeg(video_path, out_wav):
43
+ cmd = ["ffmpeg", "-y", "-i", video_path, "-vn", "-ac", "1", "-ar", "16000", "-f", "wav", out_wav]
44
+ subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
45
+ return out_wav
46
+
47
+ def seconds_to_timestamp(s):
48
+ hours = int(s // 3600)
49
+ minutes = int((s % 3600) // 60)
50
+ seconds = int(s % 60)
51
+ ms = int(round((s - int(s)) * 1000))
52
+ return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
53
+
54
+ def format_timestamp_vtt(s):
55
+ hours = int(s // 3600)
56
+ minutes = int((s % 3600) // 60)
57
+ seconds = int(s % 60)
58
+ ms = int(round((s - int(s)) * 1000))
59
+ return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{ms:03d}"
60
+
61
+ def segments_to_srt(segments):
62
+ parts = []
63
+ for i, seg in enumerate(segments, start=1):
64
+ start = seconds_to_timestamp(seg['start'])
65
+ end = seconds_to_timestamp(seg['end'])
66
+ text = seg['text'].strip()
67
+ parts.append(f"{i}\n{start} --> {end}\n{text}\n")
68
+ return "\n".join(parts)
69
+
70
+ def segments_to_vtt(segments):
71
+ parts = ["WEBVTT\n"]
72
+ for seg in segments:
73
+ start = format_timestamp_vtt(seg['start'])
74
+ end = format_timestamp_vtt(seg['end'])
75
+ text = seg['text'].strip()
76
+ parts.append(f"{start} --> {end}\n{text}\n")
77
+ return "\n".join(parts)
78
+
79
+ def segments_to_txt(segments):
80
+ return "\n".join([f"[{seconds_to_timestamp(seg['start'])}] {seg['text'].strip()}" for seg in segments])
81
+
82
+ def segments_to_json(segments, language=None, metadata=None):
83
+ data = {"language": language, "segments": segments}
84
+ if metadata:
85
+ data["metadata"] = metadata
86
+ return json.dumps(data, ensure_ascii=False, indent=2)
87
+
88
+ def transcribe_pipeline(file_obj, url, model_size, keep_video=False, cookies_file=None, format_selector=None):
89
+ if whisper is None:
90
+ return "Fehler: whisper ist nicht installiert.", None, None, None, None, None
91
+ tmpdir = tempfile.mkdtemp(prefix="whisper_space_")
92
+ try:
93
+ if url:
94
+ cookies_path = cookies_file if cookies_file and os.path.exists(cookies_file) else None
95
+ video_path = download_video_with_ytdlp(url, tmpdir, cookies_path=cookies_path, format_selector=format_selector)
96
+ elif file_obj:
97
+ if isinstance(file_obj, str) and os.path.exists(file_obj):
98
+ video_path = file_obj
99
+ else:
100
+ uploaded_path = Path(tmpdir) / Path(getattr(file_obj, "name", "upload")).name
101
+ with open(uploaded_path, "wb") as f:
102
+ f.write(file_obj.read())
103
+ video_path = str(uploaded_path)
104
  else:
105
+ return "Kein Video angegeben.", None, None, None, None, None
106
+ audio_wav = str(Path(tmpdir) / "audio.wav")
107
+ extract_audio_ffmpeg(video_path, audio_wav)
108
+ model = whisper.load_model(model_size)
109
+ result = model.transcribe(audio_wav, verbose=False)
110
+ segments = result.get("segments", [])
111
+ language = result.get("language", "unknown")
112
+ srt_text = segments_to_srt(segments)
113
+ vtt_text = segments_to_vtt(segments)
114
+ txt_text = segments_to_txt(segments)
115
+ json_text = segments_to_json(segments, language, {"model": model_size})
116
+ base = Path(video_path).stem
117
+ files = {}
118
+ for ext, content in {"srt": srt_text, "vtt": vtt_text, "txt": txt_text, "json": json_text}.items():
119
+ p = Path(tmpdir) / f"{base}.{ext}"
120
+ p.write_text(content, encoding="utf-8")
121
+ files[ext] = str(p)
122
+ if not keep_video and url:
123
+ try:
124
+ os.remove(video_path)
125
+ except Exception:
126
+ pass
127
+ return txt_text, files["srt"], files["vtt"], files["txt"], files["json"], f"Model: {model_size}, Sprache: {language}"
128
+ except Exception as e:
129
+ return f"Fehler: {e}", None, None, None, None, None
130
+
131
+ with gr.Blocks() as demo:
132
+ gr.Markdown("# Video → Whisper Transkript (SRT/TXT/VTT/JSON)")
133
+ with gr.Row():
134
+ with gr.Column():
135
+ url_in = gr.Textbox(label="Video URL", placeholder="https://...")
136
+ file_in = gr.File(label="Oder Videodatei hochladen")
137
+ cookies_in = gr.File(label="Cookies.txt (optional)")
138
+ fmt_in = gr.Textbox(label="Format (optional, yt-dlp -f)")
139
+ model_sel = gr.Radio(["tiny", "base", "small", "medium", "large"], value="small", label="Whisper-Modell")
140
+ keep_chk = gr.Checkbox(label="Video behalten", value=False)
141
+ btn = gr.Button("Transkribieren")
142
+ status = gr.Textbox(label="Status")
143
+ with gr.Column():
144
+ transcript = gr.Textbox(label="Transkript", lines=20)
145
+ srt_dl = gr.File(label="SRT", visible=False)
146
+ vtt_dl = gr.File(label="VTT", visible=False)
147
+ txt_dl = gr.File(label="TXT", visible=False)
148
+ json_dl = gr.File(label="JSON", visible=False)
149
+
150
+ def run_transcribe(f, u, m, k, c, fmt):
151
+ cookies_path = c if isinstance(c, str) and os.path.exists(c) else None
152
+ display, srtf, vttf, txtf, jsonf, meta = transcribe_pipeline(f, u, m, k, cookies_file=cookies_path, format_selector=fmt)
153
+ return display, gr.update(value=srtf, visible=bool(srtf)), gr.update(value=vttf, visible=bool(vttf)), gr.update(value=txtf, visible=bool(txtf)), gr.update(value=jsonf, visible=bool(jsonf)), meta
154
+
155
+ btn.click(run_transcribe, [file_in, url_in, model_sel, keep_chk, cookies_in, fmt_in], [transcript, srt_dl, vtt_dl, txt_dl, json_dl, status])
156
+
157
+ if __name__ == "__main__":
158
+ demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))