neuralworm committed on
Commit
d926f18
·
verified ·
1 Parent(s): 1f568e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -155
app.py CHANGED
@@ -1,161 +1,89 @@
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- Hugging Face Space (Gradio) App: Video -> Audio -> Whisper Transkript (+ Downloads SRT/TXT/VTT/JSON)
5
- """
6
- import os
7
- import subprocess
8
- import tempfile
9
- import json
10
- from pathlib import Path
11
- from datetime import timedelta
12
 
13
  import gradio as gr
14
 
15
- try:
16
- import whisper
17
- except Exception:
18
- whisper = None
19
 
20
def run(cmd, hide_output=False):
    """Execute *cmd* as a subprocess; raise CalledProcessError on non-zero exit.

    With hide_output=True the child's stdout and stderr are discarded.
    """
    sink = subprocess.DEVNULL if hide_output else None
    subprocess.run(cmd, check=True, stdout=sink, stderr=sink)
25
-
26
def download_video_with_ytdlp(url: str, out_dir: str) -> str:
    """Download *url* into *out_dir* via yt-dlp and return the newest file's path.

    Raises FileNotFoundError if yt-dlp produced no file, and
    subprocess.CalledProcessError if yt-dlp itself failed.
    """
    out_template = str(Path(out_dir) / "%(title)s.%(ext)s")
    cmd = ["yt-dlp", "-f", "best", "-o", out_template, url]
    run(cmd)
    # Pick the most recently modified *regular file*: glob("*") would otherwise
    # also match directories or other non-file artifacts left in out_dir.
    files = sorted(
        (p for p in Path(out_dir).glob("*") if p.is_file()),
        key=lambda p: p.stat().st_mtime,
        reverse=True,
    )
    if not files:
        raise FileNotFoundError("Download fehlgeschlagen keine Datei gefunden.")
    return str(files[0])
34
-
35
def extract_audio_ffmpeg(video_path: str, out_wav: str):
    """Extract the audio track of *video_path* into *out_wav* and return the path.

    Output is a mono 16 kHz WAV — the format the downstream Whisper step reads.
    """
    ffmpeg_cmd = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-vn",           # drop the video stream
        "-ac", "1",      # mono
        "-ar", "16000",  # 16 kHz sample rate
        "-f", "wav",
        out_wav,
    ]
    run(ffmpeg_cmd, hide_output=True)
    return out_wav
42
-
43
def seconds_to_timestamp(s: float) -> str:
    """Format *s* seconds as an SRT timestamp: ``HH:MM:SS,mmm``.

    Rounds to whole milliseconds *before* splitting into fields. The previous
    code rounded the fraction separately, so an input like 1.9996 produced the
    invalid timestamp "00:00:01,1000" (millisecond field out of range).
    """
    total_ms = int(round(s * 1000))
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    seconds, ms = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
49
-
50
def format_timestamp_vtt(s: float) -> str:
    """Format *s* seconds as a WebVTT timestamp: ``HH:MM:SS.mmm``.

    Rounds to whole milliseconds *before* splitting into fields, so inputs
    like 1.9996 carry into the seconds field instead of yielding an
    out-of-range millisecond value ("00:00:01.1000" in the previous code).
    """
    total_ms = int(round(s * 1000))
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    seconds, ms = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{ms:03d}"
56
-
57
def segments_to_srt(segments):
    """Render Whisper segments as an SRT document (index, time range, text)."""
    cues = []
    for index, segment in enumerate(segments, start=1):
        cue = (
            f"{index}\n"
            f"{seconds_to_timestamp(segment['start'])} --> {seconds_to_timestamp(segment['end'])}\n"
            f"{segment['text'].strip()}\n"
        )
        cues.append(cue)
    return "\n".join(cues)
65
-
66
def segments_to_vtt(segments):
    """Render Whisper segments as a WebVTT document (header plus cues)."""
    cues = ["WEBVTT\n"]
    cues.extend(
        f"{format_timestamp_vtt(segment['start'])} --> {format_timestamp_vtt(segment['end'])}\n"
        f"{segment['text'].strip()}\n"
        for segment in segments
    )
    return "\n".join(cues)
74
-
75
def segments_to_txt(segments):
    """Render segments as plain text: one "[timestamp] text" line per segment."""
    lines = []
    for segment in segments:
        lines.append(f"[{seconds_to_timestamp(segment['start'])}] {segment['text'].strip()}")
    return "\n".join(lines)
77
-
78
def segments_to_json(segments, language=None, metadata=None):
    """Serialize segments (plus detected language and optional metadata) to pretty JSON."""
    payload = {"language": language, "segments": segments}
    if metadata:
        # Only emit the key when there is something to record.
        payload["metadata"] = metadata
    return json.dumps(payload, ensure_ascii=False, indent=2)
83
-
84
def transcribe_pipeline(file_obj, url, model_size, keep_video=False):
    """Run the full video -> audio -> Whisper transcription pipeline.

    Returns a 6-tuple: (display text, srt path, vtt path, txt path, json path,
    status string). On any failure the first element is an error message and
    the four file slots are None, so the Gradio handler can render it directly.

    Removed the previous dead ``finally: pass`` clause; everything else is the
    same control flow, now documented.
    """
    if whisper is None:
        return "Fehler: whisper ist nicht installiert.", None, None, None, None, None

    # Working directory for the downloaded video, extracted audio and exports.
    # Deliberately NOT deleted here: the export files returned to the UI live
    # inside it.
    tmpdir = tempfile.mkdtemp(prefix="whisper_space_")
    try:
        if url:
            video_path = download_video_with_ytdlp(url, tmpdir)
        elif file_obj:
            if isinstance(file_obj, str) and os.path.exists(file_obj):
                # Gradio may hand us a plain filesystem path to the upload.
                video_path = file_obj
            else:
                # File-like upload: persist its bytes into the working dir.
                uploaded_path = Path(tmpdir) / Path(getattr(file_obj, "name", "upload")).name
                with open(uploaded_path, "wb") as f:
                    f.write(file_obj.read())
                video_path = str(uploaded_path)
        else:
            return "Kein Video angegeben.", None, None, None, None, None

        audio_wav = str(Path(tmpdir) / "audio.wav")
        extract_audio_ffmpeg(video_path, audio_wav)

        model = whisper.load_model(model_size)
        result = model.transcribe(audio_wav, verbose=False)
        segments = result.get("segments", [])
        language = result.get("language", "unknown")

        # Render the four export formats from the same segment list.
        srt_text = segments_to_srt(segments)
        vtt_text = segments_to_vtt(segments)
        txt_text = segments_to_txt(segments)
        json_text = segments_to_json(segments, language, {"model": model_size})

        out_files = {}
        base = Path(video_path).stem
        for ext, content in {"srt": srt_text, "vtt": vtt_text, "txt": txt_text, "json": json_text}.items():
            p = Path(tmpdir) / f"{base}.{ext}"
            p.write_text(content, encoding="utf-8")
            out_files[ext] = str(p)

        display_text = txt_text

        # Best effort: drop the downloaded video unless the user asked to keep
        # it. Only URL downloads are removed — never a user's uploaded file.
        if not keep_video and url:
            try:
                os.remove(video_path)
            except Exception:
                pass

        return display_text, out_files["srt"], out_files["vtt"], out_files["txt"], out_files["json"], f"Model: {model_size}, Sprache: {language}"
    except Exception as e:
        # UI boundary: surface any failure as a message instead of crashing.
        return f"Fehler: {e}", None, None, None, None, None
136
-
137
# Gradio UI: inputs (URL or upload, model choice) on the left column,
# transcript text and per-format download slots on the right.
with gr.Blocks() as demo:
    gr.Markdown("# Video Whisper Transkript (SRT/TXT/VTT/JSON)")
    with gr.Row():
        with gr.Column():
            url_in = gr.Textbox(label="Video URL", placeholder="https://...")
            file_in = gr.File(label="Oder Videodatei hochladen")
            model_sel = gr.Radio(["tiny", "base", "small", "medium", "large"], value="small", label="Whisper-Modell")
            keep_chk = gr.Checkbox(label="Video behalten", value=False)
            btn = gr.Button("Transkribieren")
            status = gr.Textbox(label="Status")
        with gr.Column():
            transcript = gr.Textbox(label="Transkript mit Zeitmarken", lines=20)
            # Download widgets start hidden; run_transcribe reveals each one
            # only when its file was actually produced.
            srt_dl = gr.File(label="SRT", visible=False)
            vtt_dl = gr.File(label="VTT", visible=False)
            txt_dl = gr.File(label="TXT", visible=False)
            json_dl = gr.File(label="JSON", visible=False)

    def run_transcribe(f, u, m, k):
        # Map the pipeline's 6-tuple onto the output widgets; a download slot
        # becomes visible iff its file path is truthy (i.e. the export exists).
        display, srtf, vttf, txtf, jsonf, meta = transcribe_pipeline(f, u, m, k)
        return display, gr.update(value=srtf, visible=bool(srtf)), gr.update(value=vttf, visible=bool(vttf)), gr.update(value=txtf, visible=bool(txtf)), gr.update(value=jsonf, visible=bool(jsonf)), meta

    btn.click(run_transcribe, [file_in, url_in, model_sel, keep_chk], [transcript, srt_dl, vtt_dl, txt_dl, json_dl, status])
159
-
160
# Script entry point: bind to all interfaces and honor the platform-provided
# PORT (default 7860, the Hugging Face Spaces convention).
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
 
1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Hugging Face Space (Gradio) App: Video -> Audio -> Whisper Transkript
(+ Downloads SRT/TXT/VTT/JSON)
"""
# The new revision had the coding declaration mangled to a bare statement
# ("-- coding: utf-8 --") and the docstring fused with the imports; restored
# as a proper comment, docstring and one-import-per-line block.
import json
import os
import subprocess
import tempfile
from datetime import timedelta
from pathlib import Path

import gradio as gr

# Whisper is optional at import time; transcribe_pipeline reports a clear
# error message when it is missing instead of crashing the app at startup.
try:
    import whisper
except Exception:
    whisper = None
 
 
 
10
 
11
def run(cmd, hide_output=False):
    """Execute *cmd* as a subprocess; raise CalledProcessError on non-zero exit.

    With hide_output=True the child's stdout and stderr are discarded.
    (Reconstructed: this definition was collapsed onto a single unparseable line.)
    """
    if hide_output:
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    else:
        subprocess.run(cmd, check=True)
12
+
13
def download_video_with_ytdlp(url: str, out_dir: str) -> str:
    """Download *url* into *out_dir* via yt-dlp and return the newest file's path.

    Raises FileNotFoundError if yt-dlp produced no file.
    (Reconstructed: this definition was collapsed onto a single unparseable line.)
    """
    out_template = str(Path(out_dir) / "%(title)s.%(ext)s")
    cmd = ["yt-dlp", "-f", "best", "-o", out_template, url]
    run(cmd)
    # Pick the most recently modified *regular file*: glob("*") would otherwise
    # also match directories or other non-file artifacts left in out_dir.
    files = sorted(
        (p for p in Path(out_dir).glob("*") if p.is_file()),
        key=lambda p: p.stat().st_mtime,
        reverse=True,
    )
    if not files:
        raise FileNotFoundError("Download fehlgeschlagen — keine Datei gefunden.")
    return str(files[0])
14
+
15
def extract_audio_ffmpeg(video_path: str, out_wav: str):
    """Extract the audio track of *video_path* into *out_wav* and return the path.

    Output is a mono 16 kHz WAV — the format the downstream Whisper step reads.
    (Reconstructed: this definition was collapsed onto a single unparseable line.)
    """
    cmd = [
        "ffmpeg", "-y", "-i", video_path,
        "-vn", "-ac", "1", "-ar", "16000", "-f", "wav", out_wav
    ]
    run(cmd, hide_output=True)
    return out_wav
16
+
17
def seconds_to_timestamp(s: float) -> str:
    """Format *s* seconds as an SRT timestamp: ``HH:MM:SS,mmm``.

    Rounds to whole milliseconds first so inputs like 1.9996 carry into the
    seconds field instead of producing an invalid "...,1000" millisecond part.
    (Reconstructed: this definition was collapsed onto a single unparseable line.)
    """
    total_ms = int(round(s * 1000))
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    seconds, ms = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
18
+
19
def format_timestamp_vtt(s: float) -> str:
    """Format *s* seconds as a WebVTT timestamp: ``HH:MM:SS.mmm``.

    Rounds to whole milliseconds first so inputs like 1.9996 carry into the
    seconds field instead of producing an invalid "....1000" millisecond part.
    (Reconstructed: this definition was collapsed onto a single unparseable line.)
    """
    total_ms = int(round(s * 1000))
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    seconds, ms = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{ms:03d}"
20
+
21
def segments_to_srt(segments):
    """Render Whisper segments as an SRT document (index, time range, text).

    (Reconstructed: this definition was collapsed onto a single unparseable line.)
    """
    out = []
    for i, seg in enumerate(segments, start=1):
        start = seconds_to_timestamp(seg['start'])
        end = seconds_to_timestamp(seg['end'])
        text = seg['text'].strip()
        out.append(f"{i}\n{start} --> {end}\n{text}\n")
    return "\n".join(out)
22
+
23
def segments_to_vtt(segments):
    """Render Whisper segments as a WebVTT document (header plus cues).

    (Reconstructed: this definition was collapsed onto a single unparseable line.)
    """
    out = ["WEBVTT\n"]
    for seg in segments:
        start = format_timestamp_vtt(seg['start'])
        end = format_timestamp_vtt(seg['end'])
        text = seg['text'].strip()
        out.append(f"{start} --> {end}\n{text}\n")
    return "\n".join(out)
24
+
25
def segments_to_txt(segments):
    """Render segments as plain text: one "[timestamp] text" line per segment.

    (Reconstructed: this definition was collapsed onto a single unparseable line.)
    """
    return "\n".join([f"[{seconds_to_timestamp(seg['start'])}] {seg['text'].strip()}" for seg in segments])
26
+
27
def segments_to_json(segments, language=None, metadata=None):
    """Serialize segments (plus language and optional metadata) to pretty JSON.

    (Reconstructed: this definition was collapsed onto a single unparseable line.)
    """
    data = {"language": language, "segments": segments}
    if metadata:
        # Only emit the key when there is something to record.
        data["metadata"] = metadata
    return json.dumps(data, ensure_ascii=False, indent=2)
28
+
29
def transcribe_pipeline(file_obj, url, model_size, keep_video=False):
    """Run the full video -> audio -> Whisper transcription pipeline.

    Returns a 6-tuple: (display text, srt path, vtt path, txt path, json path,
    status string). On any failure the first element is an error message and
    the four file slots are None, so the Gradio handler can render it directly.

    (Reconstructed: the ``def`` line was fused with its first statement and all
    indentation was lost; also dropped the dead ``finally: pass`` clause.)
    """
    if whisper is None:
        return "Fehler: whisper ist nicht installiert.", None, None, None, None, None

    # Working directory for the downloaded video, extracted audio and exports.
    # Deliberately NOT deleted here: the export files returned to the UI live
    # inside it.
    tmpdir = tempfile.mkdtemp(prefix="whisper_space_")
    try:
        if url:
            video_path = download_video_with_ytdlp(url, tmpdir)
        elif file_obj:
            if isinstance(file_obj, str) and os.path.exists(file_obj):
                # Gradio may hand us a plain filesystem path to the upload.
                video_path = file_obj
            else:
                # File-like upload: persist its bytes into the working dir.
                uploaded_path = Path(tmpdir) / Path(getattr(file_obj, "name", "upload")).name
                with open(uploaded_path, "wb") as f:
                    f.write(file_obj.read())
                video_path = str(uploaded_path)
        else:
            return "Kein Video angegeben.", None, None, None, None, None

        audio_wav = str(Path(tmpdir) / "audio.wav")
        extract_audio_ffmpeg(video_path, audio_wav)

        model = whisper.load_model(model_size)
        result = model.transcribe(audio_wav, verbose=False)
        segments = result.get("segments", [])
        language = result.get("language", "unknown")

        # Render the four export formats from the same segment list.
        srt_text = segments_to_srt(segments)
        vtt_text = segments_to_vtt(segments)
        txt_text = segments_to_txt(segments)
        json_text = segments_to_json(segments, language, {"model": model_size})

        out_files = {}
        base = Path(video_path).stem
        for ext, content in {"srt": srt_text, "vtt": vtt_text, "txt": txt_text, "json": json_text}.items():
            p = Path(tmpdir) / f"{base}.{ext}"
            p.write_text(content, encoding="utf-8")
            out_files[ext] = str(p)

        display_text = txt_text

        # Best effort: drop the downloaded video unless the user asked to keep
        # it. Only URL downloads are removed — never a user's uploaded file.
        if not keep_video and url:
            try:
                os.remove(video_path)
            except Exception:
                pass

        return display_text, out_files["srt"], out_files["vtt"], out_files["txt"], out_files["json"], f"Model: {model_size}, Sprache: {language}"
    except Exception as e:
        # UI boundary: surface any failure as a message instead of crashing.
        return f"Fehler: {e}", None, None, None, None, None
79
+
80
# Gradio UI: inputs (URL or upload, model choice) on the left column,
# transcript text and per-format download slots on the right.
# (Reconstructed: the whole Blocks body was collapsed onto one line and the
# handler/wiring lines lost their indentation.)
with gr.Blocks() as demo:
    gr.Markdown("# Video → Whisper Transkript (SRT/TXT/VTT/JSON)")
    with gr.Row():
        with gr.Column():
            url_in = gr.Textbox(label="Video URL", placeholder="https://...")
            file_in = gr.File(label="Oder Videodatei hochladen")
            model_sel = gr.Radio(["tiny", "base", "small", "medium", "large"], value="small", label="Whisper-Modell")
            keep_chk = gr.Checkbox(label="Video behalten", value=False)
            btn = gr.Button("Transkribieren")
            status = gr.Textbox(label="Status")
        with gr.Column():
            transcript = gr.Textbox(label="Transkript mit Zeitmarken", lines=20)
            # Download widgets start hidden; run_transcribe reveals each one
            # only when its file was actually produced.
            srt_dl = gr.File(label="SRT", visible=False)
            vtt_dl = gr.File(label="VTT", visible=False)
            txt_dl = gr.File(label="TXT", visible=False)
            json_dl = gr.File(label="JSON", visible=False)

    def run_transcribe(f, u, m, k):
        # Map the pipeline's 6-tuple onto the output widgets; a download slot
        # becomes visible iff its file path is truthy (i.e. the export exists).
        display, srtf, vttf, txtf, jsonf, meta = transcribe_pipeline(f, u, m, k)
        return display, gr.update(value=srtf, visible=bool(srtf)), gr.update(value=vttf, visible=bool(vttf)), gr.update(value=txtf, visible=bool(txtf)), gr.update(value=jsonf, visible=bool(jsonf)), meta

    btn.click(run_transcribe, [file_in, url_in, model_sel, keep_chk], [transcript, srt_dl, vtt_dl, txt_dl, json_dl, status])
87
+
88
# Script entry point. The mangled revision read `if name == "main":` — the
# dunders were stripped, so the guard would raise NameError (or never match);
# restored to the standard __main__ guard.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
89
+