neuralworm committed on
Commit
a9d392f
·
verified ·
1 Parent(s): 185f270

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +204 -0
app.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Hugging Face Space (Gradio) App: Video -> Audio -> Whisper Transkript (+ Downloads SRT/TXT/VTT/JSON)
5
+
6
+ Rechtlicher Hinweis:
7
+ - Verwende diese App nur für eigene Inhalte oder Inhalte, für die du explizit die Erlaubnis hast.
8
+ - Respektiere Urheberrecht und die Terms of Service der jeweiligen Plattformen.
9
+
10
+ Benötigt:
11
+ - ffmpeg (systemweit)
12
+ - Python-Pakete siehe requirements.txt
13
+ """
14
+ import os
15
+ import subprocess
16
+ import tempfile
17
+ import json
18
+ from pathlib import Path
19
+ from datetime import timedelta
20
+
21
+ import gradio as gr
22
+
23
+ # Versuch, whisper zu importieren (installiert via requirements.txt as git+repo)
24
+ try:
25
+ import whisper
26
+ except Exception as e:
27
+ whisper = None
28
+
29
+ # Hilfsfunktionen ----------------------------------------------------------
30
+
31
def run(cmd, hide_output=False):
    """Execute *cmd* as a subprocess and raise if it exits non-zero.

    Args:
        cmd: Command handed unchanged to ``subprocess.run``.
        hide_output: When true, stdout and stderr are redirected to
            ``subprocess.DEVNULL``; otherwise no redirection is requested.

    Raises:
        subprocess.CalledProcessError: If the command returns a non-zero
            exit status (``check=True``).
    """
    stream_kwargs = {}
    if hide_output:
        stream_kwargs = {
            "stdout": subprocess.DEVNULL,
            "stderr": subprocess.DEVNULL,
        }
    subprocess.run(cmd, check=True, **stream_kwargs)
37
+
38
def download_video_with_ytdlp(url: str, out_dir: str) -> str:
    """Download a video with yt-dlp into *out_dir* and return its path.

    Args:
        url: Address handed to yt-dlp. It is placed after a ``--``
            separator so it is always treated as a positional argument.
        out_dir: Directory used in the yt-dlp output template
            ``%(title)s.%(ext)s``.

    Returns:
        Path (as ``str``) of the most recently modified regular file that
        appeared in *out_dir* during the download. If no new file appeared,
        the most recently modified regular file already in *out_dir* is
        returned instead.

    Raises:
        FileNotFoundError: If *out_dir* contains no regular file afterwards.
        subprocess.CalledProcessError: Propagated from ``run`` when yt-dlp
            exits non-zero.
    """
    target_dir = Path(out_dir)
    out_template = str(target_dir / "%(title)s.%(ext)s")

    # Remember what was already there so stale files are not mistaken for
    # the fresh download.
    existing = set(target_dir.glob("*"))

    # "--" ends option parsing: a user-supplied URL that starts with "-"
    # must not be interpreted as a yt-dlp command-line option.
    cmd = ["yt-dlp", "-f", "best", "-o", out_template, "--", url]
    run(cmd)

    # Only regular files qualify; subdirectories are never a download result.
    files = [p for p in target_dir.glob("*") if p.is_file()]
    new_files = [p for p in files if p not in existing]
    candidates = new_files or files
    if not candidates:
        raise FileNotFoundError("Download erfolglos — keine Datei gefunden.")
    return str(max(candidates, key=lambda p: p.stat().st_mtime))
48
+
49
def extract_audio_ffmpeg(video_path: str, out_wav: str):
    """Write the audio of *video_path* to *out_wav* by invoking ffmpeg.

    The ffmpeg arguments request no video (``-vn``), one audio channel
    (``-ac 1``), a 16000 Hz sample rate (``-ar 16000``) and the WAV
    container (``-f wav``); ``-y`` overwrites an existing output file.
    ffmpeg's own output is hidden.

    Returns:
        *out_wav*, unchanged.
    """
    input_args = ["-i", video_path]
    audio_args = ["-vn", "-ac", "1", "-ar", "16000", "-f", "wav"]
    run(["ffmpeg", "-y", *input_args, *audio_args, out_wav], hide_output=True)
    return out_wav
63
+
64
def seconds_to_timestamp(s: float, always_ms: bool = True) -> str:
    """Convert a duration in seconds to the SRT form ``HH:MM:SS,mmm``.

    Args:
        s: Offset in seconds. Negative values are clamped to zero.
        always_ms: Accepted for backward compatibility; it has no effect,
            the millisecond field is always written.

    Returns:
        The timestamp with a comma before the three-digit millisecond field.
    """
    # Round once on the total millisecond count so that e.g. 1.9996 s carries
    # into the seconds field instead of yielding an invalid ",1000".
    total_ms = max(0, int(round(float(s) * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
76
+
77
def format_timestamp_vtt(s: float) -> str:
    """Convert a duration in seconds to the WebVTT form ``HH:MM:SS.mmm``.

    Args:
        s: Offset in seconds. Negative values are clamped to zero.

    Returns:
        The timestamp with a dot before the three-digit millisecond field.
    """
    # Round once on the total millisecond count so a fractional part that
    # rounds up to a full second carries over instead of printing ".1000".
    total_ms = max(0, int(round(float(s) * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{ms:03d}"
85
+
86
def segments_to_srt(segments):
    """Render segments as an SRT document string.

    Every segment is read through its ``'start'``, ``'end'`` and ``'text'``
    keys. Cues are numbered from 1 and separated by a blank line.
    """
    def _cue(index, segment):
        # One SRT cue: counter line, time range line, stripped text line.
        begin = seconds_to_timestamp(segment['start'])
        finish = seconds_to_timestamp(segment['end'])
        body = segment['text'].strip()
        return f"{index}\n{begin} --> {finish}\n{body}\n"

    return "\n".join(_cue(n, seg) for n, seg in enumerate(segments, start=1))
95
+
96
def segments_to_vtt(segments):
    """Render segments as a WebVTT document string.

    The output starts with the ``WEBVTT`` header followed by one cue per
    segment, built from the segment's ``'start'``, ``'end'`` and ``'text'``
    keys; blocks are separated by a blank line.
    """
    def _cue(segment):
        # Time range line followed by the stripped text line.
        begin = format_timestamp_vtt(segment['start'])
        finish = format_timestamp_vtt(segment['end'])
        body = segment['text'].strip()
        return f"{begin} --> {finish}\n{body}\n"

    return "\n".join(["WEBVTT\n", *(_cue(seg) for seg in segments)])
105
+
106
def segments_to_txt(segments):
    """Render segments as plain text, one ``[timestamp] text`` line each.

    The timestamp comes from the segment's ``'start'`` key and the text from
    its stripped ``'text'`` key; lines are joined with newlines.
    """
    return "\n".join(
        f"[{seconds_to_timestamp(seg['start'])}] {seg['text'].strip()}"
        for seg in segments
    )
114
+
115
def segments_to_json(segments, language=None, metadata=None):
    """Serialize segments, language and optional metadata to a JSON string.

    Args:
        segments: Value stored under the ``"segments"`` key.
        language: Value stored under the ``"language"`` key.
        metadata: Stored under ``"metadata"`` only when truthy.

    Returns:
        JSON text indented by two spaces, with non-ASCII characters kept
        as-is rather than escaped.
    """
    extra = {"metadata": metadata} if metadata else {}
    payload = {"language": language, "segments": segments, **extra}
    return json.dumps(payload, indent=2, ensure_ascii=False)
123
+
124
+ # Haupt-Workflow ----------------------------------------------------------
125
+
126
+ def transcribe_pipeline(file_obj, url, model_size, keep_video=False):
127
+ """
128
+ file_obj: uploaded file (temp path) or None
129
+ url: optional URL to download via yt-dlp
130
+ model_size: whisper model size
131
+ """
132
+ if whisper is None:
133
+ return "Fehler: lokales whisper nicht verfügbar. Stelle sicher, dass das Repo installiert ist.", None, None, None, None, None
134
+
135
+ tmpdir = tempfile.mkdtemp(prefix="whisper_space_")
136
+ try:
137
+ # 1) Get video path either from uploaded file or by downloading URL
138
+ if url:
139
+ video_path = download_video_with_ytdlp(url, tmpdir)
140
+ elif file_obj:
141
+ # file_obj is a tuple (name, file-like) or a path depending on Gradio version.
142
+ # Gradio typically supplies a filesystem path.
143
+ if isinstance(file_obj, str) and os.path.exists(file_obj):
144
+ video_path = file_obj
145
+ else:
146
+ # try to write content to temp file
147
+ uploaded_path = Path(tmpdir) / Path(getattr(file_obj, "name", "upload")).name
148
+ with open(uploaded_path, "wb") as f:
149
+ # file_obj may be a SpooledTemporaryFile or similar with .read()
150
+ f.write(file_obj.read())
151
+ video_path = str(uploaded_path)
152
+ else:
153
+ return "Kein Video angegeben (weder Datei noch URL).", None, None, None, None, None
154
+
155
+ # 2) Extract audio
156
+ audio_wav = str(Path(tmpdir) / "audio.wav")
157
+ extract_audio_ffmpeg(video_path, audio_wav)
158
+
159
+ # 3) Load whisper model and transcribe
160
+ model = whisper.load_model(model_size)
161
+ # transcribe: get segments to generate SRT/VTT etc.
162
+ result = model.transcribe(audio_wav, verbose=False)
163
+ segments = result.get("segments", [])
164
+ language = result.get("language", None)
165
+
166
+ # 4) Create output strings
167
+ srt_text = segments_to_srt(segments)
168
+ vtt_text = segments_to_vtt(segments)
169
+ txt_text = segments_to_txt(segments)
170
+ json_text = segments_to_json(segments, language=language, metadata={"model": model_size})
171
+
172
+ # 5) Save files to tmpdir for download via Gradio
173
+ out_files = {}
174
+ base_name = Path(video_path).stem
175
+ files_map = {
176
+ f"{base_name}.srt": srt_text,
177
+ f"{base_name}.vtt": vtt_text,
178
+ f"{base_name}.txt": txt_text,
179
+ f"{base_name}.json": json_text
180
+ }
181
+ for fname, content in files_map.items():
182
+ path = Path(tmpdir) / fname
183
+ path.write_text(content, encoding="utf-8")
184
+ out_files[fname] = str(path)
185
+
186
+ # 6) prepare display text with timestamps for UI (simple combined view)
187
+ display_lines = []
188
+ for seg in segments:
189
+ start = seconds_to_timestamp(seg['start'])
190
+ display_lines.append(f"[{start}] {seg['text'].strip()}")
191
+ display_text = "\n".join(display_lines)
192
+
193
+ # Optionally remove video to save space
194
+ if not keep_video and url:
195
+ try:
196
+ os.remove(video_path)
197
+ except Exception:
198
+ pass
199
+
200
+ return display_text, out_files[f"{base_name}.srt"], out_files[f"{base_name}.vtt"], out_files[f"{base_name}.txt"], out_files[f"{base_name}.json"], f"Model: {model_size}, Language: {language}"
201
+ except Exception as e:
202
+ return f"Fehler während Verarbeitung: {e}", None, None, None, None, None
203
+ finally:
204
+ # Do not delete tmpdir immediately if the user wants to download