neuralworm commited on
Commit
1f568e6
·
verified ·
1 Parent(s): 339b063

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -77
app.py CHANGED
@@ -2,14 +2,6 @@
2
  # -*- coding: utf-8 -*-
3
  """
4
  Hugging Face Space (Gradio) App: Video -> Audio -> Whisper Transkript (+ Downloads SRT/TXT/VTT/JSON)
5
-
6
- Rechtlicher Hinweis:
7
- - Verwende diese App nur für eigene Inhalte oder Inhalte, für die du explizit die Erlaubnis hast.
8
- - Respektiere Urheberrecht und die Terms of Service der jeweiligen Plattformen.
9
-
10
- Benötigt:
11
- - ffmpeg (systemweit)
12
- - Python-Pakete siehe requirements.txt
13
  """
14
  import os
15
  import subprocess
@@ -20,60 +12,41 @@ from datetime import timedelta
20
 
21
  import gradio as gr
22
 
23
- # Versuch, whisper zu importieren (installiert via requirements.txt as git+repo)
24
  try:
25
  import whisper
26
- except Exception as e:
27
  whisper = None
28
 
29
- # Hilfsfunktionen ----------------------------------------------------------
30
-
31
-
32
def run(cmd, hide_output=False):
    """Execute *cmd* as a subprocess; raises CalledProcessError on non-zero exit."""
    kwargs = {"check": True}
    if hide_output:
        # Silence both streams (used for chatty tools like ffmpeg).
        kwargs["stdout"] = subprocess.DEVNULL
        kwargs["stderr"] = subprocess.DEVNULL
    subprocess.run(cmd, **kwargs)
38
 
39
-
40
def download_video_with_ytdlp(url: str, out_dir: str) -> str:
    """Download the best-quality rendition of *url* into *out_dir* via yt-dlp.

    Returns the path of the most recently modified file in *out_dir*
    (assumed to be the fresh download).
    """
    template = str(Path(out_dir) / "%(title)s.%(ext)s")
    run(["yt-dlp", "-f", "best", "-o", template, url])
    # yt-dlp decides the final filename, so pick the newest file instead.
    candidates = sorted(Path(out_dir).glob("*"), key=lambda p: p.stat().st_mtime)
    if not candidates:
        raise FileNotFoundError("Download erfolglos — keine Datei gefunden.")
    return str(candidates[-1])
50
 
51
-
52
def extract_audio_ffmpeg(video_path: str, out_wav: str):
    """Extract the audio track as mono 16 kHz WAV (Whisper's expected input)."""
    ffmpeg_cmd = [
        "ffmpeg", "-y",      # -y: overwrite output without prompting
        "-i", video_path,
        "-vn",               # drop the video stream
        "-ac", "1",          # mono
        "-ar", "16000",      # 16 kHz sample rate
        "-f", "wav",
        out_wav,
    ]
    run(ffmpeg_cmd, hide_output=True)
    return out_wav
66
 
67
-
68
def seconds_to_timestamp(s: float, always_ms: bool = True) -> str:
    """Convert seconds (float) to SRT time format ``HH:MM:SS,mmm``.

    Fix: the previous per-field rounding could produce ms == 1000
    (e.g. s=1.9996 -> "00:00:01,1000"). Rounding the total milliseconds
    once and decomposing keeps every field in range.

    ``always_ms`` is unused but kept for interface compatibility.
    """
    total_ms = int(round(s * 1000))
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    seconds, ms = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
75
 
76
-
77
def format_timestamp_vtt(s: float) -> str:
    """Convert seconds (float) to WebVTT time format ``HH:MM:SS.mmm``.

    Fix: round the total milliseconds once so the ms field can never
    overflow to 1000 (the old per-field rounding produced e.g.
    "00:00:59.1000" for s just below a second boundary).
    """
    total_ms = int(round(s * 1000))
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    seconds, ms = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{ms:03d}"
83
 
84
-
85
def segments_to_srt(segments):
    """Render whisper segments as an SRT document (1-based cue numbering)."""
    entries = [
        f"{idx}\n"
        f"{seconds_to_timestamp(seg['start'])} --> {seconds_to_timestamp(seg['end'])}\n"
        f"{seg['text'].strip()}\n"
        for idx, seg in enumerate(segments, start=1)
    ]
    # Each entry already ends with '\n'; joining with '\n' yields the
    # blank separator line SRT requires between cues.
    return "\n".join(entries)
94
-
95
 
96
def segments_to_vtt(segments):
    """Render whisper segments as a WEBVTT document."""
    def cue(seg):
        begin = format_timestamp_vtt(seg['start'])
        finish = format_timestamp_vtt(seg['end'])
        return f"{begin} --> {finish}\n{seg['text'].strip()}\n"
    # Mandatory "WEBVTT" header first, then one cue per segment.
    return "\n".join(["WEBVTT\n", *map(cue, segments)])
105
-
106
 
107
def segments_to_txt(segments):
    """Plain-text transcript: one ``[HH:MM:SS,mmm] text`` line per segment."""
    return "\n".join(
        f"[{seconds_to_timestamp(seg['start'])}] {seg['text'].strip()}"
        for seg in segments
    )
115
-
116
 
117
def segments_to_json(segments, language=None, metadata=None):
    """Serialize segments (plus optional metadata) as pretty-printed JSON.

    ``ensure_ascii=False`` keeps umlauts etc. readable in the output file.
    """
    doc = {"language": language, "segments": segments}
    if metadata:
        doc["metadata"] = metadata
    return json.dumps(doc, ensure_ascii=False, indent=2)
125
-
126
- # Haupt-Workflow ----------------------------------------------------------
127
 
128
  def transcribe_pipeline(file_obj, url, model_size, keep_video=False):
129
- """
130
- file_obj: uploaded file (temp path) or None
131
- url: optional URL to download via yt-dlp
132
- model_size: whisper model size
133
- """
134
  if whisper is None:
135
- return "Fehler: lokales whisper nicht verfügbar. Stelle sicher, dass das Repo installiert ist.", None, None, None, None, None
136
 
137
  tmpdir = tempfile.mkdtemp(prefix="whisper_space_")
138
  try:
139
- # 1) Get video path either from uploaded file or by downloading URL
140
  if url:
141
  video_path = download_video_with_ytdlp(url, tmpdir)
142
  elif file_obj:
143
- # file_obj is a tuple (name, file-like) or a path depending on Gradio version.
144
- # Gradio typically supplies a filesystem path.
145
  if isinstance(file_obj, str) and os.path.exists(file_obj):
146
  video_path = file_obj
147
  else:
148
- # try to write content to temp file
149
  uploaded_path = Path(tmpdir) / Path(getattr(file_obj, "name", "upload")).name
150
  with open(uploaded_path, "wb") as f:
151
- # file_obj may be a SpooledTemporaryFile or similar with .read()
152
  f.write(file_obj.read())
153
  video_path = str(uploaded_path)
154
  else:
155
- return "Kein Video angegeben (weder Datei noch URL).", None, None, None, None, None
156
 
157
- # 2) Extract audio
158
  audio_wav = str(Path(tmpdir) / "audio.wav")
159
  extract_audio_ffmpeg(video_path, audio_wav)
160
 
161
- # 3) Load whisper model and transcribe
162
  model = whisper.load_model(model_size)
163
- # transcribe: get segments to generate SRT/VTT etc.
164
  result = model.transcribe(audio_wav, verbose=False)
165
  segments = result.get("segments", [])
166
- language = result.get("language", None)
167
 
168
- # 4) Create output strings
169
  srt_text = segments_to_srt(segments)
170
- vtt_text = segments
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  # -*- coding: utf-8 -*-
3
  """
4
  Hugging Face Space (Gradio) App: Video -> Audio -> Whisper Transkript (+ Downloads SRT/TXT/VTT/JSON)
 
 
 
 
 
 
 
 
5
  """
6
import os
import subprocess
import tempfile
import json
from pathlib import Path

import gradio as gr

try:
    import whisper
except Exception:
    whisper = None
19
 
 
 
 
20
def run(cmd, hide_output=False):
    """Run *cmd* as a subprocess, raising CalledProcessError on failure."""
    # stdout/stderr of None means "inherit", matching the default behavior.
    sink = subprocess.DEVNULL if hide_output else None
    subprocess.run(cmd, check=True, stdout=sink, stderr=sink)
25
 
 
26
def download_video_with_ytdlp(url: str, out_dir: str) -> str:
    """Download *url* with yt-dlp into *out_dir* and return the file path.

    yt-dlp names the output itself, so the newest file in *out_dir* is
    taken as the download result.
    """
    template = str(Path(out_dir) / "%(title)s.%(ext)s")
    run(["yt-dlp", "-f", "best", "-o", template, url])
    newest_first = sorted(Path(out_dir).glob("*"),
                          key=lambda p: p.stat().st_mtime, reverse=True)
    if not newest_first:
        raise FileNotFoundError("Download fehlgeschlagen — keine Datei gefunden.")
    return str(newest_first[0])
34
 
 
35
def extract_audio_ffmpeg(video_path: str, out_wav: str):
    """Extract mono 16 kHz WAV audio from *video_path* (Whisper's input format)."""
    args = ["ffmpeg", "-y", "-i", video_path]
    args += ["-vn"]                 # no video stream
    args += ["-ac", "1"]            # downmix to mono
    args += ["-ar", "16000"]        # resample to 16 kHz
    args += ["-f", "wav", out_wav]
    run(args, hide_output=True)     # ffmpeg is noisy; suppress its output
    return out_wav
42
 
43
def seconds_to_timestamp(s: float) -> str:
    """Convert seconds (float) to SRT time format ``HH:MM:SS,mmm``.

    Fix: rounding the fractional part separately could yield ms == 1000
    (s=1.9996 -> "00:00:01,1000", a malformed timestamp). Rounding the
    total milliseconds once and decomposing keeps all fields in range.
    """
    total_ms = int(round(s * 1000))
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    seconds, ms = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
49
 
 
50
def format_timestamp_vtt(s: float) -> str:
    """Convert seconds (float) to WebVTT time format ``HH:MM:SS.mmm``.

    Fix: the millisecond field is derived from a single rounded total so
    it can never overflow to 1000 like the old per-field rounding did.
    """
    total_ms = int(round(s * 1000))
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    seconds, ms = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{ms:03d}"
56
 
 
57
def segments_to_srt(segments):
    """Render whisper segments as an SRT subtitle document."""
    cues = [
        f"{number}\n"
        f"{seconds_to_timestamp(seg['start'])} --> {seconds_to_timestamp(seg['end'])}\n"
        f"{seg['text'].strip()}\n"
        for number, seg in enumerate(segments, start=1)
    ]
    # Every cue ends in '\n'; joining with '\n' inserts the blank line
    # that separates SRT entries.
    return "\n".join(cues)
 
65
 
66
def segments_to_vtt(segments):
    """Render whisper segments as a WEBVTT subtitle document."""
    cues = [
        f"{format_timestamp_vtt(seg['start'])} --> {format_timestamp_vtt(seg['end'])}\n"
        f"{seg['text'].strip()}\n"
        for seg in segments
    ]
    # The "WEBVTT" header line is mandatory and comes first.
    return "\n".join(["WEBVTT\n"] + cues)
 
74
 
75
def segments_to_txt(segments):
    """Plain-text transcript: one ``[HH:MM:SS,mmm] text`` line per segment."""
    lines = []
    for seg in segments:
        stamp = seconds_to_timestamp(seg['start'])
        lines.append(f"[{stamp}] {seg['text'].strip()}")
    return "\n".join(lines)
 
 
 
 
 
 
 
77
 
78
def segments_to_json(segments, language=None, metadata=None):
    """Serialize segments plus optional metadata to indented UTF-8 JSON."""
    payload = {"language": language, "segments": segments}
    # Only attach metadata when a non-empty mapping was supplied.
    if metadata:
        payload["metadata"] = metadata
    return json.dumps(payload, ensure_ascii=False, indent=2)
 
 
83
 
84
def transcribe_pipeline(file_obj, url, model_size, keep_video=False):
    """Resolve a video, extract audio, transcribe with Whisper, write outputs.

    Parameters:
        file_obj: uploaded file — a filesystem path (str) or a file-like
                  object with ``.read()``, depending on the Gradio version.
        url: optional video URL; takes precedence over ``file_obj``.
        model_size: Whisper model name ("tiny" ... "large").
        keep_video: when False, a *downloaded* video is deleted afterwards.

    Returns a 6-tuple ``(display_text, srt_path, vtt_path, txt_path,
    json_path, status)``; on failure the first element is an error message
    and the path slots are None.
    """
    if whisper is None:
        return "Fehler: whisper ist nicht installiert.", None, None, None, None, None

    # NOTE: the temp dir is deliberately never removed — Gradio serves the
    # generated subtitle files from it after this function returns.
    tmpdir = tempfile.mkdtemp(prefix="whisper_space_")
    try:
        # 1) Resolve the input video (URL download wins over upload).
        if url:
            video_path = download_video_with_ytdlp(url, tmpdir)
        elif file_obj:
            if isinstance(file_obj, str) and os.path.exists(file_obj):
                # Gradio usually hands over a filesystem path directly.
                video_path = file_obj
            else:
                # Otherwise persist the file-like object into the temp dir.
                uploaded_path = Path(tmpdir) / Path(getattr(file_obj, "name", "upload")).name
                with open(uploaded_path, "wb") as f:
                    f.write(file_obj.read())
                video_path = str(uploaded_path)
        else:
            return "Kein Video angegeben.", None, None, None, None, None

        # 2) Extract 16 kHz mono audio for Whisper.
        audio_wav = str(Path(tmpdir) / "audio.wav")
        extract_audio_ffmpeg(video_path, audio_wav)

        # 3) Transcribe.
        model = whisper.load_model(model_size)
        result = model.transcribe(audio_wav, verbose=False)
        segments = result.get("segments", [])
        language = result.get("language", "unknown")

        # 4) Render all output formats and write them next to the audio.
        srt_text = segments_to_srt(segments)
        vtt_text = segments_to_vtt(segments)
        txt_text = segments_to_txt(segments)
        json_text = segments_to_json(segments, language, {"model": model_size})

        out_files = {}
        base = Path(video_path).stem
        for ext, content in {"srt": srt_text, "vtt": vtt_text,
                             "txt": txt_text, "json": json_text}.items():
            p = Path(tmpdir) / f"{base}.{ext}"
            p.write_text(content, encoding="utf-8")
            out_files[ext] = str(p)

        # 5) Best-effort cleanup of downloaded videos unless the user keeps them.
        if not keep_video and url:
            try:
                os.remove(video_path)
            except OSError:
                pass

        return (txt_text, out_files["srt"], out_files["vtt"], out_files["txt"],
                out_files["json"], f"Model: {model_size}, Sprache: {language}")
    except Exception as e:
        # UI boundary: surface any failure as a status string instead of
        # crashing the Gradio app.
        return f"Fehler: {e}", None, None, None, None, None
136
+
137
# Gradio UI: inputs (URL or upload, model choice) on the left,
# transcript and download slots on the right.
with gr.Blocks() as demo:
    gr.Markdown("# Video → Whisper Transkript (SRT/TXT/VTT/JSON)")
    with gr.Row():
        with gr.Column():
            url_in = gr.Textbox(label="Video URL", placeholder="https://...")
            file_in = gr.File(label="Oder Videodatei hochladen")
            model_sel = gr.Radio(["tiny", "base", "small", "medium", "large"],
                                 value="small", label="Whisper-Modell")
            keep_chk = gr.Checkbox(label="Video behalten", value=False)
            btn = gr.Button("Transkribieren")
            status = gr.Textbox(label="Status")
        with gr.Column():
            transcript = gr.Textbox(label="Transkript mit Zeitmarken", lines=20)
            srt_dl = gr.File(label="SRT", visible=False)
            vtt_dl = gr.File(label="VTT", visible=False)
            txt_dl = gr.File(label="TXT", visible=False)
            json_dl = gr.File(label="JSON", visible=False)

    def run_transcribe(f, u, m, k):
        # Run the pipeline, then reveal only the download slots that
        # actually received a file path.
        display, srtf, vttf, txtf, jsonf, meta = transcribe_pipeline(f, u, m, k)
        file_updates = [gr.update(value=path, visible=bool(path))
                        for path in (srtf, vttf, txtf, jsonf)]
        return (display, *file_updates, meta)

    btn.click(run_transcribe,
              [file_in, url_in, model_sel, keep_chk],
              [transcript, srt_dl, vtt_dl, txt_dl, json_dl, status])

if __name__ == "__main__":
    # Bind to all interfaces; HF Spaces injects PORT (default 7860).
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))