neuralworm committed on
Commit
abecc06
·
verified ·
1 Parent(s): f862cfc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -36
app.py CHANGED
@@ -27,13 +27,19 @@ from pathlib import Path
27
  from datetime import timedelta
28
  import socket
29
  import urllib.request
 
30
 
31
  import gradio as gr
32
 
33
  try:
34
  import whisper
35
- except Exception:
36
  whisper = None
 
 
 
 
 
37
 
38
  # ---------------------------------------------------------------------------
39
  # Helper: Shell
@@ -43,25 +49,67 @@ def run_capture(cmd):
43
  """Run a command and return stdout; raise RuntimeError with readable stderr on failure."""
44
  result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
45
  if result.returncode != 0:
46
- stderr_text = result.stderr or "" # Keep only tail to avoid massive logs
47
  tail = stderr_text[-2000:]
48
- # KORRIGIERT: Fehlermeldung korrekt zusammengebaut und abgeschlossen.
49
  raise RuntimeError("Command failed: " + " ".join(cmd) + " " + tail)
50
  return result.stdout
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  # ---------------------------------------------------------------------------
53
- # Download & Audio
54
  # ---------------------------------------------------------------------------
55
 
56
  def download_video_with_ytdlp(url, out_dir, cookies_path=None, format_selector=None):
57
- """Download a video with yt-dlp into out_dir and return the video path."""
58
  out_template = str(Path(out_dir) / "%(title)s.%(ext)s")
59
  cmd = ["yt-dlp", "-o", out_template]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  if format_selector:
61
  cmd += ["-f", format_selector]
62
  if cookies_path:
63
  cmd += ["--cookies", cookies_path]
64
  cmd.append(url)
 
 
65
 
66
  try:
67
  run_capture(cmd)
@@ -69,9 +117,8 @@ def download_video_with_ytdlp(url, out_dir, cookies_path=None, format_selector=N
69
  msg = str(e)
70
  if "Failed to resolve" in msg or "Name or service not known" in msg:
71
  raise RuntimeError(
72
- "DNS/Internet-Problem: Der Space kann den Host nicht auflösen. "
73
- "URL-Download ist hier nicht möglich. Bitte Videodatei direkt hochladen "
74
- "oder den Space in einer Umgebung mit Internet/DNS-Freigabe ausführen."
75
  )
76
  raise
77
 
@@ -80,11 +127,15 @@ def download_video_with_ytdlp(url, out_dir, cookies_path=None, format_selector=N
80
  raise FileNotFoundError("Download fehlgeschlagen — keine Datei gefunden.")
81
  return str(files[0])
82
 
 
83
  def extract_audio_ffmpeg(video_path, out_wav):
84
  cmd = ["ffmpeg", "-y", "-i", video_path, "-vn", "-ac", "1", "-ar", "16000", "-f", "wav", out_wav]
85
  subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
86
  return out_wav
87
 
 
 
 
88
  # ---------------------------------------------------------------------------
89
  # Zeit- und Format-Helfer
90
  # ---------------------------------------------------------------------------
@@ -104,33 +155,24 @@ def format_timestamp_vtt(s):
104
  return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{ms:03d}"
105
 
106
  def segments_to_srt(segments):
107
- """Formats transcript segments into a valid SRT string."""
108
  parts = []
109
  for i, seg in enumerate(segments, start=1):
110
  start = seconds_to_timestamp(seg['start'])
111
  end = seconds_to_timestamp(seg['end'])
112
  text = seg['text'].strip()
113
- # KORRIGIERT: Korrektes SRT-Blockformat mit Zeilenumbrüchen.
114
  parts.append(f"{i}\n{start} --> {end}\n{text}")
115
- # KORRIGIERT: Blöcke mit zwei Zeilenumbrüchen verbinden, um gültiges SRT zu erzeugen.
116
  return "\n\n".join(parts) + "\n\n"
117
 
118
  def segments_to_vtt(segments):
119
- """Formats transcript segments into a valid VTT string."""
120
- # KORRIGIERT: Korrekter Header mit nachfolgendem Zeilenumbruch.
121
  parts = ["WEBVTT\n"]
122
  for seg in segments:
123
  start = format_timestamp_vtt(seg['start'])
124
  end = format_timestamp_vtt(seg['end'])
125
  text = seg['text'].strip()
126
- # KORRIGIERT: Korrektes VTT-Blockformat mit Zeilenumbruch.
127
  parts.append(f"{start} --> {end}\n{text}")
128
- # KORRIGIERT: Blöcke mit zwei Zeilenumbrüchen verbinden.
129
  return "\n\n".join(parts)
130
 
131
  def segments_to_txt(segments):
132
- """Formats segments to a readable plain text file."""
133
- # VERBESSERT: Segmente mit Zeilenumbruch statt Leerzeichen für bessere Lesbarkeit trennen.
134
  return "\n".join([f"[{seconds_to_timestamp(seg['start'])}] {seg['text'].strip()}" for seg in segments])
135
 
136
  def segments_to_json(segments, language=None, metadata=None):
@@ -149,26 +191,21 @@ def transcribe_pipeline(file_obj, url, model_size, keep_video=False, cookies_fil
149
 
150
  tmpdir = tempfile.mkdtemp(prefix="whisper_space_")
151
  try:
152
- # Quelle bestimmen
153
  if url:
154
  video_path = download_video_with_ytdlp(url, tmpdir, cookies_path=cookies_file, format_selector=format_selector)
155
  elif file_obj:
156
- # Gradio übergibt ein temporäres Dateiobjekt, dessen .name Attribut der Pfad ist.
157
  video_path = file_obj.name
158
  else:
159
  return "Kein Video angegeben.", None, None, None, None, None
160
 
161
- # Audio extrahieren
162
  audio_wav = str(Path(tmpdir) / "audio.wav")
163
  extract_audio_ffmpeg(video_path, audio_wav)
164
 
165
- # Whisper laden & transkribieren
166
  model = whisper.load_model(model_size)
167
  result = model.transcribe(audio_wav, verbose=False)
168
  segments = result.get("segments", [])
169
  language = result.get("language", "unknown")
170
 
171
- # Ausgaben erzeugen
172
  txt_text = segments_to_txt(segments)
173
  srt_text = segments_to_srt(segments)
174
  vtt_text = segments_to_vtt(segments)
@@ -197,11 +234,9 @@ def transcribe_pipeline(file_obj, url, model_size, keep_video=False, cookies_fil
197
  # ---------------------------------------------------------------------------
198
 
199
  def dns_internet_diag():
200
- """Führt einige Basis-Checks aus und gibt einen Textreport zurück."""
201
  lines = []
202
 
203
- # DNS-Checks
204
- lines.append("=== DNS-Auflösung ===")
205
  for host in ["huggingface.co", "www.google.com", "www.instagram.com", "youtube.com"]:
206
  try:
207
  ip = socket.gethostbyname(host)
@@ -209,9 +244,17 @@ def dns_internet_diag():
209
  except Exception as e:
210
  lines.append(f"{host} -> ERROR: {e}")
211
 
212
- # HTTP-Checks
 
 
 
 
 
 
 
 
213
  lines.append("\n\n=== HTTP-Requests (GET) ===")
214
- for url in ["https://huggingface.co", "https://www.google.com", "https://www.instagram.com"]:
215
  try:
216
  with urllib.request.urlopen(url, timeout=5) as resp:
217
  code = getattr(resp, "status", None) or resp.getcode()
@@ -219,7 +262,6 @@ def dns_internet_diag():
219
  except Exception as e:
220
  lines.append(f"{url} -> ERROR: {e}")
221
 
222
- # yt-dlp
223
  lines.append("\n\n=== yt-dlp ===")
224
  try:
225
  out = run_capture(["yt-dlp", "--version"])
@@ -227,7 +269,6 @@ def dns_internet_diag():
227
  except Exception as e:
228
  lines.append(f"yt-dlp Fehler: {e}")
229
 
230
- # ffmpeg
231
  lines.append("\n\n=== ffmpeg ===")
232
  try:
233
  out = run_capture(["ffmpeg", "-version"])
@@ -264,10 +305,7 @@ with gr.Blocks() as demo:
264
  json_dl = gr.File(label="JSON")
265
 
266
  def run_transcribe(f, u, m, k, c, fmt):
267
- # KORRIGIERT: Korrekte Handhabung des Gradio-Dateiobjekts für Cookies.
268
- # Wir holen den Pfad über das .name Attribut.
269
  cookies_path = c.name if c else None
270
-
271
  display, srtf, vttf, txtf, jsonf, meta = transcribe_pipeline(
272
  f, u, m, k, cookies_file=cookies_path, format_selector=(fmt or None)
273
  )
@@ -288,11 +326,10 @@ with gr.Blocks() as demo:
288
 
289
  with gr.Tab("Netzwerk / DNS Diagnose"):
290
  gr.Markdown(
291
- """Führt einfache Tests für DNS, HTTP sowie yt-dlp/ffmpeg aus.
292
-
293
- Wenn z. B. www.instagram.com nicht auflösbar ist, liegt ein DNS-/Firewall-Problem vor.
294
 
295
- Wenn Hugging Face / Google funktionieren, aber Instagram nicht, blockt vermutlich die Umgebung nur bestimmte Domains."""
 
296
  )
297
  diag_btn = gr.Button("Diagnose starten")
298
  diag_out = gr.Textbox(label="Diagnose-Ausgabe", lines=25)
 
27
  from datetime import timedelta
28
  import socket
29
  import urllib.request
30
+ from urllib.parse import urlparse
31
 
32
  import gradio as gr
33
 
34
  try:
35
  import whisper
36
+ except ImportError:
37
  whisper = None
38
+
39
+ try:
40
+ from dns import resolver as dns_resolver
41
+ except ImportError:
42
+ dns_resolver = None
43
 
44
  # ---------------------------------------------------------------------------
45
  # Helper: Shell
 
49
  """Run a command and return stdout; raise RuntimeError with readable stderr on failure."""
50
  result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
51
  if result.returncode != 0:
52
+ stderr_text = result.stderr or ""
53
  tail = stderr_text[-2000:]
 
54
  raise RuntimeError("Command failed: " + " ".join(cmd) + " " + tail)
55
  return result.stdout
56
+
57
+ # ---------------------------------------------------------------------------
58
+ # NEUE FUNKTION: DNS-Auflösung via dnspython
59
+ # ---------------------------------------------------------------------------
60
def resolve_hostname_with_dns_python(hostname):
    """Resolve *hostname* to an IPv4 address, preferring public DNS servers.

    When dnspython is available, the lookup is sent to 8.8.8.8 and 1.1.1.1
    so that a blocked or broken local resolver can be bypassed. If dnspython
    is missing, or the public lookup raises, the system resolver
    (``socket.gethostbyname``) is used instead.

    Args:
        hostname: Host name to resolve.

    Returns:
        The first A-record address as text, or ``None`` if the public lookup
        returned an empty answer set.

    Raises:
        OSError: whatever ``socket.gethostbyname`` raises (typically
            ``socket.gaierror``) when the system-resolver fallback fails too.
    """
    if not dns_resolver:
        # dnspython is an optional dependency; without it only the system
        # resolver is available.
        print("Warning: dnspython not found. Falling back to system DNS.")
        return socket.gethostbyname(hostname)

    try:
        # configure=False skips reading the system resolver configuration.
        # The name servers are overridden below anyway, and a missing or
        # unreadable local configuration must not defeat the bypass.
        resolver = dns_resolver.Resolver(configure=False)
        resolver.nameservers = ['8.8.8.8', '1.1.1.1']  # Google & Cloudflare DNS
        answers = resolver.resolve(hostname, 'A')
        if answers:
            return answers[0].to_text()
    except Exception as e:
        print(f"DNS resolution with dnspython failed for {hostname}: {e}")
        # Last resort: the system resolver. Its error, if any, propagates
        # unchanged to the caller.
        return socket.gethostbyname(hostname)
    return None
81
 
82
  # ---------------------------------------------------------------------------
83
+ # MODIFIZIERTE FUNKTION: Download & Audio
84
  # ---------------------------------------------------------------------------
85
 
86
  def download_video_with_ytdlp(url, out_dir, cookies_path=None, format_selector=None):
87
+ """Download a video with yt-dlp, using custom DNS resolution if necessary."""
88
  out_template = str(Path(out_dir) / "%(title)s.%(ext)s")
89
  cmd = ["yt-dlp", "-o", out_template]
90
+
91
+ # DNS-Umgehung implementieren
92
+ try:
93
+ parsed_url = urlparse(url)
94
+ hostname = parsed_url.hostname
95
+ if hostname:
96
+ print(f"Resolving hostname: {hostname}")
97
+ ip_address = resolve_hostname_with_dns_python(hostname)
98
+ if ip_address:
99
+ print(f"Resolved {hostname} to {ip_address}. Using --resolve.")
100
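+ # NOTE(review): `--resolve host:port:addr` is a curl option. yt-dlp's documented network options do not include it, so passing it likely makes yt-dlp abort with "no such option" — verify against `yt-dlp --help` before relying on this.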
101
+ resolve_arg = f"{hostname}:443:{ip_address}"
102
+ cmd.extend(["--resolve", resolve_arg])
103
+ except Exception as e:
104
+ print(f"Could not perform custom DNS resolution, proceeding without it. Error: {e}")
105
+
106
  if format_selector:
107
  cmd += ["-f", format_selector]
108
  if cookies_path:
109
  cmd += ["--cookies", cookies_path]
110
  cmd.append(url)
111
+
112
+ print(f"Running command: {' '.join(cmd)}")
113
 
114
  try:
115
  run_capture(cmd)
 
117
  msg = str(e)
118
  if "Failed to resolve" in msg or "Name or service not known" in msg:
119
  raise RuntimeError(
120
+ "DNS/Internet-Problem: Der Host konnte nicht aufgelöst werden. "
121
+ "Selbst die DNS-Umgehung ist fehlgeschlagen. Möglicherweise blockiert eine Firewall auch die IP-Adressen."
 
122
  )
123
  raise
124
 
 
127
  raise FileNotFoundError("Download fehlgeschlagen — keine Datei gefunden.")
128
  return str(files[0])
129
 
130
+
131
  def extract_audio_ffmpeg(video_path, out_wav):
132
  cmd = ["ffmpeg", "-y", "-i", video_path, "-vn", "-ac", "1", "-ar", "16000", "-f", "wav", out_wav]
133
  subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
134
  return out_wav
135
 
136
+ # ... (der Rest des Codes von "Zeit- und Format-Helfer" bis zum Ende bleibt identisch)
137
+ # Ich füge ihn hier zur Vollständigkeit ein.
138
+
139
  # ---------------------------------------------------------------------------
140
  # Zeit- und Format-Helfer
141
  # ---------------------------------------------------------------------------
 
155
  return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{ms:03d}"
156
 
157
  def segments_to_srt(segments):
 
158
  parts = []
159
  for i, seg in enumerate(segments, start=1):
160
  start = seconds_to_timestamp(seg['start'])
161
  end = seconds_to_timestamp(seg['end'])
162
  text = seg['text'].strip()
 
163
  parts.append(f"{i}\n{start} --> {end}\n{text}")
 
164
  return "\n\n".join(parts) + "\n\n"
165
 
166
  def segments_to_vtt(segments):
 
 
167
  parts = ["WEBVTT\n"]
168
  for seg in segments:
169
  start = format_timestamp_vtt(seg['start'])
170
  end = format_timestamp_vtt(seg['end'])
171
  text = seg['text'].strip()
 
172
  parts.append(f"{start} --> {end}\n{text}")
 
173
  return "\n\n".join(parts)
174
 
175
  def segments_to_txt(segments):
 
 
176
  return "\n".join([f"[{seconds_to_timestamp(seg['start'])}] {seg['text'].strip()}" for seg in segments])
177
 
178
  def segments_to_json(segments, language=None, metadata=None):
 
191
 
192
  tmpdir = tempfile.mkdtemp(prefix="whisper_space_")
193
  try:
 
194
  if url:
195
  video_path = download_video_with_ytdlp(url, tmpdir, cookies_path=cookies_file, format_selector=format_selector)
196
  elif file_obj:
 
197
  video_path = file_obj.name
198
  else:
199
  return "Kein Video angegeben.", None, None, None, None, None
200
 
 
201
  audio_wav = str(Path(tmpdir) / "audio.wav")
202
  extract_audio_ffmpeg(video_path, audio_wav)
203
 
 
204
  model = whisper.load_model(model_size)
205
  result = model.transcribe(audio_wav, verbose=False)
206
  segments = result.get("segments", [])
207
  language = result.get("language", "unknown")
208
 
 
209
  txt_text = segments_to_txt(segments)
210
  srt_text = segments_to_srt(segments)
211
  vtt_text = segments_to_vtt(segments)
 
234
  # ---------------------------------------------------------------------------
235
 
236
  def dns_internet_diag():
 
237
  lines = []
238
 
239
+ lines.append("=== DNS-Auflösung (System) ===")
 
240
  for host in ["huggingface.co", "www.google.com", "www.instagram.com", "youtube.com"]:
241
  try:
242
  ip = socket.gethostbyname(host)
 
244
  except Exception as e:
245
  lines.append(f"{host} -> ERROR: {e}")
246
 
247
+ if dns_resolver:
248
+ lines.append("\n\n=== DNS-Auflösung (via dnspython mit 8.8.8.8) ===")
249
+ for host in ["huggingface.co", "www.google.com", "www.instagram.com", "youtube.com"]:
250
+ try:
251
+ ip = resolve_hostname_with_dns_python(host)
252
+ lines.append(f"{host} -> {ip} (OK)")
253
+ except Exception as e:
254
+ lines.append(f"{host} -> ERROR: {e}")
255
+
256
  lines.append("\n\n=== HTTP-Requests (GET) ===")
257
+ for url in ["https://huggingface.co", "https://www.google.com"]:
258
  try:
259
  with urllib.request.urlopen(url, timeout=5) as resp:
260
  code = getattr(resp, "status", None) or resp.getcode()
 
262
  except Exception as e:
263
  lines.append(f"{url} -> ERROR: {e}")
264
 
 
265
  lines.append("\n\n=== yt-dlp ===")
266
  try:
267
  out = run_capture(["yt-dlp", "--version"])
 
269
  except Exception as e:
270
  lines.append(f"yt-dlp Fehler: {e}")
271
 
 
272
  lines.append("\n\n=== ffmpeg ===")
273
  try:
274
  out = run_capture(["ffmpeg", "-version"])
 
305
  json_dl = gr.File(label="JSON")
306
 
307
  def run_transcribe(f, u, m, k, c, fmt):
 
 
308
  cookies_path = c.name if c else None
 
309
  display, srtf, vttf, txtf, jsonf, meta = transcribe_pipeline(
310
  f, u, m, k, cookies_file=cookies_path, format_selector=(fmt or None)
311
  )
 
326
 
327
  with gr.Tab("Netzwerk / DNS Diagnose"):
328
  gr.Markdown(
329
+ """Führt Tests für den System-DNS und einen externen DNS (via dnspython) durch.
 
 
330
 
331
+ Wenn der System-DNS fehlschlägt, der externe aber funktioniert, ist die DNS-Umgehung aktiv.
332
+ Wenn beides fehlschlägt, blockiert eine Firewall wahrscheinlich auch die IP-Adressen."""
333
  )
334
  diag_btn = gr.Button("Diagnose starten")
335
  diag_out = gr.Textbox(label="Diagnose-Ausgabe", lines=25)