Commit c3c9064
1 Parent(s): 1be32a3

access browser cookies instead of prompting the user

Files changed:
- app.py (+43 -34)
- requirements.txt (+2 -1)
app.py
CHANGED
@@ -10,32 +10,42 @@ from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import BlipProcessor, BlipForConditionalGeneration
 import cv2
-
-
-
-
-
+import browser_cookie3
+
+def get_youtube_cookies(browser):
+    if browser == 'Chrome':
+        return browser_cookie3.chrome()
+    elif browser == 'Firefox':
+        return browser_cookie3.firefox()
+    elif browser == 'Edge':
+        return browser_cookie3.edge()
+    elif browser == 'Brave':
+        return browser_cookie3.brave()
+    else:
+        raise ValueError("Unsupported browser")
+
+def download_youtube_video(video_url, browser):
+    cookies = get_youtube_cookies(browser)
     ydl_opts = {
+        'cookiefile': cookies,
         'format': 'bestvideo+bestaudio',
-        'outtmpl': os.path.join(
-        'username': 'oauth2',
-        'password': ''
+        'outtmpl': os.path.join('./', '%(title)s.%(ext)s'),
     }
     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
         ydl.download([video_url])
         video_info = ydl.extract_info(video_url, download=False)
         video_title = video_info.get('title', 'video')
-    return os.path.join(
+    return os.path.join('./', f"{video_title}.webm")

-def convert_to_mp4(input_path
-    output_file = os.path.join(
+def convert_to_mp4(input_path):
+    output_file = os.path.join('./', 'video.mp4')
     command = ['ffmpeg', '-i', input_path, '-c', 'copy', output_file]
     subprocess.run(command, check=True)
     return output_file

 def extract_audio_from_video(video_path):
     video_clip = VideoFileClip(video_path)
-    audio_output = os.path.join(
+    audio_output = os.path.join('./', 'audio.mp3')
     audio_clip = video_clip.audio
     audio_clip.write_audiofile(audio_output)
     return audio_output

@@ -43,7 +53,7 @@ def extract_audio_from_video(video_path):
 def convert_mp3_to_wav(mp3_path):
     from pydub import AudioSegment
     audio = AudioSegment.from_mp3(mp3_path)
-    wav_output = os.path.join(
+    wav_output = os.path.join('./', 'audio.wav')
     audio.export(wav_output, format="wav")
     return wav_output

@@ -94,35 +104,33 @@ emotion_model_name = "j-hartmann/emotion-english-distilroberta-base"
 emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
 emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)

-def analyze_video(video_url):
+def analyze_video(video_url, browser):
     global output_path
     output_path = './'

-
-
-
-
-    mp4_path = convert_to_mp4(video_path, output_path)
-
+    video_path = download_youtube_video(video_url, browser)
+
+    mp4_path = convert_to_mp4(video_path)
+
     audio_path = extract_audio_from_video(mp4_path)
-
+
     audio_wav_path = convert_mp3_to_wav(audio_path)

     model_whisper = whisper.load_model("base")
-
+
     result_whisper = model_whisper.transcribe(audio_wav_path)
-
+
     transcript = result_whisper['text']
-
+
     emotion_dict_text, predicted_emotion_text = process_text(transcript)
-
+
     n_frame_interval = 60
     emotion_vectors_video = []
-
+
     video_capture = cv2.VideoCapture(mp4_path)
-
+
     total_frames_video = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
-
+
     frame_count_video = 0

     while video_capture.isOpened():

@@ -134,7 +142,7 @@ def analyze_video(video_url):
         if frame_count_video % n_frame_interval == 0:
             pixel_values_video = preprocess_frame(frame_video)
             caption_video = generate_caption(pixel_values_video)
-            predicted_emotions_video = predict_emotions(caption_video)
+            predicted_emotions_video, _ = predict_emotions(caption_video)
             emotion_vectors_video.append(np.array(list(predicted_emotions_video.values())))

         frame_count_video += 1

@@ -152,10 +160,11 @@ def analyze_video(video_url):
     return transcript, predicted_emotion_text, final_most_predicted_emotion

 iface = gr.Interface(fn=analyze_video,
-
-
-
-
+                     inputs=[gr.Textbox(label="YouTube Video URL"),
+                             gr.Dropdown(label="Select Browser", choices=["Chrome", "Firefox", "Edge", "Brave"])],
+                     outputs=["text", "text", "text"],
+                     title="Multimodal Emotion Recognition",
+                     description="Enter a YouTube Video URL and select your browser to analyze emotions from both audio and visual content.")

 if __name__ == "__main__":
-
+    iface.launch()
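A note on the diff above: browser_cookie3.chrome() and its siblings return an in-memory http.cookiejar.CookieJar, while yt-dlp's 'cookiefile' option expects the name of a Netscape-format cookie file rather than a jar object. Below is a minimal sketch of bridging the two by writing the jar to a cookies.txt on disk first; the helper save_cookiejar_to_file and the placeholder URL are illustrative, not part of this commit.

# Illustrative only, not part of this commit: persist a browser_cookie3 jar
# to a Netscape-format file so yt-dlp's 'cookiefile' option can read it.
from http.cookiejar import MozillaCookieJar

import browser_cookie3
import yt_dlp

def save_cookiejar_to_file(jar, path='cookies.txt'):
    mozilla_jar = MozillaCookieJar(path)
    for cookie in jar:  # copy each cookie into a jar that can save itself
        mozilla_jar.set_cookie(cookie)
    mozilla_jar.save(ignore_discard=True, ignore_expires=True)
    return path

cookie_path = save_cookiejar_to_file(browser_cookie3.chrome(domain_name='youtube.com'))
ydl_opts = {
    'cookiefile': cookie_path,        # a file path, which is what yt-dlp reads here
    'format': 'bestvideo+bestaudio',
}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://www.youtube.com/watch?v=...'])  # placeholder URL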
requirements.txt
CHANGED

@@ -8,4 +8,5 @@ moviepy
 openai-whisper
 yt-dlp
 torch
-opencv-python
+opencv-python
+browser-cookie3
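For reference, a hypothetical local run of the updated pipeline, assuming the dependencies above plus ffmpeg are installed and a logged-in YouTube session exists in one of the supported browsers; the video URL is a placeholder.

# Hypothetical usage, not part of this commit. Importing app also builds the
# Gradio interface but does not launch it, since iface.launch() is guarded by __main__.
from app import analyze_video

transcript, text_emotion, video_emotion = analyze_video(
    'https://www.youtube.com/watch?v=...',  # placeholder video URL
    'Chrome',                               # one of: Chrome, Firefox, Edge, Brave
)
print(text_emotion, video_emotion)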