Update legal.py
legal.py
CHANGED
@@ -1,6 +1,5 @@
 from flask import Flask, request, jsonify, send_from_directory
 import speech_recognition as sr
-import threading
 import datetime
 import pyttsx3
 from langdetect import detect
@@ -16,18 +15,15 @@ from pydub import AudioSegment
 import os
 from werkzeug.utils import secure_filename
 import tempfile
+from dotenv import load_dotenv  # Ensure dotenv is imported for .env loading
 
 app = Flask(__name__, static_folder='.')  # Serve static files from the current directory
 
 # Load Hugging Face API key from environment variable
-[line not captured in this view]
+load_dotenv()  # Load environment variables from .env file
+hf_token = os.environ.get("API_KEY")
 if not hf_token:
-[line not captured in this view]
-    from dotenv import load_dotenv
-    load_dotenv()
-    hf_token = os.environ.get("API_KEY")
-    if not hf_token:
-        raise ValueError("Hugging Face API key not found. Please set 'API_KEY' as an environment variable or in a .env file.")
+    raise ValueError("Hugging Face API key not found. Please set 'API_KEY' as an environment variable or in a .env file.")
 
 login(token=hf_token)
 
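Side note on the new startup path: the key is now loaded eagerly via python-dotenv before it is checked. A minimal standalone check of the same lookup (a hypothetical snippet, assuming a .env file with an API_KEY entry sits in the working directory):

    # Hypothetical check script; mirrors the app's startup lookup.
    import os
    from dotenv import load_dotenv

    load_dotenv()  # reads .env; by default it does not override variables already set in the environment
    print("API_KEY present:", bool(os.environ.get("API_KEY")))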
@@ -44,12 +40,19 @@ summarizer_pipeline = pipeline("summarization", model=summarizer_model, tokenize
 embed_model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2")
 
 # Load both datasets
-[previous dataset-loading code removed here; its content is not captured in this view]
+try:
+    df_parquet = pd.read_parquet("ibtehaj dataset.parquet")
+    corpus_parquet = df_parquet["text"].dropna().tolist()
+except FileNotFoundError:
+    raise FileNotFoundError("ibtehaj dataset.parquet not found. Make sure it's in the same directory as app.py")
+
+try:
+    with open("pdf_data.json", "r", encoding="utf-8") as f:
+        json_data = json.load(f)
+except FileNotFoundError:
+    raise FileNotFoundError("pdf_data.json not found. Make sure it's in the same directory as app.py")
+except json.JSONDecodeError as e:
+    raise ValueError(f"Error decoding pdf_data.json: {e}")
 
 # Extract text from JSON
 corpus_json = []
@@ -63,6 +66,7 @@ for entry in json_data:
 corpus = corpus_parquet + corpus_json
 
 # Compute embeddings
+# This can take a while. Consider pre-computing and saving the index if corpus is large.
 embeddings = embed_model.encode(corpus, show_progress_bar=True, batch_size=16)
 
 # Build FAISS index
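The added comment above suggests pre-computing the index when the corpus is large. One way to do that is to persist the FAISS index and reload it on later startups; the sketch below is hypothetical (the file name and the flat L2 index type are assumptions, since the diff does not show which index the app builds):

    # Hypothetical caching sketch: build and persist the index once, reuse it afterwards.
    import os
    import faiss
    import numpy as np

    INDEX_PATH = "corpus.faiss"  # placeholder path

    def load_or_build_index(corpus, embed_model):
        if os.path.exists(INDEX_PATH):
            return faiss.read_index(INDEX_PATH)  # skip re-encoding on subsequent runs
        vectors = np.asarray(
            embed_model.encode(corpus, show_progress_bar=True, batch_size=16),
            dtype="float32",
        )
        index = faiss.IndexFlatL2(vectors.shape[1])  # assumed index type
        index.add(vectors)
        faiss.write_index(index, INDEX_PATH)
        return index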
@@ -107,7 +111,7 @@ def init_tts_engine():
             tts_engine.setProperty('voice', v.id)
             break
 
-init_tts_engine()
+init_tts_engine()  # Initialize TTS engine once on startup
 
 # Global variables for managing state (simplify for web context)
 conversation_history = []
@@ -120,8 +124,12 @@ def serve_index():
 
 @app.route('/<path:path>')
 def serve_static_files(path):
+    # This route serves static files like CSS, JS, and images
+    # It must be specific to paths that exist as files, otherwise it might catch API calls
+    # For now, it's fine, but in complex apps, static files are often served by Nginx/Apache.
     return send_from_directory('.', path)
 
+
 @app.route('/answer', methods=['POST'])
 def generate_answer_endpoint():
     global last_question_text, last_answer_text, conversation_history
@@ -143,6 +151,8 @@ def generate_answer_endpoint():
 
 @app.route('/read-aloud', methods=['POST'])
 def read_aloud_endpoint():
+    # This endpoint is generally not needed if client-side SpeechSynthesis API is used.
+    # Keeping it for completeness if server-side TTS is desired.
     data = request.get_json()
     text_to_read = data.get('text', '').strip()
 
@@ -157,20 +167,13 @@ def read_aloud_endpoint():
         tts_engine.save_to_file(text_to_read, temp_audio_path)
         tts_engine.runAndWait()
 
-        # You would typically serve this file or stream it.
-        # For
-        # In a real app, you might use Flask's send_file for audio playback.
-        # For now, let's just return success.
-        # This approach is suitable if the browser requests the audio file directly after this.
-        # For direct playback, you might stream it or serve it immediately.
-        # For web, it's more common to have the frontend's SpeechSynthesis API handle this.
-        # The frontend `readAloud` function already does this.
-        # So, this endpoint might not be strictly necessary unless for server-side TTS.
+        # You would typically serve this file or stream it for client playback.
+        # For this setup, we'll confirm generation. The frontend handles playback.
         return jsonify({"status": "TTS audio generated (server-side)."})
     except Exception as e:
         return jsonify({"status": f"Error during TTS: {str(e)}"}), 500
     finally:
-        if os.path.exists(temp_audio_path):
+        if 'temp_audio_path' in locals() and os.path.exists(temp_audio_path):
             os.remove(temp_audio_path)
 
 
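The removed comments mention Flask's send_file as the usual way to actually deliver the generated audio rather than only confirming generation. A rough sketch of that variant (the route name is hypothetical and not part of this commit; it reuses the module's existing app and tts_engine, and temp-file cleanup is left out of the sketch):

    from flask import request, jsonify, send_file
    import os
    import tempfile

    @app.route('/read-aloud-audio', methods=['POST'])  # hypothetical extra route
    def read_aloud_audio():
        text_to_read = (request.get_json() or {}).get('text', '').strip()
        if not text_to_read:
            return jsonify({"status": "No text provided."}), 400
        temp_audio_path = os.path.join(tempfile.gettempdir(), "tts_output.wav")
        tts_engine.save_to_file(text_to_read, temp_audio_path)  # same pyttsx3 engine as above
        tts_engine.runAndWait()
        return send_file(temp_audio_path, mimetype="audio/wav")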
@@ -186,38 +189,45 @@ def upload_mp3_endpoint():
     if file:
         filename = secure_filename(file.filename)
         # Create a temporary directory to save the uploaded file and its WAV conversion
-        [previous MP3 handling removed here; its content is not captured in this view]
+        # Ensure that the temp directory is managed for cleanup.
+        try:
+            with tempfile.TemporaryDirectory() as tmpdir:
+                mp3_path = os.path.join(tmpdir, filename)
+                file.save(mp3_path)
+
+                wav_path = os.path.join(tmpdir, filename.replace(".mp3", ".wav"))
+                try:
+                    sound = AudioSegment.from_mp3(mp3_path)
+                    sound.export(wav_path, format="wav")
+                except Exception as e:
+                    # Catch pydub/ffmpeg related errors
+                    return jsonify({"message": f"Error converting MP3 to WAV. Ensure FFmpeg is installed and in your system's PATH. Details: {e}"}), 500
+
+                try:
+                    recognizer = sr.Recognizer()
+                    with sr.AudioFile(wav_path) as src:
+                        audio = recognizer.record(src)
+                        text = recognizer.recognize_google(audio)
+                except sr.UnknownValueError:
+                    return jsonify({"message": "Speech not understood. Please try again."}), 400
+                except sr.RequestError as e:
+                    return jsonify({"message": f"Could not request results from speech recognition service; {e}"}), 500
+                except Exception as e:  # Catch any other unexpected SR errors
+                    return jsonify({"message": f"An unexpected error occurred during speech recognition: {e}"}), 500
+
+
+                # For web, you don't typically "save that file in .txt format and asks the user where to store that" server-side.
+                # The transcription is returned to the client. The client can then decide to save it.
+                return jsonify({
+                    "message": "MP3 transcribed successfully.",
+                    "transcription": text
+                })
+        except Exception as e:
+            # Catch any errors related to temporary directory creation or file saving
+            return jsonify({"message": f"An error occurred during file upload or temporary processing: {e}"}), 500
+    # This point should not be reached if 'if file' condition is handled.
+    return jsonify({"message": "An unknown file processing error occurred."}), 500
+
 
 @app.route('/summarize', methods=['POST'])
 def summarize_endpoint():