Spaces:
Running
Running
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
def get_text_from_content_for_doc(content) -> str:
    """Concatenate the text of every page of a parsed document.

    Args:
        content: Container that yields keys when iterated and returns, for
            each key, an item exposing a ``"texte"`` entry (presumably a
            dict mapping page id -> page data; confirm against the caller).

    Returns:
        All ``"texte"`` values joined in iteration order with no separator,
        or an empty string when ``content`` is empty.
    """
    # str.join assembles the result in a single pass instead of re-copying
    # the accumulated string on every ``+=``.
    return "".join(content[page]["texte"] for page in content)
def get_text_from_content_for_audio(content):
    """Return the transcription stored in an audio payload.

    Args:
        content: Object indexable by the ``"transcription"`` key
            (presumably a dict produced by the transcription step).

    Returns:
        Whatever value is stored under ``"transcription"`` (presumably the
        transcript text; confirm against the caller).
    """
    transcript = content["transcription"]
    return transcript
def get_text_chunks(text, *, chunk_size=500, chunk_overlap=100):
    """Split ``text`` into overlapping chunks.

    Args:
        text: The text to split.
        chunk_size: Maximum length of a chunk, measured with ``len`` (i.e.
            in characters). Defaults to 500.
        chunk_overlap: Length of the overlap between consecutive chunks,
            measured the same way. Defaults to 100.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_text``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,  # sizes are character counts (built-in len)
    )
    return splitter.split_text(text)
def get_vectorstore(text_chunks, *, model="text-embedding-3-small"):
    """Embed text chunks with OpenAI and index them in a FAISS vector store.

    Args:
        text_chunks: The texts to embed and index.
        model: Name of the OpenAI embedding model to use. Defaults to
            ``"text-embedding-3-small"``.

    Returns:
        The FAISS vector store built from ``text_chunks``.
    """
    # NOTE(review): OpenAIEmbeddings presumably needs API credentials
    # configured outside this function — confirm deployment settings.
    embedding = OpenAIEmbeddings(model=model)
    return FAISS.from_texts(texts=text_chunks, embedding=embedding)
def setup_rag(file_type, content):
    """Build a vector store for retrieval from parsed file content.

    Extracts the raw text according to ``file_type``, splits it into chunks
    and indexes the chunks in a vector store.

    Args:
        file_type: Either ``"pdf"`` or ``"audio"``.
        content: Parsed payload matching ``file_type``; it is passed to
            ``get_text_from_content_for_doc`` for ``"pdf"`` and to
            ``get_text_from_content_for_audio`` for ``"audio"``.

    Returns:
        The vector store returned by ``get_vectorstore``.

    Raises:
        ValueError: If ``file_type`` is neither ``"pdf"`` nor ``"audio"``.
    """
    # Reject unknown types up front: without this check ``text`` was never
    # assigned and the function crashed later with an UnboundLocalError.
    if file_type not in ("pdf", "audio"):
        raise ValueError(
            f"Unsupported file_type {file_type!r}; expected 'pdf' or 'audio'"
        )

    if file_type == "pdf":
        text = get_text_from_content_for_doc(content)
    else:
        text = get_text_from_content_for_audio(content)

    chunks = get_text_chunks(text)
    return get_vectorstore(chunks)