| ############################################################################################################################# | |
| # Filename : app.py | |
| # Description: A Streamlit application to turn an image to audio story. | |
| # Author : Georgios Ioannou | |
| # | |
| # Copyright © 2024 by Georgios Ioannou | |
| ############################################################################################################################# | |
| # Import libraries. | |
| import os # Load environment variable(s). | |
| import requests # Send HTTP POST requests to Hugging Face models for inference. | |
| import streamlit as st # Build the GUI of the application. | |
| from langchain.chat_models import ChatOpenAI # Access to OpenAI gpt-3.5-turbo model. | |
| from langchain.chains import LLMChain # Chain to run queries against LLMs. | |
| # A prompt template. It accepts a set of parameters from the user that can be used to generate a prompt for a language model. | |
| from langchain.prompts import PromptTemplate | |
| from transformers import pipeline # Access to Hugging Face models. | |
| ############################################################################################################################# | |
# Hugging Face API token read from the environment (None when the variable is unset).
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
| ############################################################################################################################# | |
| # Function to apply local CSS. | |
def local_css(file_name):
    """Inject the contents of a local CSS file into the Streamlit page.

    Args:
        file_name: Path of the CSS file to read.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the stylesheet is read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(file_name, encoding="utf-8") as css_file:
        css = css_file.read()

    # The CSS is wrapped in a <style> tag, so raw HTML must be allowed.
    st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)
| ############################################################################################################################# | |
| # Return the text generated by the model for the image. | |
| # Using pipeline. | |
@st.cache_resource(show_spinner=False)
def _load_captioning_pipeline(model_name):
    """Build the Hugging Face "image-to-text" pipeline for ``model_name``.

    Decorated with ``st.cache_resource`` so the pipeline is constructed once
    and reused, rather than being rebuilt on every call to ``img_to_text``.
    """
    return pipeline("image-to-text", model=model_name)


def img_to_text(image_path):
    """Return the caption generated by the captioning model for an image.

    Args:
        image_path: Image input handed straight to the Hugging Face
            "image-to-text" pipeline (the caller passes a file path).

    Returns:
        The ``"generated_text"`` entry of the first result produced by the
        pipeline.
    """
    # https://huggingface.co/tasks
    # Task used here : "image-to-text".
    # Model used here: "Salesforce/blip-image-captioning-base".
    # Backup model: "nlpconnect/vit-gpt2-image-captioning".
    # Backup model: "Salesforce/blip-image-captioning-large".
    image_to_text = _load_captioning_pipeline("Salesforce/blip-image-captioning-base")

    # The pipeline returns a list of results; the caption is in the first one.
    scenario = image_to_text(image_path)[0]["generated_text"]
    return scenario
| ############################################################################################################################# | |
| # Return the story generated by the model for the scenario. | |
| # Using Langchain. | |
def generate_story(scenario, personality):
    """Return a short story about ``scenario`` told in the voice of ``personality``.

    The prompt is built with a LangChain ``PromptTemplate`` and run through an
    ``LLMChain`` backed by the OpenAI "gpt-3.5-turbo" chat model.

    Args:
        scenario: Text describing the scene the story should be based on.
        personality: Name of the person the story teller should sound like.

    Returns:
        The text produced by the chain for the filled-in prompt.
    """
    # The wording can be adapted to other tasks, e.g. generating song lyrics
    # instead of a story.
    story_template = """
You are a story teller.
You must sound like {personality}.
The story should be less than 50 words.
Generate a story based on the above constraints and the following scenario: {scenario}.
"""
    story_prompt = PromptTemplate(
        input_variables=["scenario", "personality"],
        template=story_template,
    )

    # Temperature is pinned to 0; a higher value makes the output more varied.
    chat_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

    # verbose=True prints intermediate values to the console.
    chain = LLMChain(llm=chat_model, prompt=story_prompt, verbose=True)

    # Fill the template with the keyword arguments and send it to the model.
    return chain.predict(scenario=scenario, personality=personality)
| ############################################################################################################################# | |
| # Return the speech generated by the model for the story. | |
| # Using inference api. | |
def text_to_speech(story, timeout=120):
    """Synthesize ``story`` with the Hugging Face Inference API into "audio.flac".

    Args:
        story: Text to convert to speech.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``
            argument, so a stalled request cannot block the app forever.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
        OSError: If "audio.flac" cannot be written.
    """
    # Model used here: "espnet/kan-bayashi_ljspeech_vits".
    # Backup model: "facebook/mms-tts-eng".
    api_url = (
        "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    )
    # api_url = "https://api-inference.huggingface.co/models/facebook/mms-tts-eng"

    headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
    payload = {"inputs": story}

    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)

    # Without this check an error response body would be written to
    # "audio.flac" and later handed to the audio player as if it were audio.
    response.raise_for_status()

    with open("audio.flac", "wb") as file:
        file.write(response.content)
| ############################################################################################################################# | |
| # Main function to create the Streamlit web application. | |
def main():
    """Build the Streamlit page and run the image -> caption -> story -> audio flow.

    The page shows a title, subtitle and logo, lets the user pick a
    personality and upload an image, then calls ``img_to_text``,
    ``generate_story`` and ``text_to_speech`` and displays their results.
    Any exception raised while building the page is shown with ``st.error``;
    the footer link is rendered afterwards in every case.
    """
    # Imported locally because only this function needs it.
    import tempfile

    try:
        # Page title and favicon.
        st.set_page_config(page_title="Image To Audio Story", page_icon="🖼️")

        # Load CSS.
        local_css("styles/style.css")

        # Title.
        title = """<h1 align="center" style="font-family: monospace; font-size: 2.1rem; margin-top: -4rem">
Turn Image to Audio Story</h1>"""
        st.markdown(title, unsafe_allow_html=True)

        # Subtitle.
        subtitle = """<h2 align="center" style="font-family: monospace; font-size: 1.5rem; margin-top: -2rem">
CUNY Tech Prep Tutorial 1</h2>"""
        st.markdown(subtitle, unsafe_allow_html=True)

        # Logo, shown in the middle one of three columns.
        _, center_column, _ = st.columns(3)
        with center_column:
            st.image(image="./ctp.png")

        # Define the personalities for the dropdown menu.
        personalities = [
            "Donald Trump",
            "Abraham Lincoln",
            "Aristotle",
            "Cardi B",
            "Kanye West",
        ]
        personality = st.selectbox("Select a personality:", personalities)

        # Upload an image.
        uploaded_file = st.file_uploader("Choose an image:")

        if uploaded_file is not None:
            # Display the uploaded image.
            st.image(uploaded_file, caption="Uploaded Image.", use_column_width=True)

            # Store the upload in a temporary file rather than under its
            # client-supplied name in the working directory, where an upload
            # called e.g. "app.py" or "ctp.png" would overwrite the app's own
            # files. Only the extension of the original name is kept.
            suffix = os.path.splitext(uploaded_file.name)[1]
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
                tmp.write(uploaded_file.getvalue())
                image_path = tmp.name

            try:
                # Show a spinner while the three model calls run.
                with st.spinner(text="Model Inference..."):
                    scenario = img_to_text(image_path)
                    story = generate_story(scenario=scenario, personality=personality)
                    text_to_speech(story)
            finally:
                # The temporary copy is only needed for captioning.
                os.remove(image_path)

            # Display the scenario and story.
            with st.expander("Scenario"):
                st.write(scenario)
            with st.expander("Story"):
                st.write(story)

            # Display the audio.
            st.audio("audio.flac")
    except Exception as e:
        # Display any errors.
        st.error(e)

    # GitHub repository of author.
    st.markdown(
        """
<p align="center" style="font-family: monospace; color: #FAF9F6; font-size: 1rem;"><b> Check out our
<a href="https://github.com/GeorgiosIoannouCoder/" style="color: #FAF9F6;"> GitHub repository</a></b>
</p>
""",
        unsafe_allow_html=True,
    )
| ############################################################################################################################# | |
# Entry point: build the page only when this file is executed as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()