Christopher Capobianco committed
Commit fc8e190 · Parent(s): b1eea1f

Get document classifier to load properly

Files changed:
- Home.py +0 -1
- app.py +38 -1
- projects/01_Document_Classifier.py +30 -47
- projects/05_Stock_Market.py +15 -14
- projects/06_Generative_Music.py +20 -18
Home.py
@@ -20,7 +20,6 @@ with st.container():
     text_column, image_column = st.columns((3,1))
     with text_column:
         st.subheader("Document Classifier", divider="green")
-        st.warning("Work in Progress")
         st.markdown("""
         - Used OCR text and a Random Forest classification model to predict a document's classification
        - Trained on Real World Documents Collection at Kaggle
app.py
@@ -1,9 +1,40 @@
 import streamlit as st
+import spacy
+import pickle
+import subprocess
 
 # Page title
 st.set_page_config(page_title="Chris Capobianco's Profile", page_icon=':rocket:', layout='wide')
 
-home = st.Page('Home.py', title = 'Home')
+home = st.Page('Home.py', title = 'Home', default = True)
+
+# Function to Load the Spacy tokenizer
+@st.cache_resource
+def load_nlp():
+    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
+    return spacy.load('en_core_web_sm')
+
+def tokenizer(sentence):
+    # Process the text
+    doc = nlp(sentence)
+
+    # Convert tokens to lemma form for all except '-PRON-'
+    # Recall: Tokens like 'I', 'my', 'me' are represented as '-PRON-' by lemma attribute (See SpaCy Introduction)
+    tokens = [ token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_ for token in doc ]
+
+    # Remove stop words and punctuations
+    tokens = [ token for token in tokens if token not in stopwords and token not in punctuations ]
+
+    return tokens
+
+# Function to Load the model
+@st.cache_resource
+def load_tokenizer_model():
+    with open('./models/autoclassifier.pkl', 'rb') as model_file:
+        stopwords = pickle.load(model_file)
+        punctuations = pickle.load(model_file)
+        model_pipe = pickle.load(model_file)
+    return (stopwords, punctuations, model_pipe)
 
 document_classification = st.Page('projects/01_Document_Classifier.py', title='Document Classifier')
 movie_recommendation = st.Page('projects/02_Movie_Recommendation.py', title='Movie Recommendation')
@@ -29,3 +60,9 @@ pg = st.navigation(
 )
 
 pg.run()
+
+# Load the Spacy tokenizer
+nlp = load_nlp()
+
+# Load the Model
+stopwords, punctuations, model_pipe = load_tokenizer_model()
projects/01_Document_Classifier.py
@@ -7,38 +7,24 @@ import os
 import subprocess
 
 # Function to Load the Spacy tokenizer
-@st.
+@st.cache_resource
 def load_nlp():
     subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
     return spacy.load('en_core_web_sm')
 
-# Function to Initialze the OCR Engine
-@st.cache_resource
-def load_ocr_engine():
-    return easyocr.Reader(['en'])
-
 # Function to Load the model
 @st.cache_resource
-def
-    with open('models/autoclassifier.pkl', 'rb') as model_file:
+def load_tokenizer_model():
+    with open('./models/autoclassifier.pkl', 'rb') as model_file:
         stopwords = pickle.load(model_file)
         punctuations = pickle.load(model_file)
         model_pipe = pickle.load(model_file)
     return (stopwords, punctuations, model_pipe)
 
-# Function to
-def tokenizer(sentence):
-    # Process the text
-    doc = nlp(sentence)
-
-    # Convert tokens to lemma form for all except '-PRON-'
-    # Recall: Tokens like 'I', 'my', 'me' are represented as '-PRON-' by lemma attribute (See SpaCy Introduction)
-    tokens = [ token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_ for token in doc ]
-
-    # Remove stop words and punctuations
-    tokens = [ token for token in tokens if token not in stopwords and token not in punctuations ]
-
-    return tokens
+# Function to Initialze the OCR Engine
+@st.cache_resource
+def load_ocr_engine():
+    return easyocr.Reader(['en'])
 
 # Function to process uploaded images
 @st.cache_data
@@ -72,35 +58,32 @@ def autoclassifier(images):
         # Delete image file
        os.remove(image.name)
 
-
-st.header('Document Classifier', divider='green')
-
-st.warning("Work in Progress")
 
-[…]
-    "Choose an image to classify",
-    type=['png','jpg','jpeg'],
-    accept_multiple_files=True
-)
-[…]
+st.header('Document Classifier', divider='green')
+
+st.markdown("#### What is OCR?")
+st.markdown("OCR stands for Optical Character Recognition, and the technology for it has been around for over 30 years.")
+st.markdown("In this project, we leverage the extraction of the text from an image to classify the document. I am using EasyOCR as the OCR Engine, and I do some pre-processing of the raw OCR text to improve the quality of the words used to classify the documents.")
+st.markdown("After an investigation I settled on a Random Forest classifier for this project, since it had the best classification accuracy of the different models I investigated.")
+st.markdown("This project makes use of the [Real World Documents Collections](https://www.kaggle.com/datasets/shaz13/real-world-documents-collections) found at `Kaggle`")
+st.markdown("*This project is based off the tutorial by Animesh Giri [Intelligent Document Classification](https://www.kaggle.com/code/animeshgiri/intelligent-document-classification)*")
+st.markdown("*N.B. I created a similar document classifier in my first ML project, but that relied on IBM's Datacap for the OCR Engine. I also used a Support Vector Machine (SVM) classifier library (libsvm) at the time, but it was slow to train. I tried to re-create that document classifier again, using open source tools and modern techniques outlined in the referenced tutorial.*")
+st.divider()
+
+# Fetch uploaded images
+images = st.file_uploader(
+    "Choose an image to classify",
+    type=['png','jpg','jpeg'],
+    accept_multiple_files=True
+)
+
+# Load the Spacy tokenizer
+nlp = load_nlp()
+
+# Load the Model
+stopwords, punctuations, model_pipe = load_tokenizer_model()
+
+# Initialze the OCR Engine
+ocr_engine = load_ocr_engine()
+
+# Process and predict document classification
+autoclassifier(images)
projects/05_Stock_Market.py
@@ -5,25 +5,23 @@ from PIL import Image
 
 @st.cache_resource
 def load_model():
-[…]
+    with st.spinner(f"Fetching Models"):
+        model_file = open('./models/stock_market_model.pkl', 'rb')
+        amazon_predictions = pickle.load(model_file)
+        amazon_scores = pickle.load(model_file)
+        google_predictions = pickle.load(model_file)
+        google_scores = pickle.load(model_file)
+        ibm_predictions = pickle.load(model_file)
+        ibm_scores = pickle.load(model_file)
+        microsoft_predictions = pickle.load(model_file)
+        microsoft_scores = pickle.load(model_file)
+        model_file.close()
+    return amazon_predictions, amazon_scores, google_predictions, google_scores, ibm_predictions, ibm_scores, microsoft_predictions, microsoft_scores
 
 # Load Image
 gru = Image.open("assets/gru.png")
 nn = Image.open("assets/nn.png")
 
-# Load the Model
-amazon_predictions, amazon_scores, google_predictions, google_scores, ibm_predictions, ibm_scores, microsoft_predictions, microsoft_scores = load_model()
-
 st.header('Stock Market Forecast', divider='green')
 
 st.markdown("#### Time Series Forecasting")
@@ -42,6 +40,9 @@ st.divider()
 
 st.markdown("Below each graph is the mean square error (MSE) for the train and test sets, where the test set consists of the last 20 days.")
 
+# Load the Model
+amazon_predictions, amazon_scores, google_predictions, google_scores, ibm_predictions, ibm_scores, microsoft_predictions, microsoft_scores = load_model()
+
 fig1 = go.Figure()
 fig1.add_trace(go.Scatter(go.Scatter(x=amazon_predictions['Date'], y=amazon_predictions['Train Prediction'],
     mode='lines',
projects/06_Generative_Music.py
@@ -9,18 +9,20 @@ from scipy.io import wavfile
 
 @st.cache_resource
 def load_notes():
-[…]
+    with st.spinner(f"Fetching Notes"):
+        notes_filepath = 'models/music_notes.pkl'
+        with open(notes_filepath, 'rb') as filepath:
+            notes = pickle.load(filepath)
+            pitchnames = pickle.load(filepath)
+            n_vocab = pickle.load(filepath)
+    return (notes, pitchnames, n_vocab)
 
 @st.cache_resource
 def model_load():
-[…]
+    with st.spinner(f"Fetching Model"):
+        model_filepath = 'models/music_model.keras'
+        model = load_model(model_filepath)
+    return model
 
 @st.cache_data
 def prepare_sequences(notes, pitchnames, n_vocab, sequence_length=100):
@@ -109,15 +111,6 @@ def generate(model, network_input, pitchnames, n_vocab, nlength=500, istart=-1):
 
 st.header('Generative Music', divider='green')
 
-# Load notes
-notes, pitchnames, n_vocab = load_notes()
-
-# Prepare note sequences
-network_input = prepare_sequences(notes, pitchnames, n_vocab)
-
-# Load model
-model = model_load()
-
 st.markdown("#### What are Recurrent Neural Networks?")
 st.markdown("A recurrent neural network is a class of artificial neural networks that make use of sequential information. They are called recurrent because they perform the same function for every single element of a sequence, with the result being dependent on previous computations. Whereas outputs are independent of previous computations in traditional neural networks.")
 st.markdown("In this project we will use a **Long Short-Term Memory** (LSTM) network. They are a type of Recurrent Neural Network that can efficiently learn via gradient descent. Using a gating mechanism, LSTMs are able to recognise and encode long-term patterns. LSTMs are extremely useful to solve problems where the network has to remember information for a long period of time as is the case in music and text generation.")
@@ -130,6 +123,15 @@ st.markdown("It may be possible to improve this model by playing around with the
 st.markdown("*This is based off the tutorial by Sigurður Skúli [How to Generate Music using a LSTM Neural Network in Keras](https://towardsdatascience.com/how-to-generate-music-using-a-lstm-neural-network-in-keras-68786834d4c5)*")
 st.divider()
 
+# Load notes
+notes, pitchnames, n_vocab = load_notes()
+
+# Prepare note sequences
+network_input = prepare_sequences(notes, pitchnames, n_vocab)
+
+# Load model
+model = model_load()
+
 midi_file = None
 generated_midi = None
 sample_midi = None