Christopher Capobianco committed
Commit fc8e190 · Parent(s): b1eea1f

Get document classifier to load properly

Files changed:
- Home.py +0 -1
- app.py +38 -1
- projects/01_Document_Classifier.py +30 -47
- projects/05_Stock_Market.py +15 -14
- projects/06_Generative_Music.py +20 -18
Home.py
@@ -20,7 +20,6 @@ with st.container():
     text_column, image_column = st.columns((3,1))
     with text_column:
         st.subheader("Document Classifier", divider="green")
-        st.warning("Work in Progress")
         st.markdown("""
         - Used OCR text and a Random Forest classification model to predict a document's classification
        - Trained on Real World Documents Collection at Kaggle
app.py
@@ -1,9 +1,40 @@
 import streamlit as st
+import spacy
+import pickle
+import subprocess
 
 # Page title
 st.set_page_config(page_title="Chris Capobianco's Profile", page_icon=':rocket:', layout='wide')
 
-home = st.Page('Home.py', title = 'Home')
+home = st.Page('Home.py', title = 'Home', default = True)
+
+# Function to Load the Spacy tokenizer
+@st.cache_resource
+def load_nlp():
+    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
+    return spacy.load('en_core_web_sm')
+
+def tokenizer(sentence):
+    # Process the text
+    doc = nlp(sentence)
+
+    # Convert tokens to lemma form for all except '-PRON-'
+    # Recall: Tokens like 'I', 'my', 'me' are represented as '-PRON-' by lemma attribute (See SpaCy Introduction)
+    tokens = [ token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_ for token in doc ]
+
+    # Remove stop words and punctuations
+    tokens = [ token for token in tokens if token not in stopwords and token not in punctuations ]
+
+    return tokens
+
+# Function to Load the model
+@st.cache_resource
+def load_tokenizer_model():
+    with open('./models/autoclassifier.pkl', 'rb') as model_file:
+        stopwords = pickle.load(model_file)
+        punctuations = pickle.load(model_file)
+        model_pipe = pickle.load(model_file)
+    return (stopwords, punctuations, model_pipe)
 
 document_classification = st.Page('projects/01_Document_Classifier.py', title='Document Classifier')
 movie_recommendation = st.Page('projects/02_Movie_Recommendation.py', title='Movie Recommendation')
@@ -29,3 +60,9 @@ pg = st.navigation(
 )
 
 pg.run()
+
+# Load the Spacy tokenizer
+nlp = load_nlp()
+
+# Load the Model
+stopwords, punctuations, model_pipe = load_tokenizer_model()
projects/01_Document_Classifier.py
@@ -7,38 +7,24 @@ import os
 import subprocess
 
 # Function to Load the Spacy tokenizer
-@st.
+@st.cache_resource
 def load_nlp():
     subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
     return spacy.load('en_core_web_sm')
 
-# Function to Initialze the OCR Engine
-@st.cache_resource
-def load_ocr_engine():
-    return easyocr.Reader(['en'])
-
 # Function to Load the model
 @st.cache_resource
-def
-    with open('models/autoclassifier.pkl', 'rb') as model_file:
+def load_tokenizer_model():
+    with open('./models/autoclassifier.pkl', 'rb') as model_file:
         stopwords = pickle.load(model_file)
         punctuations = pickle.load(model_file)
         model_pipe = pickle.load(model_file)
     return (stopwords, punctuations, model_pipe)
 
-# Function to
-def tokenizer(sentence):
-    # Process the text
-    doc = nlp(sentence)
-
-    # Convert tokens to lemma form for all except '-PRON-'
-    # Recall: Tokens like 'I', 'my', 'me' are represented as '-PRON-' by lemma attribute (See SpaCy Introduction)
-    tokens = [ token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_ for token in doc ]
-
-    # Remove stop words and punctuations
-    tokens = [ token for token in tokens if token not in stopwords and token not in punctuations ]
-
-    return tokens
+# Function to Initialze the OCR Engine
+@st.cache_resource
+def load_ocr_engine():
+    return easyocr.Reader(['en'])
 
 # Function to process uploaded images
 @st.cache_data
@@ -72,35 +58,32 @@ def autoclassifier(images):
         # Delete image file
        os.remove(image.name)
 
-
-st.header('Document Classifier', divider='green')
-
-st.warning("Work in Progress")
 
-[…]
-    "Choose an image to classify",
-    type=['png','jpg','jpeg'],
-    accept_multiple_files=True
-)
-[…]
+st.header('Document Classifier', divider='green')
+
+st.markdown("#### What is OCR?")
+st.markdown("OCR stands for Optical Character Recognition, and the technology for it has been around for over 30 years.")
+st.markdown("In this project, we leverage the extraction of the text from an image to classify the document. I am using EasyOCR as the OCR Engine, and I do some pre-processing of the raw OCR text to improve the quality of the words used to classify the documents.")
+st.markdown("After an investigation I settled on a Random Forest classifier for this project, since it had the best classification accuracy of the different models I investigated.")
+st.markdown("This project makes use of the [Real World Documents Collections](https://www.kaggle.com/datasets/shaz13/real-world-documents-collections) found at `Kaggle`")
+st.markdown("*This project is based off the tutorial by Animesh Giri [Intelligent Document Classification](https://www.kaggle.com/code/animeshgiri/intelligent-document-classification)*")
+st.markdown("*N.B. I created a similar document classifier in my first ML project, but that relied on IBM's Datacap for the OCR Engine. I also used a Support Vector Machine (SVM) classifier library (libsvm) at the time, but it was slow to train. I tried to re-create that document classifier again, using open source tools and modern techniques outlined in the referenced tutorial.*")
+st.divider()
+
+# Fetch uploaded images
+images = st.file_uploader(
+    "Choose an image to classify",
+    type=['png','jpg','jpeg'],
+    accept_multiple_files=True
+)
+
+# Load the Spacy tokenizer
+nlp = load_nlp()
+
+# Load the Model
+stopwords, punctuations, model_pipe = load_tokenizer_model()
+
+# Initialze the OCR Engine
+ocr_engine = load_ocr_engine()
+
+# Process and predict document classification
+autoclassifier(images)
projects/05_Stock_Market.py
@@ -5,25 +5,23 @@ from PIL import Image
 
 @st.cache_resource
 def load_model():
-[…]
+    with st.spinner(f"Fetching Models"):
+        model_file = open('./models/stock_market_model.pkl', 'rb')
+        amazon_predictions = pickle.load(model_file)
+        amazon_scores = pickle.load(model_file)
+        google_predictions = pickle.load(model_file)
+        google_scores = pickle.load(model_file)
+        ibm_predictions = pickle.load(model_file)
+        ibm_scores = pickle.load(model_file)
+        microsoft_predictions = pickle.load(model_file)
+        microsoft_scores = pickle.load(model_file)
+        model_file.close()
+    return amazon_predictions, amazon_scores, google_predictions, google_scores, ibm_predictions, ibm_scores, microsoft_predictions, microsoft_scores
 
 # Load Image
 gru = Image.open("assets/gru.png")
 nn = Image.open("assets/nn.png")
 
-# Load the Model
-amazon_predictions, amazon_scores, google_predictions, google_scores, ibm_predictions, ibm_scores, microsoft_predictions, microsoft_scores = load_model()
-
 st.header('Stock Market Forecast', divider='green')
 
 st.markdown("#### Time Series Forecasting")
@@ -42,6 +40,9 @@ st.divider()
 
 st.markdown("Below each graph is the mean square error (MSE) for the train and test sets, where the test set consists of the last 20 days.")
 
+# Load the Model
+amazon_predictions, amazon_scores, google_predictions, google_scores, ibm_predictions, ibm_scores, microsoft_predictions, microsoft_scores = load_model()
+
 fig1 = go.Figure()
 fig1.add_trace(go.Scatter(go.Scatter(x=amazon_predictions['Date'], y=amazon_predictions['Train Prediction'],
     mode='lines',
projects/06_Generative_Music.py
@@ -9,18 +9,20 @@ from scipy.io import wavfile
 
 @st.cache_resource
 def load_notes():
-[…]
+    with st.spinner(f"Fetching Notes"):
+        notes_filepath = 'models/music_notes.pkl'
+        with open(notes_filepath, 'rb') as filepath:
+            notes = pickle.load(filepath)
+            pitchnames = pickle.load(filepath)
+            n_vocab = pickle.load(filepath)
+    return (notes, pitchnames, n_vocab)
 
 @st.cache_resource
 def model_load():
-[…]
+    with st.spinner(f"Fetching Model"):
+        model_filepath = 'models/music_model.keras'
+        model = load_model(model_filepath)
+    return model
 
 @st.cache_data
 def prepare_sequences(notes, pitchnames, n_vocab, sequence_length=100):
@@ -109,15 +111,6 @@ def generate(model, network_input, pitchnames, n_vocab, nlength=500, istart=-1):
 
 st.header('Generative Music', divider='green')
 
-# Load notes
-notes, pitchnames, n_vocab = load_notes()
-
-# Prepare note sequences
-network_input = prepare_sequences(notes, pitchnames, n_vocab)
-
-# Load model
-model = model_load()
-
 st.markdown("#### What are Recurrent Neural Networks?")
 st.markdown("A recurrent neural network is a class of artificial neural networks that make use of sequential information. They are called recurrent because they perform the same function for every single element of a sequence, with the result being dependent on previous computations. Whereas outputs are independent of previous computations in traditional neural networks.")
 st.markdown("In this project we will use a **Long Short-Term Memory** (LSTM) network. They are a type of Recurrent Neural Network that can efficiently learn via gradient descent. Using a gating mechanism, LSTMs are able to recognise and encode long-term patterns. LSTMs are extremely useful to solve problems where the network has to remember information for a long period of time as is the case in music and text generation.")
@@ -130,6 +123,15 @@ st.markdown("It may be possible to improve this model by playing around with the
 st.markdown("*This is based off the tutorial by Sigurður Skúli [How to Generate Music using a LSTM Neural Network in Keras](https://towardsdatascience.com/how-to-generate-music-using-a-lstm-neural-network-in-keras-68786834d4c5)*")
 st.divider()
 
+# Load notes
+notes, pitchnames, n_vocab = load_notes()
+
+# Prepare note sequences
+network_input = prepare_sequences(notes, pitchnames, n_vocab)
+
+# Load model
+model = model_load()
+
 midi_file = None
 generated_midi = None
 sample_midi = None