Commit 031e5e2
Duplicate from ppsingh/cpu-demo
Files changed:
- .gitattributes +35 -0
- .gitignore +1 -0
- .vscode/launch.json +20 -0
- README.md +13 -0
- app.py +20 -0
- appStore/__init__.py +1 -0
- appStore/adapmit.py +212 -0
- appStore/info.py +67 -0
- appStore/multiapp.py +67 -0
- appStore/netzero.py +206 -0
- appStore/sector.py +211 -0
- appStore/target.py +211 -0
- docStore/img/dsc_giz.png +0 -0
- docStore/img/ndc.png +0 -0
- docStore/img/paris.png +0 -0
- docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt +737 -0
- docStore/sample/Seychelles-revised_first_ndc-EN.pdf +0 -0
- docStore/sample/South Africa_s Low Emission Development Strategy.pdf +3 -0
- docStore/sample/files.json +4 -0
- packages.txt +4 -0
- paramconfig.cfg +39 -0
- requirements.txt +19 -0
- style.css +180 -0
- utils/__init__.py +1 -0
- utils/adapmit_classifier.py +136 -0
- utils/config.py +31 -0
- utils/netzero_classifier.py +137 -0
- utils/preprocessing.py +275 -0
- utils/sector_classifier.py +146 -0
- utils/target_classifier.py +138 -0
- utils/uploadAndExample.py +33 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+docStore/sample/South[[:space:]]Africa_s[[:space:]]Low[[:space:]]Emission[[:space:]]Development[[:space:]]Strategy.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
+__pycache__
.vscode/launch.json
ADDED
@@ -0,0 +1,20 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Streamlit",
+            "type": "python",
+            "request": "launch",
+            "program": ".venv/bin/streamlit",
+            "args": [
+                "run",
+                "app.py"
+            ],
+            "console": "integratedTerminal",
+            "justMyCode": false
+        }
+    ]
+}
README.md
ADDED
@@ -0,0 +1,13 @@
+---
+title: Cpu Demo
+emoji: 🦀
+colorFrom: blue
+colorTo: pink
+sdk: streamlit
+sdk_version: 1.19.0
+app_file: app.py
+pinned: false
+duplicated_from: ppsingh/cpu-demo
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,20 @@
+import appStore.target as target_extraction
+import appStore.netzero as netzero
+import appStore.sector as sector
+import appStore.adapmit as adapmit
+# import appStore.info as info
+from appStore.multiapp import MultiApp
+import streamlit as st
+
+st.set_page_config(page_title = 'Climate Policy Intelligence',
+                   initial_sidebar_state='expanded', layout="wide")
+
+app = MultiApp()
+
+# app.add_app("About","house", info.app)
+app.add_app("Economy-Wide Target Extraction","gear",target_extraction.app)
+app.add_app("NetZero Target Extraction","gear", netzero.app)
+app.add_app("Sector Classification","gear", sector.app)
+app.add_app("Adaptation-Mitigation","gear", adapmit.app)
+
+app.run()
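
The commented-out lines indicate how an extra page such as the About screen in appStore/info.py would be wired in. A minimal sketch of re-enabling it, assuming info.app renders that page:

import appStore.info as info
from appStore.multiapp import MultiApp
import streamlit as st

st.set_page_config(page_title='Climate Policy Intelligence', layout="wide")
app = MultiApp()
app.add_app("About", "house", info.app)  # title, sidebar icon, render function
app.run()
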
appStore/__init__.py
ADDED
@@ -0,0 +1 @@
+# creating appstore package
appStore/adapmit.py
ADDED
@@ -0,0 +1,212 @@
+# set path
+import glob, os, sys
+sys.path.append('../utils')
+
+#import needed libraries
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import streamlit as st
+# from st_aggrid import AgGrid
+# from st_aggrid.shared import ColumnsAutoSizeMode
+from utils.adapmit_classifier import adapmit_classification
+from utils.adapmit_classifier import runAdapMitPreprocessingPipeline, load_adapmitClassifier
+# from utils.keyword_extraction import textrank
+import logging
+logger = logging.getLogger(__name__)
+from utils.config import get_classifier_params
+from utils.preprocessing import paraLengthCheck
+from io import BytesIO
+import xlsxwriter
+import plotly.express as px
+
+# Declare all the necessary variables
+classifier_identifier = 'adapmit'
+params = get_classifier_params(classifier_identifier)
+
+@st.cache_data
+def to_excel(df):
+    len_df = len(df)
+    output = BytesIO()
+    writer = pd.ExcelWriter(output, engine='xlsxwriter')
+    df.to_excel(writer, index=False, sheet_name='Sheet1')
+    workbook = writer.book
+    worksheet = writer.sheets['Sheet1']
+    worksheet.data_validation('E2:E{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': ['No', 'Yes', 'Discard']})
+    worksheet.data_validation('F2:F{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': ['No', 'Yes', 'Discard']})
+    worksheet.data_validation('G2:G{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': ['No', 'Yes', 'Discard']})
+    writer.save()
+    processed_data = output.getvalue()
+    return processed_data
+
+def app():
+
+    #### APP INFO #####
+    with st.container():
+        st.markdown("<h1 style='text-align: center; color: black;'> Adaptation-Mitigation Classification </h1>", unsafe_allow_html=True)
+        st.write(' ')
+        st.write(' ')
+
+    with st.expander("ℹ️ - About this app", expanded=False):
+
+        st.write(
+            """
+            The **Adaptation-Mitigation Classification** app is an easy-to-use interface built \
+            in Streamlit for analyzing policy documents for \
+            Classification of the paragraphs/texts in the document *If it \
+            belongs to 'Adaptation' and 'Mitigation' category or not. The paragraph \
+            can belong to both category too. \
+            - developed by GIZ Data Service Center, GFA, IKI Tracs, \
+            SV Klima and SPA. \n
+            """)
+        st.write("""**Document Processing:** The Uploaded/Selected document is \
+            automatically cleaned and split into paragraphs with a maximum \
+            length of 60 words using a Haystack preprocessing pipeline. The \
+            length of 60 is an empirical value which should reflect the length \
+            of a “context” and should limit the paragraph length deviation. \
+            However, since we want to respect the sentence boundary the limit \
+            can breach and hence this limit of 60 is tentative. \n
+            """)
+
+        st.write("")
+
+    ### Main app code ###
+    with st.container():
+        if st.button("RUN Adaptation-Mitigation Classification"):
+            if 'key4' not in st.session_state:
+                st.session_state['key4'] = None
+
+            if 'filepath' in st.session_state:
+                file_name = st.session_state['filename']
+                file_path = st.session_state['filepath']
+
+
+                all_documents = runAdapMitPreprocessingPipeline(file_name= file_name,
+                                file_path= file_path, split_by= params['split_by'],
+                                split_length= params['split_length'],
+                                split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
+                                split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
+                classifier = load_adapmitClassifier(classifier_name=params['model_name'])
+                st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+                verified_paralist = paraLengthCheck(all_documents['paraList'], 100)
+                if len(verified_paralist) > 100:
+                    warning_msg = ": This might take sometime, please sit back and relax."
+                else:
+                    warning_msg = ""
+
+                # # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
+                df = adapmit_classification(haystack_doc=verified_paralist,
+                                            threshold= params['threshold'])
+
+                threshold= params['threshold']
+                truth_df = df.drop(['text'],axis=1)
+                truth_df = truth_df.astype(float) >= threshold
+                truth_df = truth_df.astype(str)
+                categories = list(truth_df.columns)
+
+                placeholder = {}
+                for val in categories:
+                    placeholder[val] = dict(truth_df[val].value_counts())
+                count_df = pd.DataFrame.from_dict(placeholder)
+                count_df = count_df.T
+                count_df = count_df.reset_index()
+                # st.write(count_df)
+                placeholder = []
+                for i in range(len(count_df)):
+                    placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'Yes'])
+                    placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'No'])
+                count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
+                # st.write("Total Paragraphs: {}".format(len(df)))
+                fig = px.bar(count_df, y='category', x='count',
+                             color='truth_value',orientation='h', height =200)
+                c1, c2 = st.columns([1,1])
+                with c1:
+                    st.plotly_chart(fig,use_container_width= True)
+
+                truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
+                truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
+                # st.write(truth_df)
+                df = pd.concat([df,truth_df['labels']],axis=1)
+                st.markdown("###### Top few 'Mitigation' related paragraph/text ######")
+                df = df.sort_values(by = ['Mitigation'], ascending=False)
+                for i in range(3):
+                    if df.iloc[i]['Mitigation'] >= 0.50:
+                        st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i+1,df.iloc[i]['Mitigation']))
+                        st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))
+
+                st.markdown("###### Top few 'Adaptation' related paragraph/text ######")
+                df = df.sort_values(by = ['Adaptation'], ascending=False)
+                for i in range(3):
+                    if df.iloc[i]['Adaptation'] > 0.5:
+                        st.write('**Result {}** (Relevancy Score: {:.2f})'.format(i+1,df.iloc[i]['Adaptation']))
+                        st.write("\t Text: \t{}".format(df.iloc[i]['text'].replace("\n", " ")))
+                # st.write(df[['text','labels']])
+                df['Validation'] = 'No'
+                df['Val-Mitigation'] = 'No'
+                df['Val-Adaptation'] = 'No'
+                df_xlsx = to_excel(df)
+                st.download_button(label='📥 Download Current Result',
+                                   data=df_xlsx ,
+                                   file_name= 'file_adaptation-mitigation.xlsx')
+                # st.session_state.key4 =
+
+                # category =set(df.columns)
+                # removecols = {'Validation','Val-Adaptation','Val-Mitigation','text'}
+                # category = list(category - removecols)
+
+            else:
+                st.info("🤔 No document found, please try to upload it at the sidebar!")
+                logging.warning("Terminated as no document provided")
+
+    # # Creating truth value dataframe
+    # if 'key4' in st.session_state:
+    #     if st.session_state.key4 is not None:
+    #         df = st.session_state.key4
+    #         st.markdown("###### Select the threshold for classifier ######")
+    #         c4, c5 = st.columns([1,1])
+
+    #         with c4:
+    #             threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
+    #                                   step=0.01, value=0.5,
+    #                                   help = "Keep High Value if want refined result, low if dont want to miss anything" )
+    #         category =set(df.columns)
+    #         removecols = {'Validation','Val-Adaptation','Val-Mitigation','text'}
+    #         category = list(category - removecols)
+
+    #         placeholder = {}
+    #         for val in category:
+    #             temp = df[val].astype(float) > threshold
+    #             temp = temp.astype(str)
+    #             placeholder[val] = dict(temp.value_counts())
+
+    #         count_df = pd.DataFrame.from_dict(placeholder)
+    #         count_df = count_df.T
+    #         count_df = count_df.reset_index()
+    #         placeholder = []
+    #         for i in range(len(count_df)):
+    #             placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'False'])
+    #             placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'True'])
+
+    #         count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
+    #         fig = px.bar(count_df, x='category', y='count',
+    #                      color='truth_value',
+    #                      height=400)
+    #         st.write("")
+    #         st.plotly_chart(fig)
+
+    #         df['Validation'] = 'No'
+    #         df['Val-Mitigation'] = 'No'
+    #         df['Val-Adaptation'] = 'No'
+    #         df_xlsx = to_excel(df)
+    #         st.download_button(label='📥 Download Current Result',
+    #                            data=df_xlsx ,
+    #                            file_name= 'file_adaptation-mitigation.xlsx')
+
+
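
The block above turns the per-category scores returned by adapmit_classification into Yes/No counts and a label list per paragraph by comparing each score against the configured threshold. A minimal standalone sketch of that thresholding step (the scores and threshold below are illustrative; in the app they come from the classifier output and paramconfig.cfg):

import pandas as pd

# Illustrative multi-label scores; in the app these come from adapmit_classification().
scores = pd.DataFrame({'Adaptation': [0.91, 0.12, 0.55],
                       'Mitigation': [0.08, 0.77, 0.60]})
threshold = 0.5  # in the app: params['threshold']

# Boolean mask per category, then collect the category names that clear the threshold.
mask = scores >= threshold
labels = mask.apply(lambda row: [c for c in scores.columns if row[c]], axis=1)
print(labels.tolist())  # [['Adaptation'], ['Mitigation'], ['Adaptation', 'Mitigation']]
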
appStore/info.py
ADDED
@@ -0,0 +1,67 @@
+import streamlit as st
+import os
+from PIL import Image
+_ROOT = os.path.abspath(os.path.dirname(__file__))
+def get_data(path):
+    return os.path.join(_ROOT, 'data', path)
+
+def app():
+
+
+    with open('style.css') as f:
+        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
+
+    st.markdown("<h2 style='text-align: center; \
+                color: black;'> Climate Policy Understanding App</h2>",
+                unsafe_allow_html=True)
+
+
+    st.markdown("<div style='text-align: center; \
+                color: grey;'>Climate Policy Understanding App is an open-source\
+                digital tool which aims to assist policy analysts and \
+                other users in extracting and filtering relevant \
+                information from public documents.</div>",
+                unsafe_allow_html=True)
+    footer = """
+        <div class="footer-custom">
+            Guidance & Feedback - <a>Nadja Taeger</a> |<a>Marie Hertel</a> | <a>Cecile Schneider</a> |
+            Developer - <a href="https://www.linkedin.com/in/erik-lehmann-giz/" target="_blank">Erik Lehmann</a> |
+            <a href="https://www.linkedin.com/in/prashantpsingh/" target="_blank">Prashant Singh</a> |
+
+        </div>
+    """
+    st.markdown(footer, unsafe_allow_html=True)
+
+    c1, c2, c3 = st.columns([8,1,12])
+    with c1:
+        image = Image.open('docStore/img/ndc.png')
+        st.image(image)
+    with c3:
+        st.markdown('<div style="text-align: justify;">The manual extraction \
+                    of relevant information from text documents is a \
+                    time-consuming task for any policy analysts. As the amount and length of \
+                    public policy documents in relation to sustainable development (such as \
+                    National Development Plans and Nationally Determined Contributions) \
+                    continuously increases, a major challenge for policy action tracking – the \
+                    evaluation of stated goals and targets and their actual implementation on \
+                    the ground – arises. Luckily, Artificial Intelligence (AI) and Natural \
+                    Language Processing (NLP) methods can help in shortening and easing this \
+                    task for policy analysts.</div><br>',
+                    unsafe_allow_html=True)
+
+    intro = """
+    <div style="text-align: justify;">
+
+    For this purpose, IKI Tracs, SV KLIMA, SPA and Data Service Center (Deutsche Gesellschaft für Internationale \
+    Zusammenarbeit (GIZ) GmbH) are collaborating since 2022 in the development \
+    of an AI-powered open-source web application that helps find and extract \
+    relevant information from public policy documents faster to facilitate \
+    evidence-based decision-making processes in sustainable development and beyond.
+
+
+    </div>
+    <br>
+    """
+    st.markdown(intro, unsafe_allow_html=True)
+    image2 = Image.open('docStore/img/paris.png')
+    st.image(image2)
appStore/multiapp.py
ADDED
@@ -0,0 +1,67 @@
+"""Frameworks for running multiple Streamlit applications as a single app.
+"""
+import streamlit as st
+from PIL import Image
+from utils.uploadAndExample import add_upload
+
+class MultiApp:
+    """Framework for combining multiple streamlit applications.
+    Usage:
+        def foo():
+            st.title("Hello Foo")
+        def bar():
+            st.title("Hello Bar")
+        app = MultiApp()
+        app.add_app("Foo", foo)
+        app.add_app("Bar", bar)
+        app.run()
+    It is also possible keep each application in a separate file.
+        import foo
+        import bar
+        app = MultiApp()
+        app.add_app("Foo", foo.app)
+        app.add_app("Bar", bar.app)
+        app.run()
+    """
+    def __init__(self):
+        self.apps = []
+
+    def add_app(self,title,icon, func):
+        """Adds a new application.
+        Parameters
+        ----------
+        func:
+            the python function to render this app.
+        title:
+            title of the app. Appears in the dropdown in the sidebar.
+        """
+        self.apps.append({
+            "title": title,
+            "icon": icon,
+            "function": func
+        })
+
+    def run(self):
+
+        st.sidebar.write(format_func=lambda app: app['title'])
+        #image = Image.open('docStore/img/dsc_giz.png')
+        #st.sidebar.image(image, width =200)
+
+        with st.sidebar:
+            selected = st.selectbox("Select the Task to perform", [page["title"] for page in self.apps],)
+            st.markdown("---")
+
+
+        for index, item in enumerate(self.apps):
+            if item["title"] == selected:
+                self.apps[index]["function"]()
+                break
+
+
+        choice = st.sidebar.radio(label = 'Select the Document',
+                                  help = 'You can upload the document \
+                                  or else you can try a example document',
+                                  options = ('Upload Document', 'Try Example'),
+                                  horizontal = True)
+        add_upload(choice)
+
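
Note that the class docstring still shows the older two-argument add_app calls, while the committed signature is add_app(self, title, icon, func), as used in app.py. A minimal usage sketch following the committed signature (the page function is illustrative):

import streamlit as st
from appStore.multiapp import MultiApp

def foo():
    # Illustrative page body; the real pages live in appStore/*.py as app() functions.
    st.title("Hello Foo")

app = MultiApp()
app.add_app("Foo", "house", foo)  # title, sidebar icon, render function
app.run()
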
appStore/netzero.py
ADDED
@@ -0,0 +1,206 @@
+# set path
+import glob, os, sys;
+sys.path.append('../utils')
+
+#import needed libraries
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import streamlit as st
+# from st_aggrid import AgGrid
+# from st_aggrid.shared import ColumnsAutoSizeMode
+from utils.netzero_classifier import netzero_classification
+from utils.netzero_classifier import runNetZeroPreprocessingPipeline, load_netzeroClassifier
+# from utils.keyword_extraction import textrank
+import logging
+logger = logging.getLogger(__name__)
+from utils.config import get_classifier_params
+from io import BytesIO
+import xlsxwriter
+import plotly.express as px
+
+
+# Declare all the necessary variables
+classifier_identifier = 'netzero'
+params = get_classifier_params(classifier_identifier)
+
+# Labels dictionary ###
+_lab_dict = {
+    'NEGATIVE':'NO NETZERO TARGET',
+    'NETZERO':'NETZERO TARGET',
+}
+
+
+@st.cache_data
+def to_excel(df):
+    len_df = len(df)
+    output = BytesIO()
+    writer = pd.ExcelWriter(output, engine='xlsxwriter')
+    df.to_excel(writer, index=False, sheet_name='Sheet1')
+    workbook = writer.book
+    worksheet = writer.sheets['Sheet1']
+    worksheet.data_validation('E2:E{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': ['No', 'Yes', 'Discard']})
+    writer.save()
+    processed_data = output.getvalue()
+    return processed_data
+
+def app():
+
+    #### APP INFO #####
+    with st.container():
+        st.markdown("<h1 style='text-align: center; color: black;'> NetZero Target Extraction </h1>", unsafe_allow_html=True)
+        st.write(' ')
+        st.write(' ')
+
+    with st.expander("ℹ️ - About this app", expanded=False):
+
+        st.write(
+            """
+            The **NetZero Extraction** app is an easy-to-use interface built \
+            in Streamlit for analyzing policy documents for \
+            Classification of the paragraphs/texts in the document *If it \
+            contains any Net-Zero target related information* - \
+            developed by GIZ Data Service Center, GFA, IKI Tracs, \
+            SV Klima and SPA. \n
+            """)
+        st.write("""**Document Processing:** The Uploaded/Selected document is \
+            automatically cleaned and split into paragraphs with a maximum \
+            length of 60 words using a Haystack preprocessing pipeline. The \
+            length of 60 is an empirical value which should reflect the length \
+            of a “context” and should limit the paragraph length deviation. \
+            However, since we want to respect the sentence boundary the limit \
+            can breach and hence this limit of 60 is tentative. \n
+            """)
+
+        st.write("")
+
+    ### Main app code ###
+    with st.container():
+        if st.button("RUN NetZero Related Paragraph Extractions"):
+            if 'key2' not in st.session_state:
+                st.session_state['key2'] = None
+
+            if 'filepath' in st.session_state:
+                file_name = st.session_state['filename']
+                file_path = st.session_state['filepath']
+
+                # Do the preprocessing of the PDF
+
+                all_documents = runNetZeroPreprocessingPipeline(file_name= file_name,
+                                file_path= file_path, split_by= params['split_by'],
+                                split_length= params['split_length'],
+                                split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
+                                split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
+
+                # st.dataframe(all_documents['documents'])
+
+                # Load the classifier model
+
+                classifier = load_netzeroClassifier(classifier_name=params['model_name'])
+                st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+
+                if len(all_documents['documents']) > 100:
+                    warning_msg = ": This might take sometime, please sit back and relax."
+                else:
+                    warning_msg = ""
+
+                # #st.write(all_documents['documents'],_lab_dict,classifier_identifier,params['threshold'])
+                # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
+
+                df = netzero_classification(haystack_doc=all_documents['documents'],
+                                            threshold= params['threshold'])
+                st.session_state.key2 = df
+                hits = df[df['Target Label'] == 'NETZERO']
+                range_val = min(5,len(hits))
+                if range_val !=0:
+                    count_df = df['Target Label'].value_counts()
+                    count_df = count_df.rename('count')
+                    count_df = count_df.rename_axis('Target Label').reset_index()
+                    count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
+
+                    fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height =200)
+                    c1, c2 = st.columns([1,1])
+                    with c1:
+                        st.plotly_chart(fig,use_container_width= True)
+
+                    hits = hits.sort_values(by=['Relevancy'], ascending=False)
+                    st.write("")
+                    st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
+                    range_val = min(5,len(hits))
+                    for i in range(range_val):
+                        # the page number reflects the page that contains the main paragraph
+                        # according to split limit, the overlapping part can be on a separate page
+                        st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,hits.iloc[i]['page'],hits.iloc[i]['Relevancy']))
+                        st.write("\t Text: \t{}".format(hits.iloc[i]['text']))
+                else:
+                    st.info("🤔 No Netzero target found")
+                df['Validation'] = 'No'
+                df_xlsx = to_excel(df)
+                st.download_button(label='📥 Download Current Result',
+                                   data=df_xlsx ,
+                                   file_name= 'file_target.xlsx')
+
+
+            else:
+                st.info("🤔 No document found, please try to upload it at the sidebar!")
+                logging.warning("Terminated as no document provided")
+
+    # # Creating truth value dataframe
+    # if 'key2' in st.session_state:
+    #     if st.session_state.key2 is not None:
+    #         df = st.session_state.key2
+    #         st.markdown("###### Select the threshold for classifier ######")
+    #         c1, c2 = st.columns([1,1])
+
+    #         netzero_df = df[df['Target Label'] == 'NETZERO'].reset_index(drop = True)
+    #         if len(netzero_df) >0:
+    #             with c1:
+    #                 threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
+    #                                       step=0.01, value=0.5,
+    #                                       help = "Keep High Value if want refined result, low if dont want to miss anything" )
+
+    #             # creating the dataframe for value counts of Labels, along with 'title' of Labels
+    #             temp = df[df['Relevancy']>threshold]
+    #             count_df = temp['Target Label'].value_counts()
+    #             count_df = count_df.rename('count')
+    #             count_df = count_df.rename_axis('Target Label').reset_index()
+    #             count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
+
+    #             plt.rcParams['font.size'] = 25
+    #             colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(count_df)))
+    #             # plot
+    #             fig, ax = plt.subplots()
+    #             ax.pie(count_df['count'], colors=colors, radius=2, center=(4, 4),
+    #                    wedgeprops={"linewidth": 1, "edgecolor": "white"},
+    #                    textprops={'fontsize': 14},
+    #                    frame=False,labels =list(count_df.Label_def),
+    #                    labeldistance=1.2)
+    #             st.markdown("#### Anything related to NetZero Targets? ####")
+
+    #             c4, c5, c6 = st.columns([1,2,2])
+
+    #             with c5:
+    #                 st.pyplot(fig)
+    #             with c6:
+    #                 st.write(count_df[['Label_def','count']])
+
+    #             st.write("")
+
+    #             st.markdown("###### Top few NetZero Target Classified paragraph/text results ######")
+
+    #             st.dataframe(netzero_df.head())
+    #         else:
+    #             st.write("🤔 No Results found")
+
+
+    #         df['Validation'] = 'No'
+    #         df_xlsx = to_excel(df)
+    #         st.download_button(label='📥 Download Current Result',
+    #                            data=df_xlsx ,
+    #                            file_name= 'file_netzero.xlsx')
+
+
+
appStore/sector.py
ADDED
@@ -0,0 +1,211 @@
+# set path
+import glob, os, sys;
+sys.path.append('../utils')
+
+#import needed libraries
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import streamlit as st
+# from st_aggrid import AgGrid
+# from st_aggrid.shared import ColumnsAutoSizeMode
+from utils.sector_classifier import sector_classification
+from utils.sector_classifier import runSectorPreprocessingPipeline, load_sectorClassifier
+# from utils.keyword_extraction import textrank
+import logging
+logger = logging.getLogger(__name__)
+from utils.config import get_classifier_params
+from utils.preprocessing import paraLengthCheck
+from io import BytesIO
+import xlsxwriter
+import plotly.express as px
+
+
+# Declare all the necessary variables
+classifier_identifier = 'sector'
+params = get_classifier_params(classifier_identifier)
+
+@st.cache_data
+def to_excel(df,sectorlist):
+    len_df = len(df)
+    output = BytesIO()
+    writer = pd.ExcelWriter(output, engine='xlsxwriter')
+    df.to_excel(writer, index=False, sheet_name='Sheet1')
+    workbook = writer.book
+    worksheet = writer.sheets['Sheet1']
+    worksheet.data_validation('S2:S{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': ['No', 'Yes', 'Discard']})
+    worksheet.data_validation('X2:X{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': sectorlist + ['Blank']})
+    worksheet.data_validation('T2:T{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': sectorlist + ['Blank']})
+    worksheet.data_validation('U2:U{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': sectorlist + ['Blank']})
+    worksheet.data_validation('V2:V{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': sectorlist + ['Blank']})
+    worksheet.data_validation('W2:U{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': sectorlist + ['Blank']})
+    writer.save()
+    processed_data = output.getvalue()
+    return processed_data
+
+def app():
+
+    #### APP INFO #####
+    with st.container():
+        st.markdown("<h1 style='text-align: center; color: black;'> Sector Classification </h1>", unsafe_allow_html=True)
+        st.write(' ')
+        st.write(' ')
+
+    with st.expander("ℹ️ - About this app", expanded=False):
+
+        st.write(
+            """
+            The **Sector Classification** app is an easy-to-use interface built \
+            in Streamlit for analyzing policy documents for \
+            Classification of the paragraphs/texts in the document *If it \
+            belongs to particular sector or not*. The paragraph can belong to multiple sectors - \
+            developed by GIZ Data Service Center, GFA, IKI Tracs, \
+            SV Klima and SPA. \n
+            """)
+        st.write("""**Document Processing:** The Uploaded/Selected document is \
+            automatically cleaned and split into paragraphs with a maximum \
+            length of 60 words using a Haystack preprocessing pipeline. The \
+            length of 60 is an empirical value which should reflect the length \
+            of a “context” and should limit the paragraph length deviation. \
+            However, since we want to respect the sentence boundary the limit \
+            can breach and hence this limit of 60 is tentative. \n
+            """)
+
+        st.write("")
+
+    ### Main app code ###
+    with st.container():
+        if st.button("RUN Sector Classification"):
+            if 'key' not in st.session_state:
+                st.session_state['key'] = None
+
+            if 'filepath' in st.session_state:
+                file_name = st.session_state['filename']
+                file_path = st.session_state['filepath']
+
+
+                all_documents = runSectorPreprocessingPipeline(file_name= file_name,
+                                file_path= file_path, split_by= params['split_by'],
+                                split_length= params['split_length'],
+                                split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
+                                split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
+                # st.write(all_documents['documents'])
+                classifier = load_sectorClassifier(classifier_name=params['model_name'])
+                st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+                verified_paralist = paraLengthCheck(all_documents['paraList'], 100)
+                if len(verified_paralist) > 100:
+                    warning_msg = ": This might take sometime, please sit back and relax."
+                else:
+                    warning_msg = ""
+
+                # #st.write(all_documents['documents'],_lab_dict,classifier_identifier,params['threshold'])
+                # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
+
+                df = sector_classification(haystack_doc=verified_paralist,
+                                           threshold= params['threshold'])
+                # st.write(df)
+                threshold= params['threshold']
+                truth_df = df.drop(['text'],axis=1)
+                truth_df = truth_df.astype(float) >= threshold
+                truth_df = truth_df.astype(str)
+                categories = list(truth_df.columns)
+
+                placeholder = {}
+                for val in categories:
+                    placeholder[val] = dict(truth_df[val].value_counts())
+                count_df = pd.DataFrame.from_dict(placeholder)
+                count_df = count_df.T
+                count_df = count_df.reset_index()
+                # st.write(count_df)
+                placeholder = []
+                for i in range(len(count_df)):
+                    placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'Yes'])
+                    placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'No'])
+                count_df = pd.DataFrame(placeholder, columns = ['category','count','truth_value'])
+                # st.write("Total Paragraphs: {}".format(len(df)))
+                fig = px.bar(count_df, x='category', y='count',
+                             color='truth_value')
+                # c1, c2 = st.columns([1,1])
+                # with c1:
+                st.plotly_chart(fig,use_container_width= True)
+
+                truth_df['labels'] = truth_df.apply(lambda x: {i if x[i]=='True' else None for i in categories}, axis=1)
+                truth_df['labels'] = truth_df.apply(lambda x: list(x['labels'] -{None}),axis=1)
+                # st.write(truth_df)
+                df = pd.concat([df,truth_df['labels']],axis=1)
+                df['Validation'] = 'No'
+                df['Sector1'] = 'Blank'
+                df['Sector2'] = 'Blank'
+                df['Sector3'] = 'Blank'
+                df['Sector4'] = 'Blank'
+                df['Sector5'] = 'Blank'
+                df_xlsx = to_excel(df,categories)
+                st.download_button(label='📥 Download Current Result',
+                                   data=df_xlsx ,
+                                   file_name= 'file_sector.xlsx')
+            else:
+                st.info("🤔 No document found, please try to upload it at the sidebar!")
+                logging.warning("Terminated as no document provided")
+
+    # # Creating truth value dataframe
+    # if 'key' in st.session_state:
+    #     if st.session_state.key is not None:
+    #         df = st.session_state.key
+    #         st.markdown("###### Select the threshold for classifier ######")
+    #         c4, c5 = st.columns([1,1])
+
+    #         with c4:
+    #             threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
+    #                                   step=0.01, value=0.5,
+    #                                   help = "Keep High Value if want refined result, low if dont want to miss anything" )
+    #         sectors =set(df.columns)
+    #         removecols = {'Validation','Sector1','Sector2','Sector3','Sector4',
+    #                       'Sector5','text'}
+    #         sectors = list(sectors - removecols)
+
+    #         placeholder = {}
+    #         for val in sectors:
+    #             temp = df[val].astype(float) > threshold
+    #             temp = temp.astype(str)
+    #             placeholder[val] = dict(temp.value_counts())
+
+    #         count_df = pd.DataFrame.from_dict(placeholder)
+    #         count_df = count_df.T
+    #         count_df = count_df.reset_index()
+    #         placeholder = []
+    #         for i in range(len(count_df)):
+    #             placeholder.append([count_df.iloc[i]['index'],count_df['False'][i],'False'])
+    #             placeholder.append([count_df.iloc[i]['index'],count_df['True'][i],'True'])
+
+    #         count_df = pd.DataFrame(placeholder, columns = ['sector','count','truth_value'])
+    #         fig = px.bar(count_df, x='sector', y='count',
+    #                      color='truth_value',
+    #                      height=400)
+    #         st.write("")
+    #         st.plotly_chart(fig)
+
+    #         df['Validation'] = 'No'
+    #         df['Sector1'] = 'Blank'
+    #         df['Sector2'] = 'Blank'
+    #         df['Sector3'] = 'Blank'
+    #         df['Sector4'] = 'Blank'
+    #         df['Sector5'] = 'Blank'
+    #         df_xlsx = to_excel(df,sectors)
+    #         st.download_button(label='📥 Download Current Result',
+    #                            data=df_xlsx ,
+    #                            file_name= 'file_sector.xlsx')
+
+
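
The to_excel helpers in this commit rely on xlsxwriter data validation to turn review columns into drop-down lists in the exported spreadsheet. A minimal standalone sketch of that mechanism, with an illustrative file name and column range:

import pandas as pd

# Illustrative frame; in the app this is the classification result table.
df = pd.DataFrame({'text': ['para 1', 'para 2'], 'Validation': ['No', 'No']})

with pd.ExcelWriter('example_validation.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, index=False, sheet_name='Sheet1')
    worksheet = writer.sheets['Sheet1']
    # Attach a Yes/No/Discard drop-down to the Validation column (B), rows 2..len(df)+1.
    worksheet.data_validation('B2:B{}'.format(len(df) + 1),
                              {'validate': 'list',
                               'source': ['No', 'Yes', 'Discard']})
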
appStore/target.py
ADDED
@@ -0,0 +1,211 @@
+# set path
+import glob, os, sys;
+sys.path.append('../utils')
+
+#import needed libraries
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import streamlit as st
+# from st_aggrid import AgGrid
+# from st_aggrid.shared import ColumnsAutoSizeMode
+from utils.target_classifier import target_classification
+from utils.target_classifier import runTargetPreprocessingPipeline, load_targetClassifier
+# from utils.keyword_extraction import textrank
+import logging
+logger = logging.getLogger(__name__)
+from utils.config import get_classifier_params
+from io import BytesIO
+import xlsxwriter
+import plotly.express as px
+
+# Declare all the necessary variables
+classifier_identifier = 'target'
+params = get_classifier_params(classifier_identifier)
+
+## Labels dictionary ###
+_lab_dict = {
+    'LABEL_0':'NO TARGET INFO',
+    'LABEL_1':'ECONOMY-WIDE TARGET',
+}
+
+@st.cache_data
+def to_excel(df):
+    len_df = len(df)
+    output = BytesIO()
+    writer = pd.ExcelWriter(output, engine='xlsxwriter')
+    df.to_excel(writer, index=False, sheet_name='Sheet1')
+    workbook = writer.book
+    worksheet = writer.sheets['Sheet1']
+    worksheet.data_validation('E2:E{}'.format(len_df),
+                              {'validate': 'list',
+                               'source': ['No', 'Yes', 'Discard']})
+    writer.save()
+    processed_data = output.getvalue()
+    return processed_data
+
+def app():
+
+    #### APP INFO #####
+    with st.container():
+        st.markdown("<h1 style='text-align: center; color: black;'> Targets Extraction </h1>", unsafe_allow_html=True)
+        st.write(' ')
+        st.write(' ')
+
+    with st.expander("ℹ️ - About this app", expanded=False):
+
+        st.write(
+            """
+            The **Target Extraction** app is an easy-to-use interface built \
+            in Streamlit for analyzing policy documents for \
+            Classification of the paragraphs/texts in the document *If it \
+            contains any Economy-Wide Targets related information* - \
+            developed by GIZ Data Service Center, GFA, IKI Tracs, \
+            SV Klima and SPA. \n
+            """)
+        st.write("""**Document Processing:** The Uploaded/Selected document is \
+            automatically cleaned and split into paragraphs with a maximum \
+            length of 60 words using a Haystack preprocessing pipeline. The \
+            length of 60 is an empirical value which should reflect the length \
+            of a “context” and should limit the paragraph length deviation. \
+            However, since we want to respect the sentence boundary the limit \
+            can breach and hence this limit of 60 is tentative. \n
+            """)
+
+        st.write("")
+
+    ### Main app code ###
+    with st.container():
+        if st.button("RUN Target Related Paragraph Extractions"):
+            if 'key1' not in st.session_state:
+                st.session_state['key1'] = None
+
+            if 'filepath' in st.session_state:
+                file_name = st.session_state['filename']
+                file_path = st.session_state['filepath']
+
+
+                all_documents = runTargetPreprocessingPipeline(file_name= file_name,
+                                file_path= file_path, split_by= params['split_by'],
+                                split_length= params['split_length'],
+                                split_respect_sentence_boundary= params['split_respect_sentence_boundary'],
+                                split_overlap= params['split_overlap'], remove_punc= params['remove_punc'])
+                # st.write(all_documents['documents'])
+
+                #load Classifier
+                classifier = load_targetClassifier(classifier_name=params['model_name'])
+                st.session_state['{}_classifier'.format(classifier_identifier)] = classifier
+                if len(all_documents['documents']) > 100:
+                    warning_msg = ": This might take sometime, please sit back and relax."
+                else:
+                    warning_msg = ""
+
+                # #st.write(all_documents['documents'],_lab_dict,classifier_identifier,params['threshold'])
+                # with st.spinner("Running Target Related Paragraph Extractions{}".format(warning_msg)):
+
+                df = target_classification(haystack_doc=all_documents['documents'],
+                                           threshold= params['threshold'])
+                st.session_state.key1 = df
+                # temp = df[df['Relevancy']>threshold]
+                hits = df[df['Target Label'] == 'LABEL_1']
+                range_val = min(5,len(hits))
+                if range_val !=0:
+                    count_df = df['Target Label'].value_counts()
+                    count_df = count_df.rename('count')
+                    count_df = count_df.rename_axis('Target Label').reset_index()
+                    count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
+
+                    fig = px.bar(count_df, y="Label_def", x="count", orientation='h', height=200)
+                    c1, c2 = st.columns([1,1])
+                    with c1:
+                        st.plotly_chart(fig,use_container_width= True)
+
+                    hits = hits.sort_values(by=['Relevancy'], ascending=False)
+                    st.write("")
+                    st.markdown("###### Top few Economy Wide Target Classified paragraph/text results ######")
+                    range_val = min(5,len(hits))
+                    for i in range(range_val):
+                        # the page number reflects the page that contains the main paragraph
+                        # according to split limit, the overlapping part can be on a separate page
+                        st.write('**Result {}** `page {}` (Relevancy Score: {:.2f})'.format(i+1,hits.iloc[i]['page'],hits.iloc[i]['Relevancy']))
+                        st.write("\t Text: \t{}".format(hits.iloc[i]['text'].replace("\n", " ")))
+
+                else:
+                    st.info("🤔 No Economy Wide Target found")
+                df['Validation'] = 'No'
+                df_xlsx = to_excel(df)
+                st.download_button(label='📥 Download Current Result',
+                                   data=df_xlsx ,
+                                   file_name= 'file_target.xlsx')
+
+
+            else:
+                st.info("🤔 No document found, please try to upload it at the sidebar!")
+                logging.warning("Terminated as no document provided")
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+    # # Creating truth value dataframe
+    # if 'key1' in st.session_state:
+    #     if st.session_state.key1 is not None:
+    #         df = st.session_state.key1
+    #         st.markdown("###### Select the threshold for classifier ######")
+    #         c1, c2 = st.columns([1,1])
+
+    #         with c1:
+    #             threshold = st.slider("Threshold", min_value=0.00, max_value=1.0,
+    #                                   step=0.01, value=0.5,
+    #                                   help = "Keep High Value if want refined result, low if dont want to miss anything" )
+    #         sectors =set(df.columns)
+    #         removecols = {'Validation','Sectors','text'}
+    #         sectors = list(sectors - removecols)
+
+    #         # creating the dataframe for value counts of Labels, along with 'title' of Labels
+    #         temp = df[df['Relevancy']>threshold]
+    #         count_df = temp['Target Label'].value_counts()
+    #         count_df = count_df.rename('count')
+    #         count_df = count_df.rename_axis('Target Label').reset_index()
+    #         count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
+
+    #         plt.rcParams['font.size'] = 25
+    #         colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(count_df)))
+    #         # plot
+    #         fig, ax = plt.subplots()
+    #         ax.pie(count_df['count'], colors=colors, radius=2, center=(4, 4),
+    #                wedgeprops={"linewidth": 1, "edgecolor": "white"},
+    #                textprops={'fontsize': 14},
+    #                frame=False,labels =list(count_df.Label_def),
+    #                labeldistance=1.2)
+    #         st.markdown("#### Anything related to Targets? ####")
+
+    #         c4, c5, c6 = st.columns([1,2,2])
+
+    #         with c5:
+    #             st.pyplot(fig)
+    #         with c6:
+    #             st.write(count_df[['Label_def','count']])
+
+    #         st.write("")
+    #         st.markdown("###### Top few Economy Wide Target Classified paragraph/text results ######")
+    #         st.dataframe(df[df['Target Label'] == 'LABEL_1'].reset_index(drop = True))
+
+    #         df['Validation'] = 'No'
+    #         df_xlsx = to_excel(df)
+    #         st.download_button(label='📥 Download Current Result',
+    #                            data=df_xlsx ,
+    #                            file_name= 'file_target.xlsx')
+
+
docStore/img/dsc_giz.png
ADDED
docStore/img/ndc.png
ADDED
docStore/img/paris.png
ADDED
docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt
ADDED
|
@@ -0,0 +1,737 @@
| 1 |
+
Ethiopia 2030: The Pathway to Prosperity
|
| 2 |
+
Ten Years Perspective Development Plan (2021 – 2030)
|
| 3 |
+
1. Baselines and Assumptions
|
| 4 |
+
2. Strategic pillars
|
| 5 |
+
3. Departures
|
| 6 |
+
4. Macroeconomic goals
|
| 7 |
+
5. Implications of the COVID-19 pandemic and necessary mitigation measures
|
| 8 |
+
6. Potentials/capabilities
|
| 9 |
+
7. Focus areas
|
| 10 |
+
7.1. Productive sectors
|
| 11 |
+
7.2. Services sector
|
| 12 |
+
7.3. Enabling sectors
|
| 13 |
+
8. Balanced and competitive development (nationally, regionally and locally)
|
| 14 |
+
9. Monitoring and Evaluation
|
| 15 |
+
Content
|
| 16 |
+
1. Baselines and Assumptions
|
[Figure: Key performances of previous years – proportion of people living below the poverty line (%), 1994-2020: 45.5, 44.2, 38.7, 29.6, 23.5, 19]
[Figure: GDP growth rate (%), GTP I (2011-2015) through GTP II 2019/20 (projection, with COVID-19): 10.5, 8.8, 10.1, 7.7, 9, 5.19-6.20]
| 54 |
+
1. Baselines and Assumptions
|
[Figure: Merchandise export as % of GDP, 2010/11-2018/19: 8.66, 7.33, 6.57, 5.93, 4.91, 3.86, 3.56, 3.37, 2.77]
[Figure: Share of economic sectors in GDP (%), 2010/11-2018/19 - Agriculture falls from 46.9 to 32.8, Industry rises from 13.4 to 27.8, Manufacturing from 4.7 to 6.8, Construction from 7.1 to 20.1, Services stays around 38-40]
| 113 |
+
1. Baselines and Assumptions
|
[Figure: Labour force participation (2013) - Agriculture 73%, Industry 7%, Services 20%; Urban labour force participation (2013) - Agriculture 7%, Industry 22%, Services 71%]
| 128 |
+
1. Baselines and Assumptions
|
| 129 |
+
High and increasing Unemployment Rate
|
| 130 |
+
� Urban unemployment rate = 19.1% in 2018
|
| 131 |
+
� Youth unemployment rate = 25.3 %
|
| 132 |
+
? Male = 18.6%
|
| 133 |
+
? Female 30.9 %
|
| 134 |
+
� Rural unemployment rate = 2% in 2013
|
| 135 |
+
� Declining per capita rural land creating
|
| 136 |
+
disguised unemployment
|
[Figure: Number of unemployed people in urban areas by age group (15-19, 20-24, 25-29 yr.) and sex - total of 402,869 in 2014 rising to 471,535 in 2018]
| 143 |
+
1. Baselines and Assumptions
|
| 144 |
+
Challenges
|
| 145 |
+
1. Macroeconomic imbalances
|
| 146 |
+
?Sustained high inflation
|
| 147 |
+
?High and rising unemployment especially
|
| 148 |
+
in urban areas
|
| 149 |
+
?High and rising debt burden
|
| 150 |
+
?Chronic foreign currency shortage
|
| 151 |
+
?Sluggish (though encouraging) rate of
|
| 152 |
+
structural change
|
| 153 |
+
2. Vulnerability to shocks (COVID-19, Climate
|
| 154 |
+
changes, Desert Locust infestation, etc)
|
| 155 |
+
3. Poor quality and high inequity in
|
| 156 |
+
infrastructure projects
|
| 157 |
+
4. Poor quality services in health and
|
| 158 |
+
education
|
| 159 |
+
� High repetition and dropout rates from school
|
| 160 |
+
1. Baselines and Assumptions
|
| 161 |
+
� Poor quality of growth and slow
|
| 162 |
+
structural change
|
| 163 |
+
� Excessive aid and loan
|
| 164 |
+
dependence for financing
|
| 165 |
+
infrastructural and construction
|
| 166 |
+
investments
|
| 167 |
+
� Limited success in expanding
|
| 168 |
+
manufacturing and modern
|
| 169 |
+
agriculture which have high job
|
| 170 |
+
creation potentials
|
| 171 |
+
� Weak institutional capacity as
|
| 172 |
+
the main culprit of all failures
|
| 173 |
+
? Provision of quality services
|
| 174 |
+
(electricity, water, telephone,
|
| 175 |
+
internet)
|
| 176 |
+
? Creation of enough jobs and
|
| 177 |
+
improved living standards
|
| 178 |
+
? Generation of reliable foreign
|
| 179 |
+
exchange revenue and debt-sustainable
|
| 180 |
+
national economic
|
| 181 |
+
capacity
|
| 182 |
+
? Completion of development
|
| 183 |
+
projects and investment plans
|
| 184 |
+
under public-private
|
| 185 |
+
partnerships
|
| 186 |
+
� Low reward for merit, productivity and effort
|
| 187 |
+
while low disincentive for laziness, wastefulness
|
| 188 |
+
and corruption
|
| 189 |
+
� Slow institutional change and transformation in:
|
| 190 |
+
? Government policies
|
| 191 |
+
? Investor attitude
|
| 192 |
+
? Youth behaviour
|
| 193 |
+
? Role of the intellectuals
|
| 194 |
+
� The need for sustained increase in production
|
| 195 |
+
and productivity
|
| 196 |
+
� The need to set a common national vision to
|
| 197 |
+
achieve major successes with consensus and
|
| 198 |
+
popular legitimacy
|
| 199 |
+
Major areas of failure in the economy
|
| 240 |
+
2. Departures
|
| 241 |
+
1. Emphasis on quality of economic growth
|
| 242 |
+
2. Participation and coordination of sectors in the planning process
|
| 243 |
+
3. Sectoral linkages and multi-sectoral development focus
|
| 244 |
+
4. Preparation of national development corridors based on development potentials
|
| 245 |
+
5. Focus on solving institutional bottlenecks
|
| 246 |
+
6. The ongoing home grown economic reform programme as a sprinting board
|
| 247 |
+
7. Emphasis on resilience building, innovation and entrepreneurship
|
| 248 |
+
3. Strategic pillars
|
| 249 |
+
1. Ensure quality growth
|
| 250 |
+
2. Improve productivity and competitiveness
|
| 251 |
+
3. Undertake institutional transformation
|
| 252 |
+
4. Ensure private sector's leadership in the economy
|
| 253 |
+
5. Ensure equitable participation of women and children
|
| 254 |
+
6. Build climate resilient green economy
|
| 255 |
+
3. Strategic pillars
|
| 256 |
+
� Increasing export revenues and substituting imports by
|
| 257 |
+
reducing production costs
|
| 258 |
+
� Availing quality and massive infrastructure
|
| 259 |
+
? Linking infrastructural development with development corridors
|
| 260 |
+
� Producing required human resources with quality
|
| 261 |
+
� Producing enough and quality human resources
|
| 262 |
+
� Prioritizing innovative production systems
|
| 263 |
+
� Linking incentives with export revenue and job creation
|
| 264 |
+
performances
|
| 265 |
+
� Modernizing and enhancing the logistic system
|
| 266 |
+
� Creating technological competences needed for long-term
|
| 267 |
+
growth
|
| 268 |
+
� The economic growth should ensure:
|
| 269 |
+
? Participation of all citizens and equitable utilization of the
|
| 270 |
+
growth proceeds
|
| 271 |
+
? Improved standard of living of every citizen
|
| 272 |
+
? Reduced poverty in all indicators
|
| 273 |
+
? Reduced inflation and unemployment
|
| 274 |
+
� The economic growth should lead to increased
|
| 275 |
+
aggregate supply
|
| 276 |
+
� Focus on modern agriculture, manufacturing and
|
| 277 |
+
mining
|
| 278 |
+
� Emphasis on exploiting the sources of growth through
|
| 279 |
+
structural change
|
| 280 |
+
1.Ensuring quality economic growth 2. Raising production and productivity
|
| 281 |
+
3. Strategic pillars
|
| 282 |
+
� Build democratic and judicial institutions that ensure elite bargain,
|
| 283 |
+
national consensus, common vision and government legitimacy
|
| 284 |
+
� Build private sector and competition friendly bureaucracy
|
| 285 |
+
� Coordinate with parents, the society and teachers to make
|
| 286 |
+
educational institutions centers of excellence and virtuous citizens
|
| 287 |
+
� Coordinate with parents as well as social and religious leaders to
|
| 288 |
+
encourage religious institutions and their teachings contribute
|
| 289 |
+
towards poverty reduction efforts
|
| 290 |
+
� Prepare policies, strategies and legal frameworks for achieving
|
| 291 |
+
prosperity
|
| 292 |
+
� Increased focus on innovation and research
|
| 293 |
+
� Creating strong social security system
|
| 294 |
+
3. Institutional Transformation 4. Private sector's leadership in the economy
|
| 295 |
+
� Create conducive investment climate and incentivize
|
| 296 |
+
domestic investors in key sectors
|
| 297 |
+
� Build strong and market-led public-private partnerships in
|
| 298 |
+
order to ensure the establishment of inclusive and
|
| 299 |
+
pragmatic market economy
|
| 300 |
+
� Enhance access and quality of infrastructure to attract
|
| 301 |
+
quality foreign direct investment
|
| 302 |
+
� Identify new sources of growth, empower and stimulate
|
| 303 |
+
the private sector, and supplement the private sector in
|
| 304 |
+
strategic areas
|
| 305 |
+
� Emphasis for public-private partnership on problem
|
| 306 |
+
solving innovations and research activities
|
| 307 |
+
3. Strategic pillars
|
| 308 |
+
� Ensure gender equity in economic and social
|
| 309 |
+
sectors
|
| 310 |
+
? Participation of women at all levels of education
|
| 311 |
+
? Asset ownership of women
|
| 312 |
+
� Ensure fair participation of women and youth in
|
| 313 |
+
leadership and decision making positions
|
| 314 |
+
� Create awareness among citizens about the role of
|
| 315 |
+
women and youth in the country's overall
|
| 316 |
+
development
|
| 317 |
+
� Increase basin development efforts to fight land
|
| 318 |
+
degradation and to reduce pollutions
|
| 319 |
+
� Improve productivity and reduce GHG emissions
|
| 320 |
+
� Increase forest protection and development
|
| 321 |
+
� Increase production of electricity from renewable
|
| 322 |
+
sources for domestic use and for export
|
| 323 |
+
� Focus on modern and energy saving technologies
|
| 324 |
+
5. Equitable participation of women and children 6. Climate resilient green economy
|
| 325 |
+
4. Macroeconomic Goals
|
| 326 |
+
Assumptions
|
| 327 |
+
? Requirement to significantly reduce
|
| 328 |
+
poverty
|
| 329 |
+
? Available national potentials
|
| 330 |
+
? Potential for investment in the economy
|
| 331 |
+
? Existing potentials in each sector
|
| 332 |
+
? Low productivity that needs to be
|
| 333 |
+
improved
|
| 334 |
+
� Make Ethiopia a middle income
|
| 335 |
+
economy by 2022
|
| 336 |
+
� Raise per capita income to USD 1,115
|
| 337 |
+
in 2022
|
| 338 |
+
? Threshold for middle-income is USD 1,026
|
| 339 |
+
? Plus human development index and
|
| 340 |
+
economic vulnerability index
|
| 341 |
+
� Raise per capita income to USD 2,220
|
| 342 |
+
by 2030
|
| 343 |
+
Sectoral growth Targets (2021-2030)
|
| 344 |
+
Assured middle- income potential
|
| 345 |
+
10.2%
|
| 346 |
+
Average
|
| 347 |
+
Growth
|
| 348 |
+
Target
|
| 349 |
+
Percentage of population below poverty line
|
| 350 |
+
4. Macroeconomic Goals
|
| 351 |
+
Structural change
|
| 352 |
+
Financing Gaps
|
| 353 |
+
Reduce urban unemployment to less than 9%
|
| 354 |
+
?1.36 million new jobs need to be
|
| 355 |
+
created per annum
|
| 356 |
+
Sectoral composition of GDP Labour force participation
|
| 357 |
+
Economic
|
| 358 |
+
Sectors
|
| 359 |
+
Performance Target
|
| 360 |
+
2011 2015 2018/19 2030
|
| 361 |
+
Agriculture 45 39.7 32.8 22.0
|
| 362 |
+
Industry 15.1 21.2 27.6 35.9
|
| 363 |
+
Manufacturing 4.7 5.5 6.8 17.2
|
| 364 |
+
Services 39.9 39 39.4 42.1
|
| 365 |
+
5. Implications of the COVID-19 pandemic and necessary mitigation measures
|
| 366 |
+
� GDP growth for 2019/20 fiscal year is projected to be lower than its target of 9.0% by between 2.81
|
| 367 |
+
and 3.80 percentage points (equivalent to 58.3 - 78.8 billion birr) due to COVID-19 pandemic
|
| 368 |
+
� If the current scenario continues, next year's GDP growth could decline by 2.8 percentage points
|
| 369 |
+
� Returning the economy to its high growth trajectory requires focusing on sectors with high
|
| 370 |
+
productivity and job creation potentials
|
| 371 |
+
� Public investment should focus on empowering the private sector
|
| 372 |
+
� Promoting both domestic and foreign investments with the right set of incentives (merit based)
|
| 373 |
+
� Modernizing production systems and improving uptake of technology
|
| 374 |
+
� Conducting demand analysis for export commodities to remedy for the declining trend in exports
|
| 375 |
+
and foreign exchange earnings.
|
| 376 |
+
6. Potentials
|
| 377 |
+
� Endowment of various natural resources contributing to the growth potential
|
| 378 |
+
� Huge unutilized arable land creates great potential for the success of the plan
|
| 379 |
+
� Endowment of gemstones, ornamental, energy, metals, and metallic minerals
|
| 380 |
+
� Gold, coal, iron ore, potash, tantalum, marble, petroleum and other natural resources
|
| 381 |
+
Natural
|
| 382 |
+
Resources
|
| 383 |
+
� Large youth population and potential for demographic dividend
|
| 384 |
+
� Cumulative capacity in education and health
|
| 385 |
+
� Positive attitude and noble culture of reaching agreement among citizens
|
| 386 |
+
Human
|
| 387 |
+
capital
|
| 388 |
+
6. Potentials
|
| 389 |
+
Built physical and material capitals
|
| 390 |
+
?Transport and communication
|
| 391 |
+
? Irrigation infrastructures for modern agriculture
|
| 392 |
+
?Industrial Parks
|
| 393 |
+
?Mega energy infrastructures
|
| 394 |
+
Physical
|
| 395 |
+
capital
|
| 396 |
+
Unexploited
|
| 397 |
+
growth
|
| 398 |
+
potentials
|
| 399 |
+
� Utilizing the tourism potential through modernization
|
| 400 |
+
� Using the mining subsector as a source of input as well as a competitive industry in its
|
| 401 |
+
own right
|
| 402 |
+
6. Potentials
|
| 403 |
+
� Solving supply side bottlenecks to satisfy the existing demand
|
| 404 |
+
� Improving international acceptance and reliable partnerships
|
| 405 |
+
? The 'medemer'/synergy philosophy
|
| 406 |
+
? The ongoing political reform measures
|
| 407 |
+
? The Homegrown Economic Reform programme
|
| 408 |
+
� Increased finance from partners and multilateral institutions
|
| 409 |
+
? Increased availability of foreign exchange
|
| 410 |
+
? Reduced debt stress for the short to medium term
|
| 411 |
+
? Increased potential for development
|
| 412 |
+
Increased
|
| 413 |
+
demand as
|
| 414 |
+
potential
|
| 415 |
+
Political Capital
|
| 416 |
+
Continental
|
| 417 |
+
and regional
|
| 418 |
+
integrations
|
| 419 |
+
� Regional and continental economic integration agreements
|
| 420 |
+
� International and continental free trade agreements
|
| 421 |
+
6. Potentials
|
| 422 |
+
Low
|
| 423 |
+
technology as
|
| 424 |
+
a potential
|
| 425 |
+
� Undeniably low status of technological development
|
| 426 |
+
� International mobility and spillover effect of technology
|
| 427 |
+
� Potential for development and catching up by filling the technological gaps
|
| 428 |
+
� Doubling crop productivity from the current 24-36 quintals per hectare will result
|
| 429 |
+
in 7% increase in crop production
|
| 430 |
+
� Raise the production efficiency of manufacturing from the current 50% to 80%
|
| 431 |
+
7. Focus Areas
|
| 432 |
+
7.1. Productive sectors: agriculture, manufacturing, mining
|
| 433 |
+
7.2. Service sector: tourism
|
| 434 |
+
7.3. Enabling sectors: energy, transport, sustainable finance,
|
| 435 |
+
innovation and technology, urban development, irrigation,
|
| 436 |
+
human capital development
|
| 437 |
+
7.1. Productive sectors
|
| 438 |
+
Agriculture Objectives
|
| 439 |
+
1. Free agriculture from rain dependence
|
| 440 |
+
2. Agricultural mechanization services
|
| 441 |
+
3. Contract farming, cluster approach and
|
| 442 |
+
land consolidation
|
| 443 |
+
4. Livestock, animal feed and animal health
|
| 444 |
+
5. Horticulture (irrigation and urban farming)
|
| 445 |
+
6. Private sector participation
|
| 446 |
+
7. Institutional implementation capacity
|
| 447 |
+
8. Climate resilient sustainable agricultural
|
| 448 |
+
development
|
| 449 |
+
1. Improve income and livelihood options for farming and pastoral
|
| 450 |
+
communities through increased productivity and competitiveness
|
| 451 |
+
2. Modernize agriculture and ensure national food and nutrition security
|
| 452 |
+
3. Raise export of agricultural output and substitute imports
|
| 453 |
+
4. Make agriculture a viable and profitable enterprise through value addition
|
| 454 |
+
5. Create rural employment opportunities
|
| 455 |
+
6. Enhance livestock health access and quality
|
| 456 |
+
7. Preserve animal genetic resources and increase pastoral research
|
| 457 |
+
8. Improve the development of animal feed and access to markets
|
| 458 |
+
9. Develop livestock specific extension package for each livestock type
|
| 459 |
+
Focus Areas
|
| 460 |
+
7.1. Productive sector
|
| 461 |
+
Manufacturing Industry
|
| 462 |
+
Objectives
|
| 463 |
+
1. Production of quality and competitive food, textile, housing and
|
| 464 |
+
pharmaceutical products for export and domestic markets
|
| 465 |
+
2. Production and productivity of existing manufacturing industries
|
| 466 |
+
3. Utilization of locally available inputs
|
| 467 |
+
4. Value chains, linkages and interdependencies
|
| 468 |
+
5. Linkages between large scale metallurgical and engineering,
|
| 469 |
+
chemical and pharmaceutical industries with other industries
|
| 470 |
+
6. Job creation, cluster approaches and expanding small and medium
|
| 471 |
+
scale manufacturing
|
| 472 |
+
7. Private sector participation and partnership
|
| 473 |
+
1. Establish basis for domestic industrialization
|
| 474 |
+
2. Value addition through enhanced inter-sectoral
|
| 475 |
+
linkages
|
| 476 |
+
3. Enhance productivity through private sector
|
| 477 |
+
leadership and supportive role of the
|
| 478 |
+
government
|
| 479 |
+
? Create job opportunities for the youth leaving
|
| 480 |
+
agriculture and concentrating in urban areas
|
| 481 |
+
? Make exportable commodities internationally
|
| 482 |
+
competitive
|
| 483 |
+
? Ensure structural change
|
| 484 |
+
Focus areas
|
| 485 |
+
7.1. Productive sectors
|
| 486 |
+
Mining
|
| 487 |
+
Objectives
|
| 488 |
+
� Foreign exchange earning and
|
| 489 |
+
domestic revenues
|
| 490 |
+
� Increased investment in mining
|
| 491 |
+
� Participation of manufacturing
|
| 492 |
+
industries that add value
|
| 493 |
+
� Job creation
|
| 494 |
+
� Add value for improved contribution of the subsector
|
| 495 |
+
� Increase inter-sectoral linkages to raise raw material inputs to other
|
| 496 |
+
sectors
|
| 497 |
+
� Make mining a competent subsector and induce structural change
|
| 498 |
+
� Increase human resource and technological capabilities through
|
| 499 |
+
research and trainings
|
| 500 |
+
� Raise foreign exchange revenue from mining through increased
|
| 501 |
+
exploration and production
|
| 502 |
+
� Improve traditional mining production and marketing systems
|
| 503 |
+
� Improve the country's geological information
|
| 504 |
+
Focus areas
|
| 505 |
+
7.2. Service sector
|
| 506 |
+
Tourism
|
| 507 |
+
Objectives
|
| 508 |
+
� Identification and developing destinations
|
| 509 |
+
� Infrastructure
|
| 510 |
+
� Competitiveness
|
| 511 |
+
?improve existing destinations
|
| 512 |
+
?develop new destinations
|
| 513 |
+
? diversify service and raise quality
|
| 514 |
+
� Market linkages, branding, and promotion
|
| 515 |
+
� Technology, research and development
|
| 516 |
+
� Preservation, maintenance and proper
|
| 517 |
+
utilization of heritage resources
|
| 518 |
+
� Expand job opportunities
|
| 519 |
+
� Raise incomes
|
| 520 |
+
� Build information management
|
| 521 |
+
systems
|
| 522 |
+
� Increase implementation capacity
|
| 523 |
+
Focus areas
|
| 524 |
+
7.3. Enabling sectors
|
| 525 |
+
Urban development
|
| 526 |
+
Objectives
|
| 527 |
+
? Prioritize productive sectors in job creation and enterprise
|
| 528 |
+
development plans
|
| 529 |
+
? Rapid development and equity goals in land provision system
|
| 530 |
+
? Participation of indigenous people in land redevelopment and
|
| 531 |
+
expansion
|
| 532 |
+
? Urban land registration and cadaster system, modern
|
| 533 |
+
property valuation
|
| 534 |
+
? Greenery and public spaces as well as waste disposal and
|
| 535 |
+
management in urban planning and implementation
|
| 536 |
+
? Housing development and financing options to reduce
|
| 537 |
+
housing shortages
|
| 538 |
+
? Integrated infrastructure and services provision
|
| 539 |
+
? Role of private sector in infrastructure development and
|
| 540 |
+
service provision
|
| 541 |
+
� Expand micro and small-scale
|
| 542 |
+
enterprises to reduce urban
|
| 543 |
+
unemployment
|
| 544 |
+
� Develop and avail urban land based on
|
| 545 |
+
demand, equity and cost effectiveness
|
| 546 |
+
� Make quality housing accessible both in
|
| 547 |
+
rural and urban areas
|
| 548 |
+
� Develop quality and integrated
|
| 549 |
+
infrastructure as well as service
|
| 550 |
+
provision in towns
|
| 551 |
+
� Improve financial management and
|
| 552 |
+
resource utilization in urban areas
|
| 553 |
+
Focus areas
|
| 554 |
+
7.3. Enabling sectors
|
| 555 |
+
Innovation and Technology
|
| 556 |
+
Objectives
|
| 557 |
+
? Access to innovation and
|
| 558 |
+
technological information
|
| 559 |
+
? Developing a digital economy
|
| 560 |
+
? Productivity enhancement and
|
| 561 |
+
competitiveness
|
| 562 |
+
? Build a digital economy
|
| 563 |
+
? Develop national scientific research and technological
|
| 564 |
+
capabilities
|
| 565 |
+
? Support problem solving research and development of
|
| 566 |
+
technologies necessary for raising production,
|
| 567 |
+
productivity and service provision
|
| 568 |
+
? Create jobs and capital that are based on technology
|
| 569 |
+
? Develop technological and data security protection
|
| 570 |
+
systems
|
| 571 |
+
Focus areas
|
| 572 |
+
7.3. Enabling sectors
|
| 573 |
+
Sustainable finance
|
| 574 |
+
Objectives
|
| 575 |
+
� Access to modern finance and saving culture in rural
|
| 576 |
+
areas
|
| 577 |
+
� Support to the private sector and corporations to
|
| 578 |
+
reinvest profits in productive sectors
|
| 579 |
+
� Role of private financial institutions in manufacturing
|
| 580 |
+
and agriculture
|
| 581 |
+
� Digital revenue collection system
|
| 582 |
+
� Tax equity (contraband, tax evasion, and bringing the
|
| 583 |
+
underground economy to the tax system)
|
| 584 |
+
� Domestic and foreign strategic partnerships
|
| 585 |
+
� Transform financing from short term to long-term,
|
| 586 |
+
sustainable and quality sources
|
| 587 |
+
� Ensure financing quality based on sectoral prioritization
|
| 588 |
+
and reduction of wastage
|
| 589 |
+
� Increase the number of domestic saving institutions both
|
| 590 |
+
in rural and urban areas
|
| 591 |
+
� Support domestic finance with foreign exchange capacity
|
| 592 |
+
and foreign direct investment
|
| 593 |
+
� Modernize domestic revenue collection system
|
| 594 |
+
� Raise voluntary tax payment attitude
|
| 595 |
+
� Bring the informal sector to the formal tax system
|
| 596 |
+
Focus areas
|
| 597 |
+
7.3. Enabling sectors
|
| 598 |
+
Transport
|
| 599 |
+
Objectives
|
| 600 |
+
� Access to infrastructure
|
| 601 |
+
� Implementation capacity
|
| 602 |
+
� Participation of the private sector and the general
|
| 603 |
+
public
|
| 604 |
+
� Financing capacity
|
| 605 |
+
� Ensure equitable access to transport infrastructure and
|
| 606 |
+
services
|
| 607 |
+
� Improve transport safety
|
| 608 |
+
� Make logistics services fast and reliable
|
| 609 |
+
� Build transport infrastructure and service that is
|
| 610 |
+
resilient to climate change
|
| 611 |
+
Focus areas
|
| 612 |
+
7.3. Enabling sectors
|
| 613 |
+
Energy
|
| 614 |
+
Objectives
|
| 615 |
+
? Equity in access to electricity services
|
| 616 |
+
? Energy access and quality
|
| 617 |
+
? Alternative sources of energy
|
| 618 |
+
? Reliability of electricity infrastructure
|
| 619 |
+
? Investment and income in energy subsector
|
| 620 |
+
� Ensure equitable access to transport
|
| 621 |
+
infrastructure and services
|
| 622 |
+
� Improve transport safety
|
| 623 |
+
� Make logistics services fast and reliable
|
| 624 |
+
� Build transport infrastructure and service that is
|
| 625 |
+
resilient to climate change
|
| 626 |
+
Focus areas
|
| 627 |
+
7.3. Enabling sectors
|
| 628 |
+
Irrigation
|
| 629 |
+
Objectives
|
| 630 |
+
? Medium and large scale irrigation infrastructure
|
| 631 |
+
? Job creation
|
| 632 |
+
? Share of government expenditure and alternative
|
| 633 |
+
financing options
|
| 634 |
+
? Institutional capacity and human resource
|
| 635 |
+
development
|
| 636 |
+
? Improve agricultural output and productivity
|
| 637 |
+
? Reduce government spending and enhance
|
| 638 |
+
institutional capacity and human resources
|
| 639 |
+
development
|
| 640 |
+
? Ensure the inclusion of all genders and
|
| 641 |
+
disabled citizens
|
| 642 |
+
? Develop alternative financing options for
|
| 643 |
+
irrigation development
|
| 644 |
+
Focus areas
|
| 645 |
+
7.3. Enabling sectors
|
| 646 |
+
Human capital development
|
| 647 |
+
Objectives
|
| 648 |
+
� Make education and training inclusive and equitable by
|
| 649 |
+
harmonizing the system with ability, need and capacity
|
| 650 |
+
� Develop capacity of educational institutions (teacher capacity,
|
| 651 |
+
inputs and technology)
|
| 652 |
+
� Establish education and training quality assurance system
|
| 653 |
+
� Avail free and compulsory education for pre-primary to junior
|
| 654 |
+
secondary levels and free education at the senior secondary levels
|
| 655 |
+
equitably
|
| 656 |
+
� Ensure the relevance of education and training system and
|
| 657 |
+
synchronize education policy with economic and social
|
| 658 |
+
development needs
|
| 659 |
+
� Make the education and training policy compatible with the
|
| 660 |
+
nation's contemporary capacities as well as global and regional
|
| 661 |
+
market opportunities
|
| 662 |
+
� Enhance commitment, capability and responsibility of citizens
|
| 663 |
+
? Ensure equitable and quality health services
|
| 664 |
+
? Raise average life expectancy
|
| 665 |
+
? Achieve universal health coverage through
|
| 666 |
+
proactive and prevention health system
|
| 667 |
+
? Curtail preventable maternal and child deaths
|
| 668 |
+
? Reduce incidences of contagious and noncontagious
|
| 669 |
+
related diseases and deaths
|
| 670 |
+
? Build capacity for health tourism through
|
| 671 |
+
increased treatment capabilities
|
| 672 |
+
? Create a healthy society that is free from
|
| 673 |
+
addictions and use technology for supporting
|
| 674 |
+
knowledge led economic development
|
| 675 |
+
Focus areas
|
| 676 |
+
8 Nationally, regionally and locally balanced and competitive development
|
| 677 |
+
1. Lack of synchronization of investment with
|
| 678 |
+
resource potentials and development needs
|
| 679 |
+
2. Poor alignment of federal, regional and
|
| 680 |
+
district level investment plans with the
|
| 681 |
+
national development goals and envisioned
|
| 682 |
+
settlement patterns
|
| 683 |
+
3. Poor regional coordination due to low
|
| 684 |
+
consideration for trans-regional and
|
| 685 |
+
spatial issues in development plans of
|
| 686 |
+
regional states
|
| 687 |
+
4. Inter-regional and intra-regional
|
| 688 |
+
disparities in infrastructural development
|
| 689 |
+
and access to services
|
| 690 |
+
Challenges
|
| 691 |
+
8. Nationally, regionally and locally balanced and competitive development
|
| 692 |
+
1. Ensure that the investment flow and
|
| 693 |
+
infrastructural development plans fairly go hand in
|
| 694 |
+
hand with resource potential and development
|
| 695 |
+
needs
|
| 696 |
+
?Developing underutilized natural resources
|
| 697 |
+
?Equitable distribution and access to
|
| 698 |
+
infrastructure
|
| 699 |
+
?Sustainable environmental protection
|
| 700 |
+
2. Ensure the inclusion of pastoral and agro-pastoral
|
| 701 |
+
areas in the development
|
| 702 |
+
?Focused infrastructural development in pastoral
|
| 703 |
+
areas such as education and health sector input
|
| 704 |
+
provision as well as governance
|
| 705 |
+
?Market linkages with other areas and the central
|
| 706 |
+
markets
|
| 707 |
+
?Improve rural finance (credit and insurance) to
|
| 708 |
+
encourage fattening, milk processing, leather
|
| 709 |
+
production and irrigation agriculture
|
| 710 |
+
Focus areas
|
| 711 |
+
9. Monitoring and Evaluation
|
| 712 |
+
10 Years Perspective
|
| 713 |
+
Plan KPIs
|
| 714 |
+
Federal Implementing
|
| 715 |
+
Institutions
|
| 716 |
+
Planning and
|
| 717 |
+
Development Commission
|
| 718 |
+
Generate Data (Census,
|
| 719 |
+
Sample and administrative
|
| 720 |
+
data)
|
| 721 |
+
Annual Reports
|
| 722 |
+
Dialogue forums
|
| 723 |
+
(Civic Organizations, professional
|
| 724 |
+
associations, development partners,
|
| 725 |
+
intellectuals)
|
| 726 |
+
Central Statistical Agency
|
| 727 |
+
Database
|
| 728 |
+
National
|
| 729 |
+
Information Portal
|
| 730 |
+
National Statistics
|
| 731 |
+
Development Strategic
|
| 732 |
+
plan
|
| 733 |
+
Evaluation Reports
|
| 734 |
+
Prime Minister's Office
|
| 735 |
+
House of People's
|
| 736 |
+
Representatives
|
| 737 |
+
Thank you!
|
docStore/sample/Seychelles-revised_first_ndc-EN.pdf
ADDED
|
Binary file (372 kB)
|
docStore/sample/South Africa_s Low Emission Development Strategy.pdf
ADDED
|
@@ -0,0 +1,3 @@
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bd18bff36fff79b97c5a343912f1296ea2d9d5481cf92c2887774fb4f2800418
|
| 3 |
+
size 1503168
|
docStore/sample/files.json
ADDED
|
@@ -0,0 +1,4 @@
| 1 |
+
{"Ethiopia: 10 Year Development Plan":"docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt",
|
| 2 |
+
"Seychells:Revised NDC":"docStore/sample/Seychelles-revised_first_ndc-EN.pdf",
|
| 3 |
+
"South Africa:Low Emission strategy":"docStore/sample/South Africa_s Low Emission Development Strategy.pdf"
|
| 4 |
+
}
|
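For illustration, a minimal sketch (not part of this commit) of how this display-name to file-path mapping can be consumed; the app's actual handling lives in utils/uploadAndExample.py, which is not shown in this section.

import json

# Load the sample-document mapping shipped with the repository.
with open("docStore/sample/files.json", "r") as f:
    sample_files = json.load(f)

# e.g. "Ethiopia: 10 Year Development Plan" -> "docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt"
for display_name, file_path in sample_files.items():
    print(display_name, "->", file_path)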
packages.txt
ADDED
|
@@ -0,0 +1,4 @@
| 1 |
+
poppler-utils
|
| 2 |
+
xpdf
|
| 3 |
+
tesseract-ocr
|
| 4 |
+
libtesseract-dev
|
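These system packages back the OCR and PDF converters pulled in through farm-haystack[ocr,pdf] in requirements.txt. A quick, purely illustrative runtime check that the binaries are on PATH (pdftotext comes from poppler-utils/xpdf, tesseract from tesseract-ocr):

import shutil

for tool in ("pdftotext", "tesseract"):
    # shutil.which returns None if the executable is missing.
    print(tool, "->", shutil.which(tool))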
paramconfig.cfg
ADDED
|
@@ -0,0 +1,39 @@
| 1 |
+
[target]
|
| 2 |
+
THRESHOLD = 0.50
|
| 3 |
+
MODEL = mtyrrell/ikitracs_economywide
|
| 4 |
+
SPLIT_BY = word
|
| 5 |
+
REMOVE_PUNC = 0
|
| 6 |
+
SPLIT_LENGTH = 60
|
| 7 |
+
SPLIT_OVERLAP = 10
|
| 8 |
+
RESPECT_SENTENCE_BOUNDARY = 1
|
| 9 |
+
TOP_KEY = 10
|
| 10 |
+
|
| 11 |
+
[netzero]
|
| 12 |
+
THRESHOLD = 0.50
|
| 13 |
+
MODEL = ilaria-oneofftech/ikitracks_netzero
|
| 14 |
+
SPLIT_BY = word
|
| 15 |
+
REMOVE_PUNC = 0
|
| 16 |
+
SPLIT_LENGTH = 60
|
| 17 |
+
SPLIT_OVERLAP = 10
|
| 18 |
+
RESPECT_SENTENCE_BOUNDARY = 1
|
| 19 |
+
TOP_KEY = 10
|
| 20 |
+
|
| 21 |
+
[sector]
|
| 22 |
+
THRESHOLD = 0.50
|
| 23 |
+
MODEL = ppsingh/bert-multilabel-sector-classifier
|
| 24 |
+
SPLIT_BY = word
|
| 25 |
+
REMOVE_PUNC = 0
|
| 26 |
+
SPLIT_LENGTH = 60
|
| 27 |
+
SPLIT_OVERLAP = 10
|
| 28 |
+
RESPECT_SENTENCE_BOUNDARY = 1
|
| 29 |
+
TOP_KEY = 10
|
| 30 |
+
|
| 31 |
+
[adapmit]
|
| 32 |
+
THRESHOLD = 0.50
|
| 33 |
+
MODEL = ppsingh/mpnet-adaptation_mitigation-classifier
|
| 34 |
+
SPLIT_BY = word
|
| 35 |
+
REMOVE_PUNC = 0
|
| 36 |
+
SPLIT_LENGTH = 60
|
| 37 |
+
SPLIT_OVERLAP = 10
|
| 38 |
+
RESPECT_SENTENCE_BOUNDARY = 1
|
| 39 |
+
TOP_KEY = 10
|
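Each section above shares the same keys (model, splitting strategy, threshold, top-k). A minimal sketch of reading one section with the standard library; the repository's own helper for this is get_classifier_params in utils/config.py further down in this commit.

import configparser

config = configparser.ConfigParser()
config.read("paramconfig.cfg")

# Values from the [target] section defined above.
model_name = config.get("target", "MODEL")                 # mtyrrell/ikitracs_economywide
threshold = float(config.get("target", "THRESHOLD"))       # 0.50
split_length = int(config.get("target", "SPLIT_LENGTH"))   # 60 words per paragraph
respect_sentences = bool(int(config.get("target", "RESPECT_SENTENCE_BOUNDARY")))
print(model_name, threshold, split_length, respect_sentences)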
requirements.txt
ADDED
|
@@ -0,0 +1,19 @@
| 1 |
+
farm-haystack == 1.16
|
| 2 |
+
farm-haystack[ocr,pdf]==1.16.0
|
| 3 |
+
spacy==3.2.0
|
| 4 |
+
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
|
| 5 |
+
matplotlib==3.5.1
|
| 6 |
+
nltk==3.7
|
| 7 |
+
numpy==1.22.1
|
| 8 |
+
pandas==1.4.0
|
| 9 |
+
pdfplumber==0.6.2
|
| 10 |
+
Pillow==9.1.1
|
| 11 |
+
seaborn==0.11.2
|
| 12 |
+
transformers==4.25.1
|
| 13 |
+
st-annotated-text==3.0.0
|
| 14 |
+
markdown==3.4.1
|
| 15 |
+
summa==1.2.0
|
| 16 |
+
plotly
|
| 17 |
+
xlsxwriter
|
| 18 |
+
streamlit-aggrid
|
| 19 |
+
python-docx
|
style.css
ADDED
|
@@ -0,0 +1,180 @@
| 1 |
+
|
| 2 |
+
.row-widget.stTextInput > div:first-of-type {
|
| 3 |
+
background: #fff;
|
| 4 |
+
display: flex;
|
| 5 |
+
border: 1px solid #dfe1e5;
|
| 6 |
+
box-shadow: none;
|
| 7 |
+
border-radius: 24px;
|
| 8 |
+
height: 50px;
|
| 9 |
+
width: auto;
|
| 10 |
+
margin: 10px auto 30px;
|
| 11 |
+
}
|
| 12 |
+
|
| 13 |
+
.row-widget.stTextInput > div:first-of-type:hover,
|
| 14 |
+
.row-widget.stTextInput > div:first-of-type:focus {
|
| 15 |
+
box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
.row-widget.stTextInput .st-bq {
|
| 19 |
+
background-color: #fff;
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
.row-widget.stTextInput > label {
|
| 23 |
+
color: #b3b3b3;
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
.row-widget.stButton > button {
|
| 27 |
+
border-radius: 24px;
|
| 28 |
+
background-color: #B6C9B1;
|
| 29 |
+
color: #fff;
|
| 30 |
+
border: none;
|
| 31 |
+
padding: 6px 20px;
|
| 32 |
+
float: right;
|
| 33 |
+
background-image: none;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
.row-widget.stButton > button:hover {
|
| 37 |
+
box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
.row-widget.stButton > button:focus {
|
| 41 |
+
border: none;
|
| 42 |
+
color: #fff;
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
.footer-custom {
|
| 46 |
+
position: fixed;
|
| 47 |
+
bottom: 0;
|
| 48 |
+
width: 100%;
|
| 49 |
+
color: var(--text-color);
|
| 50 |
+
max-width: 698px;
|
| 51 |
+
font-size: 14px;
|
| 52 |
+
height: 50px;
|
| 53 |
+
padding: 10px 0;
|
| 54 |
+
z-index: 50;
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
.main {
|
| 58 |
+
padding: 20px;
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
footer {
|
| 62 |
+
display: none !important;
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
.footer-custom a {
|
| 66 |
+
color: var(--text-color);
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
#wikipedia-assistant {
|
| 70 |
+
font-size: 36px;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
.generated-answer p {
|
| 74 |
+
font-size: 16px;
|
| 75 |
+
font-weight: bold;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
.react-json-view {
|
| 79 |
+
margin: 40px 0 80px;
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
.tooltip {
|
| 83 |
+
text-align: center;
|
| 84 |
+
line-height: 20px;
|
| 85 |
+
display: table-caption;
|
| 86 |
+
font-size: 10px;
|
| 87 |
+
border-radius: 50%;
|
| 88 |
+
height: 20px;
|
| 89 |
+
width: 20px;
|
| 90 |
+
position: relative;
|
| 91 |
+
cursor: pointer;
|
| 92 |
+
color:#000;
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
.tooltip .tooltiptext {
|
| 96 |
+
visibility: hidden;
|
| 97 |
+
width: 280px;
|
| 98 |
+
text-align: center;
|
| 99 |
+
border-radius: 6px;
|
| 100 |
+
padding: 10px;
|
| 101 |
+
position: absolute;
|
| 102 |
+
z-index: 1;
|
| 103 |
+
top: 25px;
|
| 104 |
+
left: 50%;
|
| 105 |
+
margin-left: -140px;
|
| 106 |
+
font-size: 14px;
|
| 107 |
+
background-color: #fff;
|
| 108 |
+
border: 1px solid #ccc;
|
| 109 |
+
box-shadow: 0px 0px 3px 1px rgba(0, 0, 0, 0.16);
|
| 110 |
+
color: #000;
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
.tooltip:hover .tooltiptext {
|
| 114 |
+
visibility: visible;
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
.sentence-wrapper {
|
| 118 |
+
border-left: 4px solid #ffc423;
|
| 119 |
+
padding-left: 20px;
|
| 120 |
+
margin-bottom: 40px;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
#context {
|
| 124 |
+
padding: 2rem 0 1rem;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
hr {
|
| 128 |
+
margin: 2em 0 1em;
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
.technical-details-info {
|
| 133 |
+
margin-bottom: 100px;
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
.loader-wrapper {
|
| 137 |
+
display: flex;
|
| 138 |
+
align-items: center;
|
| 139 |
+
background-color: rgba(250, 202, 43, 0.2);
|
| 140 |
+
padding: 15px 20px;
|
| 141 |
+
border-radius: 6px;
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
.loader-wrapper p {
|
| 145 |
+
margin-bottom: 0;
|
| 146 |
+
margin-left: 20px;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
.loader {
|
| 150 |
+
width: 30px;
|
| 151 |
+
height: 30px;
|
| 152 |
+
border: dotted 5px #868686;
|
| 153 |
+
border-radius: 100%;
|
| 154 |
+
animation: spin 1s linear infinite;
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
.loader-note {
|
| 158 |
+
font-size: 14px;
|
| 159 |
+
color: #b3b3b3;
|
| 160 |
+
margin-left: 5px;
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
@keyframes spin {
|
| 164 |
+
0% {
|
| 165 |
+
transform: rotate(0deg) scale(0.8);
|
| 166 |
+
border-top-color: transparent;
|
| 167 |
+
border-right-color: transparent;
|
| 168 |
+
}
|
| 169 |
+
50% { transform: rotate(180deg) scale(1.2);
|
| 170 |
+
border-color: #949494;
|
| 171 |
+
border-top-color: transparent;
|
| 172 |
+
border-right-color: transparent;
|
| 173 |
+
}
|
| 174 |
+
100% { transform: rotate(360deg) scale(0.8);
|
| 175 |
+
border-color: #bbbbbb;
|
| 176 |
+
border-top-color: transparent;
|
| 177 |
+
border-right-color: transparent;
|
| 178 |
+
}
|
| 179 |
+
}
|
| 180 |
+
|
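How the stylesheet above is attached to the app is not shown in this section; a common Streamlit pattern (an assumption here, not a quote from app.py) is to inline it with st.markdown:

import streamlit as st

def local_css(file_name: str) -> None:
    # Inline a CSS file into the rendered Streamlit page.
    with open(file_name) as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

local_css("style.css")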
utils/__init__.py
ADDED
|
@@ -0,0 +1 @@
| 1 |
+
# adding for package implementation
|
utils/adapmit_classifier.py
ADDED
|
@@ -0,0 +1,136 @@
| 1 |
+
from haystack.schema import Document
|
| 2 |
+
from typing import List, Tuple
|
| 3 |
+
from typing_extensions import Literal
|
| 4 |
+
import logging
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from pandas import DataFrame, Series
|
| 7 |
+
from utils.config import getconfig
|
| 8 |
+
from utils.preprocessing import processingpipeline
|
| 9 |
+
import streamlit as st
|
| 10 |
+
from haystack.nodes import TransformersDocumentClassifier
|
| 11 |
+
from transformers import pipeline
|
| 12 |
+
|
| 13 |
+
@st.cache_resource
|
| 14 |
+
def load_adapmitClassifier(config_file:str = None, classifier_name:str = None):
|
| 15 |
+
"""
|
| 16 |
+
loads the document classifier using haystack, where the name/path of model
|
| 17 |
+
in HF-hub as string is used to fetch the model object. Either config file or
|
| 18 |
+
model should be passed.
|
| 19 |
+
1. https://docs.haystack.deepset.ai/reference/document-classifier-api
|
| 20 |
+
2. https://docs.haystack.deepset.ai/docs/document_classifier
|
| 21 |
+
Params
|
| 22 |
+
--------
|
| 23 |
+
config_file: config file path from which to read the model name
|
| 24 |
+
classifier_name: if modelname is passed, it takes a priority if not \
|
| 25 |
+
found then will look for configfile, else raise error.
|
| 26 |
+
Return: document classifier model
|
| 27 |
+
"""
|
| 28 |
+
if not classifier_name:
|
| 29 |
+
if not config_file:
|
| 30 |
+
logging.warning("Pass either model name or config file")
|
| 31 |
+
return
|
| 32 |
+
else:
|
| 33 |
+
config = getconfig(config_file)
|
| 34 |
+
classifier_name = config.get('adapmit','MODEL')
|
| 35 |
+
|
| 36 |
+
logging.info("Loading Adaptation Mitigation classifier")
|
| 37 |
+
# doc_classifier = TransformersDocumentClassifier(
|
| 38 |
+
# model_name_or_path=classifier_name,
|
| 39 |
+
# task="text-classification",
|
| 40 |
+
# top_k = None)
|
| 41 |
+
doc_classifier = pipeline("text-classification",
|
| 42 |
+
model=classifier_name,
|
| 43 |
+
return_all_scores=True,
|
| 44 |
+
function_to_apply= "sigmoid")
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
return doc_classifier
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def runAdapMitPreprocessingPipeline(file_name:str, file_path:str,
|
| 51 |
+
split_by: Literal["sentence", "word"] = 'sentence',
|
| 52 |
+
split_length:int = 2, split_respect_sentence_boundary:bool = False,
|
| 53 |
+
split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
|
| 54 |
+
"""
|
| 55 |
+
creates the pipeline and runs the preprocessing pipeline,
|
| 56 |
+
the params for pipeline are fetched from paramconfig
|
| 57 |
+
Params
|
| 58 |
+
------------
|
| 59 |
+
file_name: filename, in case of streamlit application use
|
| 60 |
+
st.session_state['filename']
|
| 61 |
+
file_path: filepath, in case of streamlit application use st.session_state['filepath']
|
| 62 |
+
split_by: document splitting strategy either as word or sentence
|
| 63 |
+
split_length: when synthetically creating the paragraphs from document,
|
| 64 |
+
it defines the length of paragraph.
|
| 65 |
+
split_respect_sentence_boundary: Used when using 'word' strategy for
|
| 66 |
+
splitting of text.
|
| 67 |
+
split_overlap: Number of words or sentences that overlap when creating
|
| 68 |
+
the paragraphs. This is done as one sentence or 'some words' make sense
|
| 69 |
+
when read in together with others. Therefore the overlap is used.
|
| 70 |
+
remove_punc: to remove all Punctuation including ',' and '.' or not
|
| 71 |
+
Return
|
| 72 |
+
--------------
|
| 73 |
+
List[Document]: When preprocessing pipeline is run, the output dictionary
|
| 74 |
+
has four objects. For the Haystack implementation of the classification, we
|
| 75 |
+
need to use the List of Haystack Document, which can be fetched by
|
| 76 |
+
key = 'documents' on output.
|
| 77 |
+
"""
|
| 78 |
+
|
| 79 |
+
adapmit_processing_pipeline = processingpipeline()
|
| 80 |
+
|
| 81 |
+
output_adapmit_pre = adapmit_processing_pipeline.run(file_paths = file_path,
|
| 82 |
+
params= {"FileConverter": {"file_path": file_path, \
|
| 83 |
+
"file_name": file_name},
|
| 84 |
+
"UdfPreProcessor": {"remove_punc": remove_punc, \
|
| 85 |
+
"split_by": split_by, \
|
| 86 |
+
"split_length":split_length,\
|
| 87 |
+
"split_overlap": split_overlap, \
|
| 88 |
+
"split_respect_sentence_boundary":split_respect_sentence_boundary}})
|
| 89 |
+
|
| 90 |
+
return output_adapmit_pre
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
@st.cache_data
|
| 94 |
+
def adapmit_classification(haystack_doc:List[Document],
|
| 95 |
+
threshold:float = 0.5,
|
| 96 |
+
classifier_model:pipeline= None
|
| 97 |
+
)->Tuple[DataFrame,Series]:
|
| 98 |
+
"""
|
| 99 |
+
Text-Classification on the list of texts provided. Classifier provides the
|
| 100 |
+
most appropriate label for each text. These labels indicate whether the text
|
| 101 |
+
relates to climate change Adaptation, Mitigation, or both.
|
| 102 |
+
Params
|
| 103 |
+
---------
|
| 104 |
+
haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
|
| 105 |
+
contains the list of paragraphs in different format,here the list of
|
| 106 |
+
Haystack Documents is used.
|
| 107 |
+
threshold: threshold value for the model to keep the results from classifier
|
| 108 |
+
classifiermodel: you can pass the classifier model directly,which takes priority
|
| 109 |
+
however if not then looks for model in streamlit session.
|
| 110 |
+
In case of streamlit avoid passing the model directly.
|
| 111 |
+
Returns
|
| 112 |
+
----------
|
| 113 |
+
df: Dataframe with a 'text' column and one score column per predicted label
|
| 114 |
+
x: Series object with the unique SDG covered in the document uploaded and
|
| 115 |
+
the number of times it is covered/discussed/count_of_paragraphs.
|
| 116 |
+
"""
|
| 117 |
+
logging.info("Working on Adaptation-Mitigation Identification")
|
| 118 |
+
if not classifier_model:
|
| 119 |
+
classifier_model = st.session_state['adapmit_classifier']
|
| 120 |
+
|
| 121 |
+
predictions = classifier_model(haystack_doc)
|
| 122 |
+
# converting the predictions to desired format
|
| 123 |
+
list_ = []
|
| 124 |
+
for i in range(len(predictions)):
|
| 125 |
+
|
| 126 |
+
temp = predictions[i]
|
| 127 |
+
placeholder = {}
|
| 128 |
+
for j in range(len(temp)):
|
| 129 |
+
placeholder[temp[j]['label']] = temp[j]['score']
|
| 130 |
+
list_.append(placeholder)
|
| 131 |
+
labels_ = [{**{'text':haystack_doc[l]},**list_[l]} for l in range(len(predictions))]
|
| 132 |
+
# labels_= [{**l.meta['classification']['details'],**{'text':l.content}} for l in results]
|
| 133 |
+
df = DataFrame.from_dict(labels_)
|
| 134 |
+
df = df.round(2)
|
| 135 |
+
|
| 136 |
+
return df
|
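A rough usage sketch of the module above, chaining preprocessing and classification on one of the sample documents. The real wiring lives in appStore/adapmit.py (not shown in this section), so the way paragraph texts are pulled out of the preprocessing output is an assumption.

from utils.adapmit_classifier import (load_adapmitClassifier,
                                      runAdapMitPreprocessingPipeline,
                                      adapmit_classification)

classifier = load_adapmitClassifier(config_file="paramconfig.cfg")

file_path = "docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt"
output = runAdapMitPreprocessingPipeline(
    file_name="Ethiopia_s_2021_10 Year Development Plan.txt",
    file_path=file_path,
    split_by="word", split_length=60, split_overlap=10,
    split_respect_sentence_boundary=True)

# Assumption: the preprocessing output's 'documents' entry holds Haystack
# Documents whose .content is the paragraph text fed to the classifier.
paragraphs = [doc.content for doc in output["documents"]]
df = adapmit_classification(paragraphs, threshold=0.5, classifier_model=classifier)
print(df.head())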
utils/config.py
ADDED
|
@@ -0,0 +1,31 @@
| 1 |
+
import configparser
|
| 2 |
+
import logging
|
| 3 |
+
|
| 4 |
+
def getconfig(configfile_path:str):
|
| 5 |
+
"""
|
| 6 |
+
configfile_path: file path of .cfg file
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
config = configparser.ConfigParser()
|
| 10 |
+
|
| 11 |
+
try:
|
| 12 |
+
config.read_file(open(configfile_path))
|
| 13 |
+
return config
|
| 14 |
+
except:
|
| 15 |
+
logging.warning("config file not found")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# Declare all the necessary variables
|
| 19 |
+
def get_classifier_params(model_name):
|
| 20 |
+
config = getconfig('paramconfig.cfg')
|
| 21 |
+
params = {}
|
| 22 |
+
params['model_name'] = config.get(model_name,'MODEL')
|
| 23 |
+
params['split_by'] = config.get(model_name,'SPLIT_BY')
|
| 24 |
+
params['split_length'] = int(config.get(model_name,'SPLIT_LENGTH'))
|
| 25 |
+
params['split_overlap'] = int(config.get(model_name,'SPLIT_OVERLAP'))
|
| 26 |
+
params['remove_punc'] = bool(int(config.get(model_name,'REMOVE_PUNC')))
|
| 27 |
+
params['split_respect_sentence_boundary'] = bool(int(config.get(model_name,'RESPECT_SENTENCE_BOUNDARY')))
|
| 28 |
+
params['threshold'] = float(config.get(model_name,'THRESHOLD'))
|
| 29 |
+
params['top_n'] = int(config.get(model_name,'TOP_KEY'))
|
| 30 |
+
|
| 31 |
+
return params
|
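Example use of the helper above; the section names correspond to paramconfig.cfg earlier in this commit.

from utils.config import get_classifier_params

params = get_classifier_params("netzero")
# e.g. model_name: ilaria-oneofftech/ikitracks_netzero, split_by: word,
#      split_length: 60, split_overlap: 10, threshold: 0.5, top_n: 10
print(params["model_name"], params["threshold"])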
utils/netzero_classifier.py
ADDED
|
@@ -0,0 +1,137 @@
| 1 |
+
from haystack.nodes import TransformersDocumentClassifier
|
| 2 |
+
from haystack.schema import Document
|
| 3 |
+
from typing import List, Tuple
|
| 4 |
+
from typing_extensions import Literal
|
| 5 |
+
import logging
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from pandas import DataFrame, Series
|
| 8 |
+
from utils.config import getconfig
|
| 9 |
+
from utils.preprocessing import processingpipeline
|
| 10 |
+
import streamlit as st
|
| 11 |
+
|
| 12 |
+
# Labels dictionary ###
|
| 13 |
+
_lab_dict = {
|
| 14 |
+
'NEGATIVE':'NO NETZERO TARGET',
|
| 15 |
+
'NETZERO':'NETZERO TARGET',
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
@st.cache_resource
|
| 19 |
+
def load_netzeroClassifier(config_file:str = None, classifier_name:str = None):
|
| 20 |
+
"""
|
| 21 |
+
loads the document classifier using haystack, where the name/path of model
|
| 22 |
+
in HF-hub as string is used to fetch the model object. Either config file or
|
| 23 |
+
model should be passed.
|
| 24 |
+
1. https://docs.haystack.deepset.ai/reference/document-classifier-api
|
| 25 |
+
2. https://docs.haystack.deepset.ai/docs/document_classifier
|
| 26 |
+
Params
|
| 27 |
+
--------
|
| 28 |
+
config_file: config file path from which to read the model name
|
| 29 |
+
classifier_name: if modelname is passed, it takes a priority if not \
|
| 30 |
+
found then will look for configfile, else raise error.
|
| 31 |
+
Return: document classifier model
|
| 32 |
+
"""
|
| 33 |
+
if not classifier_name:
|
| 34 |
+
if not config_file:
|
| 35 |
+
logging.warning("Pass either model name or config file")
|
| 36 |
+
return
|
| 37 |
+
else:
|
| 38 |
+
config = getconfig(config_file)
|
| 39 |
+
classifier_name = config.get('netzero','MODEL')
|
| 40 |
+
|
| 41 |
+
logging.info("Loading netzero classifier")
|
| 42 |
+
doc_classifier = TransformersDocumentClassifier(
|
| 43 |
+
model_name_or_path=classifier_name,
|
| 44 |
+
task="text-classification")
|
| 45 |
+
|
| 46 |
+
return doc_classifier
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def runNetZeroPreprocessingPipeline(file_name:str, file_path:str,
|
| 50 |
+
split_by: Literal["sentence", "word"] = 'sentence',
|
| 51 |
+
split_length:int = 2, split_respect_sentence_boundary:bool = False,
|
| 52 |
+
split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
|
| 53 |
+
"""
|
| 54 |
+
creates the pipeline and runs the preprocessing pipeline,
|
| 55 |
+
the params for pipeline are fetched from paramconfig
|
| 56 |
+
Params
|
| 57 |
+
------------
|
| 58 |
+
file_name: filename, in case of streamlit application use
|
| 59 |
+
st.session_state['filename']
|
| 60 |
+
file_path: filepath, in case of streamlit application use st.session_state['filepath']
|
| 61 |
+
split_by: document splitting strategy either as word or sentence
|
| 62 |
+
split_length: when synthetically creating the paragraphs from document,
|
| 63 |
+
it defines the length of paragraph.
|
| 64 |
+
split_respect_sentence_boundary: Used when using 'word' strategy for
|
| 65 |
+
splitting of the text.
|
| 66 |
+
split_overlap: Number of words or sentences that overlap when creating
|
| 67 |
+
the paragraphs. This is done as one sentence or 'some words' make sense
|
| 68 |
+
when read in together with others. Therefore the overlap is used.
|
| 69 |
+
remove_punc: to remove all Punctuation including ',' and '.' or not
|
| 70 |
+
Return
|
| 71 |
+
--------------
|
| 72 |
+
List[Document]: When preprocessing pipeline is run, the output dictionary
|
| 73 |
+
has four objects. For the Haystack implementation of net-zero classification, we
|
| 74 |
+
need to use the List of Haystack Document, which can be fetched by
|
| 75 |
+
key = 'documents' on output.
|
| 76 |
+
"""
|
| 77 |
+
|
| 78 |
+
netzero_processing_pipeline = processingpipeline()
|
| 79 |
+
|
| 80 |
+
output_netzero_pre = netzero_processing_pipeline.run(file_paths = file_path,
|
| 81 |
+
params= {"FileConverter": {"file_path": file_path, \
|
| 82 |
+
"file_name": file_name},
|
| 83 |
+
"UdfPreProcessor": {"remove_punc": remove_punc, \
|
| 84 |
+
"split_by": split_by, \
|
| 85 |
+
"split_length":split_length,\
|
| 86 |
+
"split_overlap": split_overlap, \
|
| 87 |
+
"split_respect_sentence_boundary":split_respect_sentence_boundary}})
|
| 88 |
+
|
| 89 |
+
return output_netzero_pre
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
@st.cache_data
|
| 93 |
+
def netzero_classification(haystack_doc:List[Document],
|
| 94 |
+
threshold:float = 0.8,
|
| 95 |
+
classifier_model:TransformersDocumentClassifier= None
|
| 96 |
+
)->Tuple[DataFrame,Series]:
|
| 97 |
+
"""
|
| 98 |
+
Text-Classification on the list of texts provided. Classifier provides the
|
| 99 |
+
most appropriate label for each text. The labels indicate whether the text
|
| 100 |
+
contains a net-zero target.
|
| 101 |
+
Params
|
| 102 |
+
---------
|
| 103 |
+
haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
|
| 104 |
+
contains the list of paragraphs in different formats; here the list of
|
| 105 |
+
Haystack Documents is used.
|
| 106 |
+
threshold: threshold value for the model to keep the results from classifier
|
| 107 |
+
classifier_model: the classifier model can be passed directly, which takes priority;
|
| 108 |
+
if not, the model is looked up in the Streamlit session state.
|
| 109 |
+
In a Streamlit app, avoid passing the model directly.
|
| 110 |
+
Returns
|
| 111 |
+
----------
|
| 112 |
+
df: DataFrame with columns ['Target Label', 'Relevancy', 'page', 'text', 'Label_def'],
|
| 113 |
+
sorted by relevancy. The label-count Series mentioned in the return type is
|
| 114 |
+
currently not produced (that computation is commented out below), so only df is returned.
|
| 115 |
+
"""
|
| 116 |
+
logging.info("Working on Netzero Extraction")
|
| 117 |
+
if not classifier_model:
|
| 118 |
+
classifier_model = st.session_state['netzero_classifier']
|
| 119 |
+
|
| 120 |
+
results = classifier_model.predict(haystack_doc)
|
| 121 |
+
labels_= [(l.meta['classification']['label'],
|
| 122 |
+
l.meta['classification']['score'],l.meta['page'],l.content,) for l in results]
|
| 123 |
+
|
| 124 |
+
df = DataFrame(labels_, columns=["Target Label","Relevancy", "page","text"])
|
| 125 |
+
|
| 126 |
+
df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
|
| 127 |
+
df.index += 1
|
| 128 |
+
# df =df[df['Relevancy']>threshold]
|
| 129 |
+
df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])
|
| 130 |
+
|
| 131 |
+
# creating the dataframe for value counts of Labels, along with 'title' of Labels
|
| 132 |
+
# count_df = df['Target Label'].value_counts()
|
| 133 |
+
# count_df = count_df.rename('count')
|
| 134 |
+
# count_df = count_df.rename_axis('Target Label').reset_index()
|
| 135 |
+
# count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
|
| 136 |
+
|
| 137 |
+
return df
|
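For orientation, here is a minimal, hypothetical sketch of how an app page (such as appStore/netzero.py in this commit) can wire these functions together inside the Streamlit app. It assumes paramconfig.cfg contains a [netzero] MODEL entry and that the upload helper has already stored the chosen file in st.session_state; the snippet itself is not part of the commit.

import streamlit as st
from utils.netzero_classifier import (load_netzeroClassifier,
                                      runNetZeroPreprocessingPipeline,
                                      netzero_classification)

# Load (and cache) the classifier once; keep it in session state so
# netzero_classification can fall back to it.
if 'netzero_classifier' not in st.session_state:
    st.session_state['netzero_classifier'] = load_netzeroClassifier(
        config_file='paramconfig.cfg')

if 'filepath' in st.session_state:
    # Convert the uploaded file and split it into 2-sentence paragraphs.
    preprocessed = runNetZeroPreprocessingPipeline(
        file_name=st.session_state['filename'],
        file_path=st.session_state['filepath'],
        split_by='sentence', split_length=2)
    # Without an explicit classifier_model, the session-state model is used.
    df = netzero_classification(preprocessed['documents'])
    st.dataframe(df[['Label_def', 'Relevancy', 'page', 'text']])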
utils/preprocessing.py
ADDED
|
@@ -0,0 +1,275 @@
| 1 |
+
from haystack.nodes.base import BaseComponent
|
| 2 |
+
from haystack.schema import Document
|
| 3 |
+
from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
|
| 4 |
+
from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
|
| 5 |
+
from typing import Callable, Dict, List, Optional, Text, Tuple, Union
|
| 6 |
+
from typing_extensions import Literal
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import logging
|
| 9 |
+
import re
|
| 10 |
+
import string
|
| 11 |
+
from haystack.pipelines import Pipeline
|
| 12 |
+
|
| 13 |
+
def useOCR(file_path: str)-> Text:
|
| 14 |
+
"""
|
| 15 |
+
Converts image PDFs into text using farm-haystack[ocr].
|
| 16 |
+
|
| 17 |
+
Params
|
| 18 |
+
----------
|
| 19 |
+
file_path: file path of the uploaded file, returned by the add_upload function in
|
| 20 |
+
uploadAndExample.py
|
| 21 |
+
|
| 22 |
+
Returns the extracted text as a string.
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
converter = PDFToTextOCRConverter(remove_numeric_tables=True,
|
| 27 |
+
valid_languages=["eng"])
|
| 28 |
+
docs = converter.convert(file_path=file_path, meta=None)
|
| 29 |
+
return docs[0].content
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class FileConverter(BaseComponent):
|
| 35 |
+
"""
|
| 36 |
+
Wrapper class to convert an uploaded document into text by calling the appropriate
|
| 37 |
+
converter class; internally it will use Haystack's PDFToTextOCRConverter in case of an image
|
| 38 |
+
PDF. We cannot use the FileClassifier from Haystack as it doesn't have any
|
| 39 |
+
label/output class for images.
|
| 40 |
+
|
| 41 |
+
1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
|
| 42 |
+
2. https://docs.haystack.deepset.ai/docs/file_converters
|
| 43 |
+
3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
|
| 44 |
+
4. https://docs.haystack.deepset.ai/reference/file-converters-api
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
"""
|
| 48 |
+
|
| 49 |
+
outgoing_edges = 1
|
| 50 |
+
|
| 51 |
+
def run(self, file_name: str , file_path: str, encoding: Optional[str]=None,
|
| 52 |
+
id_hash_keys: Optional[List[str]] = None,
|
| 53 |
+
) -> Tuple[dict,str]:
|
| 54 |
+
""" this is required method to invoke the component in
|
| 55 |
+
the pipeline implementation.
|
| 56 |
+
|
| 57 |
+
Params
|
| 58 |
+
----------
|
| 59 |
+
file_name: name of file
|
| 60 |
+
file_path: file path of the uploaded file, returned by the add_upload function in
|
| 61 |
+
uploadAndExample.py
|
| 62 |
+
|
| 63 |
+
See the links provided in Class docstring/description to see other params
|
| 64 |
+
|
| 65 |
+
Return
|
| 66 |
+
---------
|
| 67 |
+
output: dictionary, with the key as identifier and the value being whatever
|
| 68 |
+
we need to return. In this case it is the List of Haystack Documents.
|
| 69 |
+
|
| 70 |
+
output_1: As there is only one outgoing edge, we pass 'output_1' string
|
| 71 |
+
"""
|
| 72 |
+
try:
|
| 73 |
+
if file_name.endswith('.pdf'):
|
| 74 |
+
converter = PDFToTextConverter(remove_numeric_tables=True)
|
| 75 |
+
if file_name.endswith('.txt'):
|
| 76 |
+
converter = TextConverter(remove_numeric_tables=True)
|
| 77 |
+
if file_name.endswith('.docx'):
|
| 78 |
+
converter = DocxToTextConverter()
|
| 79 |
+
except Exception as e:
|
| 80 |
+
logging.error(e)
|
| 81 |
+
return
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
documents = []
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
# encoding is empty, probably should be utf-8
|
| 89 |
+
document = converter.convert(
|
| 90 |
+
file_path=file_path, meta=None,
|
| 91 |
+
encoding=encoding, id_hash_keys=id_hash_keys
|
| 92 |
+
)[0]
|
| 93 |
+
|
| 94 |
+
text = document.content
|
| 95 |
+
|
| 96 |
+
# in case of a scanned/image-only PDF the content might contain only
|
| 97 |
+
# the page separator (\f or \x0c). We check if this is the case and
|
| 98 |
+
# use OCR to get the text.
|
| 99 |
+
filtered = re.sub(r'\x0c', '', text)
|
| 100 |
+
|
| 101 |
+
if filtered == "":
|
| 102 |
+
logging.info("Using OCR")
|
| 103 |
+
text = useOCR(file_path)
|
| 104 |
+
|
| 105 |
+
documents.append(Document(content=text,
|
| 106 |
+
meta={"name": file_name},
|
| 107 |
+
id_hash_keys=id_hash_keys))
|
| 108 |
+
|
| 109 |
+
logging.info('file conversion successful')
|
| 110 |
+
output = {'documents': documents}
|
| 111 |
+
return output, 'output_1'
|
| 112 |
+
|
| 113 |
+
def run_batch(self):
|
| 114 |
+
"""
|
| 115 |
+
we don't have a requirement to process multiple files in one go,
|
| 116 |
+
therefore nothing happens here; however, to use the custom node we need to have
|
| 117 |
+
this method on the class.
|
| 118 |
+
"""
|
| 119 |
+
|
| 120 |
+
return
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def basic(s:str, remove_punc:bool = False):
|
| 124 |
+
|
| 125 |
+
"""
|
| 126 |
+
Performs basic cleaning of text.
|
| 127 |
+
|
| 128 |
+
Params
|
| 129 |
+
----------
|
| 130 |
+
s: string to be processed
|
| 131 |
+
remove_punc: whether to remove all punctuation, including ',' and '.', or not
|
| 132 |
+
|
| 133 |
+
Returns: processed string: see comments in the source code for more info
|
| 134 |
+
"""
|
| 135 |
+
|
| 136 |
+
# Remove URLs
|
| 137 |
+
s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
|
| 138 |
+
s = re.sub(r"http\S+", " ", s)
|
| 139 |
+
|
| 140 |
+
# Remove new line characters
|
| 141 |
+
s = re.sub('\n', ' ', s)
|
| 142 |
+
|
| 143 |
+
# Remove punctuations
|
| 144 |
+
if remove_punc == True:
|
| 145 |
+
translator = str.maketrans(' ', ' ', string.punctuation)
|
| 146 |
+
s = s.translate(translator)
|
| 147 |
+
# Remove distracting single quotes and dotted pattern
|
| 148 |
+
s = re.sub("\'", " ", s)
|
| 149 |
+
s = s.replace("..","")
|
| 150 |
+
|
| 151 |
+
return s.strip()
|
| 152 |
+
|
| 153 |
+
def paraLengthCheck(paraList, max_len = 512):
|
| 154 |
+
new_para_list = []
|
| 155 |
+
for passage in paraList:
|
| 156 |
+
if len(passage.split()) > max_len:
|
| 157 |
+
iterations = int(len(passage.split())/max_len)
|
| 158 |
+
# # st.write("Splitting")
|
| 159 |
+
for i in range(iterations):
|
| 160 |
+
temp = " ".join(passage.split()[max_len*i:max_len*(i+1)])
|
| 161 |
+
new_para_list.append(temp)
|
| 162 |
+
temp = " ".join(passage.split()[max_len*(i+1):])
|
| 163 |
+
new_para_list.append(temp)
|
| 164 |
+
else:
|
| 165 |
+
new_para_list.append(passage)
|
| 166 |
+
|
| 167 |
+
return new_para_list
|
| 168 |
+
|
| 169 |
+
class UdfPreProcessor(BaseComponent):
|
| 170 |
+
"""
|
| 171 |
+
class to preprocess the document returned by FileConverter. It will check
|
| 172 |
+
the splitting strategy, split the document by word or sentence and then
|
| 173 |
+
synthetically create the paragraphs.
|
| 174 |
+
|
| 175 |
+
1. https://docs.haystack.deepset.ai/docs/preprocessor
|
| 176 |
+
2. https://docs.haystack.deepset.ai/reference/preprocessor-api
|
| 177 |
+
3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
|
| 178 |
+
|
| 179 |
+
"""
|
| 180 |
+
outgoing_edges = 1
|
| 181 |
+
|
| 182 |
+
def run(self, documents:List[Document], remove_punc:bool=False,
|
| 183 |
+
split_by: Literal["sentence", "word"] = 'sentence',
|
| 184 |
+
split_length:int = 2, split_respect_sentence_boundary:bool = False,
|
| 185 |
+
split_overlap:int = 0):
|
| 186 |
+
|
| 187 |
+
""" this is required method to invoke the component in
|
| 188 |
+
the pipeline implementation.
|
| 189 |
+
|
| 190 |
+
Params
|
| 191 |
+
----------
|
| 192 |
+
documents: documents from the output dictionary returned by FileConverter
|
| 193 |
+
remove_punc: to remove all Punctuation including ',' and '.' or not
|
| 194 |
+
split_by: document splitting strategy either as word or sentence
|
| 195 |
+
split_length: when synthetically creating the paragraphs from the document,
|
| 196 |
+
it defines the length of each paragraph.
|
| 197 |
+
split_respect_sentence_boundary: Used when using 'word' strategy for
|
| 198 |
+
splitting of the text.
|
| 199 |
+
split_overlap: Number of words or sentences that overlap when creating
|
| 200 |
+
the paragraphs. This is done as one sentence or 'some words' make sense
|
| 201 |
+
when read in together with others. Therefore the overlap is used.
|
| 202 |
+
|
| 203 |
+
Return
|
| 204 |
+
---------
|
| 205 |
+
output: dictionary, with key as identifier and value could be anything
|
| 206 |
+
we need to return. In this case the output will contain 4 objects:
|
| 207 |
+
the paragraph texts as a List, the Haystack Documents, a DataFrame and
|
| 208 |
+
the raw text as one string.
|
| 209 |
+
|
| 210 |
+
output_1: As there is only one outgoing edge, we pass 'output_1' string
|
| 211 |
+
|
| 212 |
+
"""
|
| 213 |
+
|
| 214 |
+
if split_by == 'sentence':
|
| 215 |
+
split_respect_sentence_boundary = False
|
| 216 |
+
|
| 217 |
+
else:
|
| 218 |
+
split_respect_sentence_boundary = split_respect_sentence_boundary
|
| 219 |
+
|
| 220 |
+
preprocessor = PreProcessor(
|
| 221 |
+
clean_empty_lines=True,
|
| 222 |
+
clean_whitespace=True,
|
| 223 |
+
clean_header_footer=True,
|
| 224 |
+
split_by=split_by,
|
| 225 |
+
split_length=split_length,
|
| 226 |
+
split_respect_sentence_boundary= split_respect_sentence_boundary,
|
| 227 |
+
split_overlap=split_overlap,
|
| 228 |
+
|
| 229 |
+
# will add page number only in case of PDF not for text/docx file.
|
| 230 |
+
add_page_number=True
|
| 231 |
+
)
|
| 232 |
+
|
| 233 |
+
for i in documents:
|
| 234 |
+
# # basic cleaning before passing it to preprocessor.
|
| 235 |
+
# i = basic(i)
|
| 236 |
+
docs_processed = preprocessor.process([i])
|
| 237 |
+
for item in docs_processed:
|
| 238 |
+
item.content = basic(item.content, remove_punc= remove_punc)
|
| 239 |
+
|
| 240 |
+
df = pd.DataFrame(docs_processed)
|
| 241 |
+
all_text = " ".join(df.content.to_list())
|
| 242 |
+
para_list = df.content.to_list()
|
| 243 |
+
logging.info('document split into {} paragraphs'.format(len(para_list)))
|
| 244 |
+
output = {'documents': docs_processed,
|
| 245 |
+
'dataframe': df,
|
| 246 |
+
'text': all_text,
|
| 247 |
+
'paraList': para_list
|
| 248 |
+
}
|
| 249 |
+
return output, "output_1"
|
| 250 |
+
def run_batch(self):
|
| 251 |
+
"""
|
| 252 |
+
we don't have a requirement to process multiple files in one go,
|
| 253 |
+
therefore nothing happens here; however, to use the custom node we need to have
|
| 254 |
+
this method on the class.
|
| 255 |
+
"""
|
| 256 |
+
return
|
| 257 |
+
|
| 258 |
+
def processingpipeline():
|
| 259 |
+
"""
|
| 260 |
+
Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcessor
|
| 261 |
+
from utils.preprocessing
|
| 262 |
+
|
| 263 |
+
"""
|
| 264 |
+
|
| 265 |
+
preprocessing_pipeline = Pipeline()
|
| 266 |
+
file_converter = FileConverter()
|
| 267 |
+
custom_preprocessor = UdfPreProcessor()
|
| 268 |
+
|
| 269 |
+
preprocessing_pipeline.add_node(component=file_converter,
|
| 270 |
+
name="FileConverter", inputs=["File"])
|
| 271 |
+
preprocessing_pipeline.add_node(component = custom_preprocessor,
|
| 272 |
+
name ='UdfPreProcessor', inputs=["FileConverter"])
|
| 273 |
+
|
| 274 |
+
return preprocessing_pipeline
|
| 275 |
+
|
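As a quick sanity check of this module on its own, the pipeline can be run directly on one of the bundled sample documents. The snippet below is only an illustrative sketch: the sample path exists under docStore/sample in this commit, while the split settings are arbitrary choices.

from utils.preprocessing import processingpipeline

pipeline = processingpipeline()

sample = 'docStore/sample/Seychelles-revised_first_ndc-EN.pdf'
result = pipeline.run(
    file_paths=sample,
    params={"FileConverter": {"file_path": sample,
                              "file_name": 'Seychelles-revised_first_ndc-EN.pdf'},
            "UdfPreProcessor": {"remove_punc": False,
                                "split_by": 'sentence',
                                "split_length": 2,
                                "split_overlap": 0}})

# The UdfPreProcessor node returns four objects in its output dictionary.
docs = result['documents']       # List of Haystack Documents (one per paragraph)
df = result['dataframe']         # pandas DataFrame of the processed Documents
full_text = result['text']       # all cleaned paragraphs joined into one string
paragraphs = result['paraList']  # the paragraph texts as a plain Python list
print(len(paragraphs), 'paragraphs')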
utils/sector_classifier.py
ADDED
|
@@ -0,0 +1,146 @@
| 1 |
+
from haystack.schema import Document
|
| 2 |
+
from typing import List, Tuple
|
| 3 |
+
from typing_extensions import Literal
|
| 4 |
+
import logging
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from pandas import DataFrame, Series
|
| 7 |
+
from utils.config import getconfig
|
| 8 |
+
from utils.preprocessing import processingpipeline
|
| 9 |
+
import streamlit as st
|
| 10 |
+
from haystack.nodes import TransformersDocumentClassifier
|
| 11 |
+
from transformers import pipeline
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# # Labels dictionary ###
|
| 15 |
+
# _lab_dict = {
|
| 16 |
+
# 'NEGATIVE':'NO NETZERO TARGET',
|
| 17 |
+
# 'NETZERO':'NETZERO TARGET',
|
| 18 |
+
# }
|
| 19 |
+
|
| 20 |
+
@st.cache_resource
|
| 21 |
+
def load_sectorClassifier(config_file:str = None, classifier_name:str = None):
|
| 22 |
+
"""
|
| 23 |
+
Loads the document classifier using Haystack, where the name/path of the model
|
| 24 |
+
on the HF hub is passed as a string to fetch the model object. Either a config file or a
|
| 25 |
+
model name should be passed.
|
| 26 |
+
1. https://docs.haystack.deepset.ai/reference/document-classifier-api
|
| 27 |
+
2. https://docs.haystack.deepset.ai/docs/document_classifier
|
| 28 |
+
Params
|
| 29 |
+
--------
|
| 30 |
+
config_file: config file path from which to read the model name
|
| 31 |
+
classifier_name: if a model name is passed, it takes priority; if not \
|
| 32 |
+
provided, the config file is used to look it up; if neither is given, a warning is logged.
|
| 33 |
+
Return: document classifier model
|
| 34 |
+
"""
|
| 35 |
+
if not classifier_name:
|
| 36 |
+
if not config_file:
|
| 37 |
+
logging.warning("Pass either model name or config file")
|
| 38 |
+
return
|
| 39 |
+
else:
|
| 40 |
+
config = getconfig(config_file)
|
| 41 |
+
classifier_name = config.get('sector','MODEL')
|
| 42 |
+
|
| 43 |
+
logging.info("Loading sector classifier")
|
| 44 |
+
# we are using the pipeline as the model is multilabel and the DocumentClassifier
|
| 45 |
+
# from Haystack doesn't support multilabel.
|
| 46 |
+
# in the pipeline we use 'sigmoid' to explicitly make it multilabel;
|
| 47 |
+
# if not, it will automatically use softmax, which is not desired here.
|
| 48 |
+
# doc_classifier = TransformersDocumentClassifier(
|
| 49 |
+
# model_name_or_path=classifier_name,
|
| 50 |
+
# task="text-classification",
|
| 51 |
+
# top_k = None)
|
| 52 |
+
|
| 53 |
+
doc_classifier = pipeline("text-classification",
|
| 54 |
+
model=classifier_name,
|
| 55 |
+
return_all_scores=True,
|
| 56 |
+
function_to_apply= "sigmoid")
|
| 57 |
+
|
| 58 |
+
return doc_classifier
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def runSectorPreprocessingPipeline(file_name:str, file_path:str,
|
| 62 |
+
split_by: Literal["sentence", "word"] = 'sentence',
|
| 63 |
+
split_length:int = 2, split_respect_sentence_boundary:bool = False,
|
| 64 |
+
split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
|
| 65 |
+
"""
|
| 66 |
+
creates the pipeline and runs the preprocessing pipeline,
|
| 67 |
+
the params for pipeline are fetched from paramconfig
|
| 68 |
+
Params
|
| 69 |
+
------------
|
| 70 |
+
file_name: filename, in case of streamlit application use
|
| 71 |
+
st.session_state['filename']
|
| 72 |
+
file_path: filepath, in case of streamlit application use st.session_state['filepath']
|
| 73 |
+
split_by: document splitting strategy either as word or sentence
|
| 74 |
+
split_length: when synthetically creating the paragraphs from the document,
|
| 75 |
+
it defines the length of each paragraph.
|
| 76 |
+
split_respect_sentence_boundary: Used when using 'word' strategy for
|
| 77 |
+
splitting of the text.
|
| 78 |
+
split_overlap: Number of words or sentences that overlap when creating
|
| 79 |
+
the paragraphs. This is done as one sentence or 'some words' make sense
|
| 80 |
+
when read in together with others. Therefore the overlap is used.
|
| 81 |
+
remove_punc: to remove all Punctuation including ',' and '.' or not
|
| 82 |
+
Return
|
| 83 |
+
--------------
|
| 84 |
+
List[Document]: When preprocessing pipeline is run, the output dictionary
|
| 85 |
+
has four objects. For the Haystack implementation of sector classification, we
|
| 86 |
+
need to use the List of Haystack Document, which can be fetched by
|
| 87 |
+
key = 'documents' on output.
|
| 88 |
+
"""
|
| 89 |
+
|
| 90 |
+
sector_processing_pipeline = processingpipeline()
|
| 91 |
+
|
| 92 |
+
output_sector_pre = sector_processing_pipeline.run(file_paths = file_path,
|
| 93 |
+
params= {"FileConverter": {"file_path": file_path, \
|
| 94 |
+
"file_name": file_name},
|
| 95 |
+
"UdfPreProcessor": {"remove_punc": remove_punc, \
|
| 96 |
+
"split_by": split_by, \
|
| 97 |
+
"split_length":split_length,\
|
| 98 |
+
"split_overlap": split_overlap, \
|
| 99 |
+
"split_respect_sentence_boundary":split_respect_sentence_boundary}})
|
| 100 |
+
|
| 101 |
+
return output_sector_pre
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
@st.cache_data
|
| 105 |
+
def sector_classification(haystack_doc:List[Document],
|
| 106 |
+
threshold:float = 0.8,
|
| 107 |
+
classifier_model:TransformersDocumentClassifier= None
|
| 108 |
+
)->Tuple[DataFrame,Series]:
|
| 109 |
+
"""
|
| 110 |
+
Text-Classification on the list of texts provided. Classifier provides the
|
| 111 |
+
most appropriate label for each text. The labels indicate which
|
| 112 |
+
sectors the text relates to (multi-label).
|
| 113 |
+
Params
|
| 114 |
+
---------
|
| 115 |
+
haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
|
| 116 |
+
contains the list of paragraphs in different formats; here the list of
|
| 117 |
+
Haystack Documents is used.
|
| 118 |
+
threshold: threshold value for the model to keep the results from classifier
|
| 119 |
+
classifier_model: the classifier model can be passed directly, which takes priority;
|
| 120 |
+
if not, the model is looked up in the Streamlit session state.
|
| 121 |
+
In a Streamlit app, avoid passing the model directly.
|
| 122 |
+
Returns
|
| 123 |
+
----------
|
| 124 |
+
df: DataFrame with a 'text' column plus one column per sector label holding
|
| 125 |
+
the classifier score (rounded to two decimals). The label-count Series in the
|
| 126 |
+
return type is not produced; only df is returned.
|
| 127 |
+
"""
|
| 128 |
+
logging.info("Working on Sector Identification")
|
| 129 |
+
if not classifier_model:
|
| 130 |
+
classifier_model = st.session_state['sector_classifier']
|
| 131 |
+
|
| 132 |
+
predictions = classifier_model(haystack_doc)
|
| 133 |
+
list_ = []
|
| 134 |
+
for i in range(len(predictions)):
|
| 135 |
+
|
| 136 |
+
temp = predictions[i]
|
| 137 |
+
placeholder = {}
|
| 138 |
+
for j in range(len(temp)):
|
| 139 |
+
placeholder[temp[j]['label']] = temp[j]['score']
|
| 140 |
+
list_.append(placeholder)
|
| 141 |
+
labels_ = [{**{'text':haystack_doc[l]},**list_[l]} for l in range(len(predictions))]
|
| 142 |
+
# labels_= [{**l.meta['classification']['details'],**{'text':l.content}} for l in results]
|
| 143 |
+
df = DataFrame.from_dict(labels_)
|
| 144 |
+
df = df.round(2)
|
| 145 |
+
|
| 146 |
+
return df
|
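A hedged usage sketch for the multi-label sector classifier: because load_sectorClassifier returns a plain transformers pipeline rather than a Haystack DocumentClassifier, the paragraph strings ('paraList') are passed in, and the resulting wide DataFrame (one score column per sector label, whatever labels the [sector] model in paramconfig.cfg defines) is filtered by a score threshold. The wiring and the 0.6 cut-off are illustrative assumptions, not part of the commit.

import streamlit as st
from utils.sector_classifier import (load_sectorClassifier,
                                     runSectorPreprocessingPipeline,
                                     sector_classification)

if 'sector_classifier' not in st.session_state:
    st.session_state['sector_classifier'] = load_sectorClassifier(
        config_file='paramconfig.cfg')

preprocessed = runSectorPreprocessingPipeline(
    file_name=st.session_state['filename'],
    file_path=st.session_state['filepath'])

# Pass the raw paragraph strings to the transformers pipeline.
df = sector_classification(preprocessed['paraList'])

# One column per sector label; keep paragraphs where any sector clears 0.6.
score_cols = [c for c in df.columns if c != 'text']
st.dataframe(df[df[score_cols].max(axis=1) > 0.6])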
utils/target_classifier.py
ADDED
|
@@ -0,0 +1,138 @@
| 1 |
+
from haystack.nodes import TransformersDocumentClassifier
|
| 2 |
+
from haystack.schema import Document
|
| 3 |
+
from typing import List, Tuple
|
| 4 |
+
from typing_extensions import Literal
|
| 5 |
+
import logging
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from pandas import DataFrame, Series
|
| 8 |
+
from utils.config import getconfig
|
| 9 |
+
from utils.preprocessing import processingpipeline
|
| 10 |
+
import streamlit as st
|
| 11 |
+
|
| 12 |
+
## Labels dictionary ###
|
| 13 |
+
_lab_dict = {
|
| 14 |
+
'LABEL_0':'NO TARGET INFO',
|
| 15 |
+
'LABEL_1':'ECONOMY-WIDE TARGET',
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
@st.cache_resource
|
| 19 |
+
def load_targetClassifier(config_file:str = None, classifier_name:str = None):
|
| 20 |
+
"""
|
| 21 |
+
Loads the document classifier using Haystack, where the name/path of the model
|
| 22 |
+
on the HF hub is passed as a string to fetch the model object. Either a config file or a
|
| 23 |
+
model name should be passed.
|
| 24 |
+
1. https://docs.haystack.deepset.ai/reference/document-classifier-api
|
| 25 |
+
2. https://docs.haystack.deepset.ai/docs/document_classifier
|
| 26 |
+
Params
|
| 27 |
+
--------
|
| 28 |
+
config_file: config file path from which to read the model name
|
| 29 |
+
classifier_name: if a model name is passed, it takes priority; if not \
|
| 30 |
+
provided, the config file is used to look it up; if neither is given, a warning is logged.
|
| 31 |
+
Return: document classifier model
|
| 32 |
+
"""
|
| 33 |
+
if not classifier_name:
|
| 34 |
+
if not config_file:
|
| 35 |
+
logging.warning("Pass either model name or config file")
|
| 36 |
+
return
|
| 37 |
+
else:
|
| 38 |
+
config = getconfig(config_file)
|
| 39 |
+
classifier_name = config.get('target','MODEL')
|
| 40 |
+
|
| 41 |
+
logging.info("Loading classifier")
|
| 42 |
+
doc_classifier = TransformersDocumentClassifier(
|
| 43 |
+
model_name_or_path=classifier_name,
|
| 44 |
+
task="text-classification")
|
| 45 |
+
|
| 46 |
+
return doc_classifier
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def runTargetPreprocessingPipeline(file_name:str, file_path:str,
|
| 50 |
+
split_by: Literal["sentence", "word"] = 'sentence',
|
| 51 |
+
split_length:int = 2, split_respect_sentence_boundary:bool = False,
|
| 52 |
+
split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
|
| 53 |
+
"""
|
| 54 |
+
creates the pipeline and runs the preprocessing pipeline,
|
| 55 |
+
the params for pipeline are fetched from paramconfig
|
| 56 |
+
Params
|
| 57 |
+
------------
|
| 58 |
+
file_name: filename, in case of streamlit application use
|
| 59 |
+
st.session_state['filename']
|
| 60 |
+
file_path: filepath, in case of streamlit application use st.session_state['filepath']
|
| 61 |
+
split_by: document splitting strategy either as word or sentence
|
| 62 |
+
split_length: when synthetically creating the paragraphs from the document,
|
| 63 |
+
it defines the length of each paragraph.
|
| 64 |
+
split_respect_sentence_boundary: Used when using 'word' strategy for
|
| 65 |
+
splitting of the text.
|
| 66 |
+
split_overlap: Number of words or sentences that overlap when creating
|
| 67 |
+
the paragraphs. This is done as one sentence or 'some words' make sense
|
| 68 |
+
when read in together with others. Therefore the overlap is used.
|
| 69 |
+
remove_punc: to remove all Punctuation including ',' and '.' or not
|
| 70 |
+
Return
|
| 71 |
+
--------------
|
| 72 |
+
List[Document]: When preprocessing pipeline is run, the output dictionary
|
| 73 |
+
has four objects. For the Haystack implementation of target classification, we
|
| 74 |
+
need to use the List of Haystack Document, which can be fetched by
|
| 75 |
+
key = 'documents' on output.
|
| 76 |
+
"""
|
| 77 |
+
|
| 78 |
+
target_processing_pipeline = processingpipeline()
|
| 79 |
+
|
| 80 |
+
output_target_pre = target_processing_pipeline.run(file_paths = file_path,
|
| 81 |
+
params= {"FileConverter": {"file_path": file_path, \
|
| 82 |
+
"file_name": file_name},
|
| 83 |
+
"UdfPreProcessor": {"remove_punc": remove_punc, \
|
| 84 |
+
"split_by": split_by, \
|
| 85 |
+
"split_length":split_length,\
|
| 86 |
+
"split_overlap": split_overlap, \
|
| 87 |
+
"split_respect_sentence_boundary":split_respect_sentence_boundary}})
|
| 88 |
+
|
| 89 |
+
return output_target_pre
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
@st.cache_data
|
| 93 |
+
def target_classification(haystack_doc:List[Document],
|
| 94 |
+
threshold:float = 0.8,
|
| 95 |
+
classifier_model:TransformersDocumentClassifier= None
|
| 96 |
+
)->Tuple[DataFrame,Series]:
|
| 97 |
+
"""
|
| 98 |
+
Text-Classification on the list of texts provided. Classifier provides the
|
| 99 |
+
most appropriate label for each text. The labels indicate whether the text
|
| 100 |
+
contains an economy-wide target.
|
| 101 |
+
Params
|
| 102 |
+
---------
|
| 103 |
+
haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
|
| 104 |
+
contains the list of paragraphs in different formats; here the list of
|
| 105 |
+
Haystack Documents is used.
|
| 106 |
+
threshold: threshold value for the model to keep the results from classifier
|
| 107 |
+
classifier_model: the classifier model can be passed directly, which takes priority;
|
| 108 |
+
if not, the model is looked up in the Streamlit session state.
|
| 109 |
+
In a Streamlit app, avoid passing the model directly.
|
| 110 |
+
Returns
|
| 111 |
+
----------
|
| 112 |
+
df: DataFrame with columns ['Target Label', 'Relevancy', 'page', 'text', 'Label_def'],
|
| 113 |
+
sorted by relevancy. The label-count Series mentioned in the return type is
|
| 114 |
+
currently not produced (that computation is commented out below), so only df is returned.
|
| 115 |
+
"""
|
| 116 |
+
logging.info("Working on Target Extraction")
|
| 117 |
+
if not classifier_model:
|
| 118 |
+
classifier_model = st.session_state['target_classifier']
|
| 119 |
+
|
| 120 |
+
results = classifier_model.predict(haystack_doc)
|
| 121 |
+
labels_= [(l.meta['classification']['label'],
|
| 122 |
+
l.meta['classification']['score'],l.meta['page'],l.content,) for l in results]
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
df = DataFrame(labels_, columns=["Target Label","Relevancy","page","text"])
|
| 126 |
+
|
| 127 |
+
df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
|
| 128 |
+
df.index += 1
|
| 129 |
+
# df =df[df['Relevancy']>threshold]
|
| 130 |
+
df['Label_def'] = df['Target Label'].apply(lambda i: _lab_dict[i])
|
| 131 |
+
|
| 132 |
+
# creating the dataframe for value counts of Labels, along with 'title' of Labels
|
| 133 |
+
# count_df = df['Target Label'].value_counts()
|
| 134 |
+
# count_df = count_df.rename('count')
|
| 135 |
+
# count_df = count_df.rename_axis('Target Label').reset_index()
|
| 136 |
+
# count_df['Label_def'] = count_df['Target Label'].apply(lambda x: _lab_dict[x])
|
| 137 |
+
|
| 138 |
+
return df
|
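The target classifier mirrors the net-zero module. The sketch below (again hypothetical, not part of the commit) shows how a page can apply the relevancy threshold itself, since the filter inside target_classification is commented out; it assumes paramconfig.cfg has a [target] MODEL entry and an uploaded file in session state.

import streamlit as st
from utils.target_classifier import (load_targetClassifier,
                                     runTargetPreprocessingPipeline,
                                     target_classification)

if 'target_classifier' not in st.session_state:
    st.session_state['target_classifier'] = load_targetClassifier(
        config_file='paramconfig.cfg')

preprocessed = runTargetPreprocessingPipeline(
    file_name=st.session_state['filename'],
    file_path=st.session_state['filepath'])
df = target_classification(preprocessed['documents'])

# LABEL_1 maps to 'ECONOMY-WIDE TARGET' in _lab_dict; 0.8 is the default threshold.
targets = df[(df['Target Label'] == 'LABEL_1') & (df['Relevancy'] > 0.8)]
st.dataframe(targets[['text', 'page', 'Relevancy', 'Label_def']])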
utils/uploadAndExample.py
ADDED
|
@@ -0,0 +1,33 @@
| 1 |
+
import streamlit as st
|
| 2 |
+
import tempfile
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
def add_upload(choice):
|
| 6 |
+
"""
|
| 7 |
+
Provides the user with the choice to either 'Upload Document' or 'Try Example'.
|
| 8 |
+
Based on the user's choice, runs the Streamlit processes and saves the path and name of
|
| 9 |
+
the file to the Streamlit session_state, from which they can be fetched later.
|
| 10 |
+
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
if choice == 'Upload Document':
|
| 14 |
+
uploaded_file = st.sidebar.file_uploader('Upload the File',
|
| 15 |
+
type=['pdf', 'docx', 'txt'])
|
| 16 |
+
if uploaded_file is not None:
|
| 17 |
+
with tempfile.NamedTemporaryFile(mode="wb", delete = False) as temp:
|
| 18 |
+
bytes_data = uploaded_file.getvalue()
|
| 19 |
+
temp.write(bytes_data)
|
| 20 |
+
st.session_state['filename'] = uploaded_file.name
|
| 21 |
+
st.session_state['filepath'] = temp.name
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
else:
|
| 25 |
+
# listing the options
|
| 26 |
+
with open('docStore/sample/files.json','r') as json_file:
|
| 27 |
+
files = json.load(json_file)
|
| 28 |
+
|
| 29 |
+
option = st.sidebar.selectbox('Select the example document',
|
| 30 |
+
list(files.keys()))
|
| 31 |
+
file_name = file_path = files[option]
|
| 32 |
+
st.session_state['filename'] = file_name
|
| 33 |
+
st.session_state['filepath'] = file_path
|
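Finally, a small, hypothetical sketch of how an app page would drive add_upload: the radio labels match the two branches in the function, and docStore/sample/files.json is expected to map display names to file paths (its exact entries are not shown in this commit view).

import streamlit as st
from utils.uploadAndExample import add_upload

# Let the user choose between uploading a document and a bundled example;
# add_upload then fills st.session_state['filename'] / ['filepath'].
choice = st.sidebar.radio('Document source',
                          ('Upload Document', 'Try Example'))
add_upload(choice)

if 'filepath' in st.session_state:
    st.write('Working on:', st.session_state['filename'])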