Spaces:
Sleeping
Sleeping
Update utils/sdg_classifier.py
Browse files
- utils/sdg_classifier.py +13 -13
utils/sdg_classifier.py
CHANGED
|
@@ -95,7 +95,7 @@ def classification(haystack_doc:List[Document],
|
|
| 95 |
the number of times it is covered/discussed/count_of_paragraphs.
|
| 96 |
|
| 97 |
"""
|
| 98 |
-
logging.info("Working on
|
| 99 |
if not classifier_model:
|
| 100 |
if check_streamlit():
|
| 101 |
classifier_model = st.session_state['vulnerability_classifier']
|
|
@@ -109,27 +109,27 @@ def classification(haystack_doc:List[Document],
|
|
| 109 |
labels_= [(l.meta['classification']['label'],
|
| 110 |
l.meta['classification']['score'],l.content,) for l in results]
|
| 111 |
|
| 112 |
-
df = DataFrame(labels_, columns=["
|
| 113 |
|
| 114 |
df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
|
| 115 |
df.index += 1
|
| 116 |
df =df[df['Relevancy']>threshold]
|
| 117 |
|
| 118 |
# creating the dataframe for value counts of SDG, along with 'title' of SDGs
|
| 119 |
-
x = df['
|
| 120 |
x = x.rename('count')
|
| 121 |
-
x = x.rename_axis('
|
| 122 |
-
x["Vulnerability"] = pd.to_numeric(x["
|
| 123 |
x = x.sort_values(by=['count'], ascending=False)
|
| 124 |
-
x['
|
| 125 |
-
x['
|
| 126 |
|
| 127 |
-
df['
|
| 128 |
-
df = df.sort_values('
|
| 129 |
|
| 130 |
return df, x
|
| 131 |
|
| 132 |
-
def
|
| 133 |
split_by: Literal["sentence", "word"] = 'sentence',
|
| 134 |
split_length:int = 2, split_respect_sentence_boundary:bool = False,
|
| 135 |
split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
|
|
@@ -163,9 +163,9 @@ def runSDGPreprocessingPipeline(file_name:str, file_path:str,
|
|
| 163 |
|
| 164 |
"""
|
| 165 |
|
| 166 |
-
|
| 167 |
|
| 168 |
-
|
| 169 |
params= {"FileConverter": {"file_path": file_path, \
|
| 170 |
"file_name": file_name},
|
| 171 |
"UdfPreProcessor": {"remove_punc": remove_punc, \
|
|
@@ -174,4 +174,4 @@ def runSDGPreprocessingPipeline(file_name:str, file_path:str,
|
|
| 174 |
"split_overlap": split_overlap, \
|
| 175 |
"split_respect_sentence_boundary":split_respect_sentence_boundary}})
|
| 176 |
|
| 177 |
-
return
|
|
|
|
| 95 |
the number of times it is covered/discussed/count_of_paragraphs.
|
| 96 |
|
| 97 |
"""
|
| 98 |
+
logging.info("Working on vulnerability Classification")
|
| 99 |
if not classifier_model:
|
| 100 |
if check_streamlit():
|
| 101 |
classifier_model = st.session_state['vulnerability_classifier']
|
|
|
|
| 109 |
labels_= [(l.meta['classification']['label'],
|
| 110 |
l.meta['classification']['score'],l.content,) for l in results]
|
| 111 |
|
| 112 |
+
df = DataFrame(labels_, columns=["vulnerability","Relevancy","text"])
|
| 113 |
|
| 114 |
df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
|
| 115 |
df.index += 1
|
| 116 |
df =df[df['Relevancy']>threshold]
|
| 117 |
|
| 118 |
# creating the dataframe of value counts per vulnerability label, along with the name of each label
|
| 119 |
+
x = df['vulnerability'].value_counts()
|
| 120 |
x = x.rename('count')
|
| 121 |
+
x = x.rename_axis('vulnerability').reset_index()
|
| 122 |
+
x["Vulnerability"] = pd.to_numeric(x["vulnerability"])
|
| 123 |
x = x.sort_values(by=['count'], ascending=False)
|
| 124 |
+
x['vulnerability_name'] = x['vulnerability'].apply(lambda x: _lab_dict[x])
|
| 125 |
+
x['vulnerability_Num'] = x['vulnerability'].apply(lambda x: "vulnerability "+str(x))
|
| 126 |
|
| 127 |
+
df['vulnerability'] = pd.to_numeric(df['vulnerability'])
|
| 128 |
+
df = df.sort_values('vulnerability')
|
| 129 |
|
| 130 |
return df, x
|
| 131 |
|
| 132 |
+
def runPreprocessingPipeline(file_name:str, file_path:str,
|
| 133 |
split_by: Literal["sentence", "word"] = 'sentence',
|
| 134 |
split_length:int = 2, split_respect_sentence_boundary:bool = False,
|
| 135 |
split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
|
|
|
|
| 163 |
|
| 164 |
"""
|
| 165 |
|
| 166 |
+
processing_pipeline = processingpipeline()
|
| 167 |
|
| 168 |
+
output_pre = processing_pipeline.run(file_paths = file_path,
|
| 169 |
params= {"FileConverter": {"file_path": file_path, \
|
| 170 |
"file_name": file_name},
|
| 171 |
"UdfPreProcessor": {"remove_punc": remove_punc, \
|
|
|
|
| 174 |
"split_overlap": split_overlap, \
|
| 175 |
"split_respect_sentence_boundary":split_respect_sentence_boundary}})
|
| 176 |
|
| 177 |
+
return output_pre
|