m. polinsky
committed on
Update digestor.py
Browse files- digestor.py +3 -43
digestor.py
CHANGED
|
@@ -174,6 +174,7 @@ class Digestor:
|
|
| 174 |
# API CALLS: consider placing the code from query() into here. * * * *
|
| 175 |
for chunk in chunklist:
|
| 176 |
safe = False
|
|
|
|
| 177 |
with Timer(name=f"{stubhead}_query_time", logger=None):
|
| 178 |
while not safe and repeat < 4:
|
| 179 |
try: # make these digest params.
|
|
@@ -190,7 +191,8 @@ class Digestor:
|
|
| 190 |
print("Summarization error, repeating...")
|
| 191 |
print(e)
|
| 192 |
repeat+=1
|
| 193 |
-
|
|
|
|
| 194 |
return collection_bin
|
| 195 |
|
| 196 |
|
|
@@ -207,46 +209,4 @@ class Digestor:
|
|
| 207 |
for each in self.summaries:
|
| 208 |
digest.append(' '.join(each.summary_text))
|
| 209 |
|
| 210 |
-
# Create dict to write out digest data for analysis
|
| 211 |
-
out_data = {}
|
| 212 |
-
datetime_str = f"""{dt.now()}"""
|
| 213 |
-
choices_str = ', '.join(self.user_choices)
|
| 214 |
-
digest_str = '\n\n'.join(digest)
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
# This is a long comprehension to store all the fields and values in each summary.
|
| 218 |
-
# integer: {
|
| 219 |
-
# name_of_field:value except for source,
|
| 220 |
-
# which is unhashable so needs explicit handling.
|
| 221 |
-
# }
|
| 222 |
-
summaries = { # k is a summary tuple, i,p = enumerate(k)
|
| 223 |
-
# Here we take the first dozen words of the first summary chunk as key
|
| 224 |
-
c: {
|
| 225 |
-
# field name : value unless its the source
|
| 226 |
-
k._fields[i]:p if k._fields[i]!='source'
|
| 227 |
-
else
|
| 228 |
-
{
|
| 229 |
-
'name': k.source.source_name,
|
| 230 |
-
'source_url': k.source.source_url,
|
| 231 |
-
'Summarization" Checkpoint': k.source.source_summarization_checkpoint,
|
| 232 |
-
'NER Checkpoint': k.source.source_ner_checkpoint,
|
| 233 |
-
} for i,p in enumerate(k)
|
| 234 |
-
} for c,k in enumerate(self.summaries)}
|
| 235 |
-
|
| 236 |
-
out_data['timestamp'] = datetime_str
|
| 237 |
-
out_data['choices'] = choices_str
|
| 238 |
-
out_data['digest_text'] = digest_str
|
| 239 |
-
out_data['article_count'] = len(self.summaries)
|
| 240 |
-
out_data['digest_length'] = len(digest_str.split(" "))
|
| 241 |
-
out_data['digest_time'] = self.timer.timers['digest_time']
|
| 242 |
-
out_data['sum_params'] = {
|
| 243 |
-
'token_limit':self.token_limit,
|
| 244 |
-
'word_limit':self.word_limit,
|
| 245 |
-
'params':self.SUMMARIZATION_PARAMETERS,
|
| 246 |
-
}
|
| 247 |
-
out_data['summaries'] = summaries
|
| 248 |
-
|
| 249 |
-
|
| 250 |
self.text = digest_str
|
| 251 |
-
|
| 252 |
-
return out_data
|
|
|
|
| 174 |
# API CALLS: consider placing the code from query() into here. * * * *
|
| 175 |
for chunk in chunklist:
|
| 176 |
safe = False
|
| 177 |
+
summarized_chunk = None
|
| 178 |
with Timer(name=f"{stubhead}_query_time", logger=None):
|
| 179 |
while not safe and repeat < 4:
|
| 180 |
try: # make these digest params.
|
|
|
|
| 191 |
print("Summarization error, repeating...")
|
| 192 |
print(e)
|
| 193 |
repeat+=1
|
| 194 |
+
if summarized_chunk is not None:
|
| 195 |
+
collection_bin.append(summarized_chunk)
|
| 196 |
return collection_bin
|
| 197 |
|
| 198 |
|
|
|
|
| 209 |
for each in self.summaries:
|
| 210 |
digest.append(' '.join(each.summary_text))
|
| 211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
self.text = digest_str
|
|
|
|
|
|