import gradio as gr
import requests
import bs4
import lxml
import os
from huggingface_hub import InferenceClient, HfApi
import random
import json
import datetime
import xmltodict
from textblob import TextBlob

os.system("python -m textblob.download_corpora")

from prompts import (
    GET_KEYWORD,
    COMPRESS_HISTORY_PROMPT,
    COMPRESS_DATA_PROMPT,
    COMPRESS_DATA_PROMPT_SMALL,
    PREFIX_ALT,
    PREFIX,
)

client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
reponame = "Omnibus/tmp"
save_data = f'https://huggingface.co/datasets/{reponame}/raw/main/'
token_self = os.environ['HF_TOKEN2']
api = HfApi(token=token_self)
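
# Assumed layout of the backing dataset repo (inferred from the URLs read and
# written below; hypothetical beyond what this file shows):
#   rss1/0-phrase-index.json  -> {"noun phrase": [[timename, article_num], ...], ...}
#   rss1/0-hash-list.json     -> [hash_of_article_dict, ...] used to skip duplicates
#   rss1/<timename>.json      -> [{"<channel title>": [article_dict, ...]}, ...] per crawl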
def parse_action(string: str):
    print("PARSING:")
    print(string)
    assert string.startswith("action:")
    idx = string.find("action_input=")
    print(idx)
    if idx == -1:
        print("idx == -1")
        print(string[8:])
        return string[8:], None
    print("last return:")
    print(string[8 : idx - 1])
    print(string[idx + 13 :].strip("'").strip('"'))
    return string[8 : idx - 1], string[idx + 13 :].strip("'").strip('"')
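
# Illustrative example (parse_action is not called elsewhere in this file): a reply
# such as "action: SEARCH action_input=solar power" parses to ("SEARCH", "solar power");
# a reply with no "action_input=" segment returns (everything after "action: ", None).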
MAX_HISTORY = 100
MAX_DATA = 40000
def format_prompt(message, history):
    prompt = "<s>"
    for user_prompt, bot_response in history:
        prompt += f"[INST] {user_prompt} [/INST]"
        prompt += f" {bot_response}</s> "
    prompt += f"[INST] {message} [/INST]"
    return prompt
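
# format_prompt emits the Mixtral-Instruct chat layout, e.g. with one prior turn:
#   "<s>[INST] hi [/INST] hello</s> [INST] summarize the feed [/INST]"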
def run_gpt(
    prompt_template,
    stop_tokens,
    max_tokens,
    seed,
    purpose,
    prefix_tog,
    **prompt_kwargs,
):
    timestamp = datetime.datetime.now()
    print(seed)
    generate_kwargs = dict(
        temperature=0.9,
        max_new_tokens=max_tokens,
        top_p=0.95,
        repetition_penalty=1.0,
        do_sample=True,
        seed=seed,
    )
    print(f'prefix_tog:: {prefix_tog}')
    if prefix_tog == "normal":
        content = PREFIX.format(
            timestamp=timestamp,
            purpose=purpose,
        ) + prompt_template.format(**prompt_kwargs)
    if prefix_tog == "alternate":
        content = PREFIX_ALT + prompt_template.format(**prompt_kwargs)
    #formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history)
    # The chat-formatted prompt is built but not sent; the raw `content` string is
    # what gets streamed to the model below.
    formatted_prompt = format_prompt(f'{content}', prompt_kwargs.get('history', []))
    stream = client.text_generation(content, **generate_kwargs, stream=True, details=True, return_full_text=False)
    resp = ""
    for response in stream:
        resp += response.token.text
        #yield resp
    return resp
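
# Note: run_gpt is not referenced by any event handler at the bottom of this file;
# summarize() below streams from client.text_generation directly.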
NEWS_REPORTER = """You are attempting to complete the task
task: Concatenate the input News Articles into a more concise JSON file. Keep all relevant data points.
Data:
{new_data}
Compile the data above into a JSON formatted output that contains all data relevant to the task
Include datapoints that will provide greater accuracy in completing the task
Include all relevant information in great detail
Return ONLY the JSON data
output format:
{output_format}
"""

output_format = """{"title": "title of the first article","description": "description of the article","article": "your custom written article","links": "all source links that have contributed to the article"},{"title": "title of the second article","description": "description of the article","article": "your custom written article","links": "all source links that have contributed to the article"}"""
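
# The prompt above asks for bare, comma-separated JSON objects in the shape of
# output_format (no enclosing [] array), so summarize() collects the model output
# as raw strings in out_box rather than parsed objects.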
def summarize(inp, history, seed_slider, data=None):
    json_box = []
    if inp == "":
        inp = "Process this data"
    #inp = format_prompt(inp, history)
    task = "Compile a detailed report"
    history.clear()
    yield "", [(inp, "Working on it...")], None
    if data != "Error" and data != "":
        timestamp = datetime.datetime.now()
        #seed = random.randint(1, 1000000000)
        seed = seed_slider
        print(seed)
        generate_kwargs = dict(
            temperature=0.9,
            max_new_tokens=10240,
            top_p=0.95,
            repetition_penalty=1.0,
            do_sample=True,
            seed=seed,
        )
        out = str(data)
        rl = len(out)
        print(f'rl:: {rl}')
        c = 1
        for i in str(out):
            if i == " " or i == "," or i == "\n" or i == "/" or i == "." or i == "<":
                c += 1
        print(f'c:: {c}')
        divr = int(c) / MAX_DATA
        divi = int(divr) + 1 if divr != int(divr) else int(divr)
        chunk = int(int(c) / divr)
        print(f'chunk:: {chunk}')
        print(f'divr:: {divr}')
        print(f'divi:: {divi}')
        s = 0
        e = chunk
        print(f'e:: {e}')
        out_box = []
        resp = ""
        for z in range(divi):
            print(f's:e :: {s}:{e}')
            mes = f'Working on data chunk: {s}:{e}'
            new_data = out[s:e]
            #yield "", [(inp, f'{mes}\n{new_history}')]
            #content = NEWS_REPORTER.format(output_format=output_format, new_data=str(new_data.replace("{","").replace("}","")))
            content = NEWS_REPORTER.format(output_format=output_format, new_data=str(new_data))
            formatted = format_prompt(content, history)
            stream = client.text_generation(content, **generate_kwargs, stream=True, details=True, return_full_text=False)
            for response in stream:
                resp += response.token.text
                yield "", [(inp, resp)], None
            out_json = resp.replace("\n", "").replace("```", "")
            out_box.append(out_json.strip("</s>"))
            #out_box = eval(out_box)
            print("ADDING")
            yield "", [(inp, resp)], out_box
            e = e + chunk
            s = s + chunk
    else:
        rawp = "Provide a valid data source"
        history.append((inp, rawp))
        yield "", history, None
def find_rss():
    r = requests.get(f'{save_data}rss1/0-phrase-index.json')
    r2 = requests.get(f'{save_data}rss1/0-hash-list.json')
    if r.status_code == 200:
        phrase_dict = json.loads(r.text)
    else:
        phrase_dict = {}
    if r2.status_code == 200:
        hash_list = json.loads(r2.text)
    else:
        hash_list = []
    timestamp = str(datetime.datetime.now())
    timename = timestamp.replace(" ", "--").replace(":", "-").replace(".", "-")
    error_box = []
    error_box_schema = {"Name": "", "Error": "", "Keys": "", "Other": ""}
    lod = ""
    out_box = []
    valid_box = []
    yield [], [(None, "loading sources")], None
    with open('valid_feeds.json', 'r') as j:
        cont = json.loads(j.read())
        #print(cont)
    cnt = 0
    for ea in cont:
        try:
            #lod = ""
            #print(ea['link'])
            if ea.get('link') is not None:
                rss_url = ea['link']
            else:
                rss_url = ea['URL']
            link_box = []
            r = requests.get(f'{rss_url}')
            if r.status_code == 200:
                try:
                    if ".json" in rss_url:
                        lod = json.loads(r.text)
                    elif ".xml" in rss_url:
                        lod = xmltodict.parse(r.content)
                    elif ".rss" in rss_url:
                        lod = xmltodict.parse(r.content)
                    else:
                        try:
                            lod = xmltodict.parse(r.content)
                        except Exception as e:
                            lod = f'{rss_url} ::ERROR:: {e}'
                            error_box.append({"Name": rss_url, "Error": str(e), "Error Code": 1})
                    if ea.get('section') is not None:
                        section = ea['section']
                    else:
                        section = ""
                    valid_box.append({"source": ea['source'], "link": ea['link'], "section": section, "description": ''})
                except Exception as e:
                    lod = f'{rss_url} ::ERROR:: {e}'
                    error_box.append({"Name": rss_url, "Error": str(e), "Error Code": 2})
            else:
                lod = f'{rss_url} ::ERROR::COULD NOT CONNECT:: {r.status_code}'
                error_box.append({"Name": rss_url, "Error": f'Status Code:{r.status_code}', "Error Code": 3})
            try:
                #print(lod['rss']['channel']['item'][0].keys())
                #print(lod['rss'].keys())
                print("##############")
                print("##############")
                for i, ea in enumerate(lod['rss']['channel']['item']):
                    try:
                        r_link = ea['link']
                        if ea.get('title') is not None:
                            r_title = ea['title']
                        else:
                            r_title = ea['source']
                        if ea.get('description') is not None:
                            r_description = ea['description']
                        else:
                            r_description = ""
                        if ea.get('pubDate') is not None:
                            r_date = ea['pubDate']
                        else:
                            r_date = ""
                        if ea.get('media') is not None:
                            r_media = ea['media']
                            print("MEDIA")
                            print(r_media)
                        else:
                            r_media = ""
                        tt = TextBlob(r_title)
                        tt_phrases = tt.noun_phrases
                        td = TextBlob(r_description)
                        td_phrases = td.noun_phrases
                        phrases = tt_phrases + td_phrases
                        lod_hash = {"title": r_title, "date": r_date, "description": r_description, "link": r_link}
                        hash_val = str(hash(str(lod_hash)))
                        if not hash_val in hash_list:
                            for phrase in phrases:
                                if phrase_dict.get(phrase.lower()) is not None:
                                    phrase_dict[phrase.lower()].append([timename, cnt])
                                else:
                                    phrase_dict[phrase.lower()] = [[timename, cnt]]
                            lods = {"num": cnt, "title": r_title, "date": r_date, "description": r_description, "link": r_link, "noun_phrases": phrases}
                            cnt += 1
                    except Exception as e:
                        print(f"Exception::{ea}")
                        error_box.append({"Name": rss_url, "Keys": lod['rss']['channel']['item'], "Error": str(e), "Error Code": 4})
                        print(e)
                        #lods = {"title": "ERROR", "description": {e}, "link": "ERROR"}
                    if not hash_val in hash_list:
                        link_box.append(lods)
                        hash_list.append(hash_val)
                if link_box:
                    lod = {lod['rss']['channel']['title']: link_box}
                    out_box.append(lod)
            except Exception as e:
                error_box.append({"Name": rss_url, "Error": str(e), "Error Code": 5})
                print(f'Exception::{e}')
        except Exception as e:
            error_box.append({"Name": rss_url, "Error": str(e), "Error Code": 6})
            print(f'Exception::{e}')
    '''
    json_object_valid = json.dumps(valid_box, indent=4)
    with open("tmp3.json", "w") as outfile3:
        outfile3.write(json_object_valid)
    api.upload_file(
        path_or_fileobj="tmp3.json",
        path_in_repo=f"/rss/valid-{timename}.json",
        repo_id=reponame,
        #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
        token=token_self,
        repo_type="dataset",
    )
    yield out_box, [(None, '')], error_box
    '''
    print("DONE")
    json_object2 = json.dumps(hash_list, indent=4)
    #json_object = json.dumps(out_box, indent=4)
    with open("tmp32.json", "w") as outfile2:
        outfile2.write(json_object2)
    api.upload_file(
        path_or_fileobj="tmp32.json",
        path_in_repo=f"/rss1/0-hash-list.json",
        repo_id=reponame,
        #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
        token=token_self,
        repo_type="dataset",
    )
    json_object1 = json.dumps(phrase_dict, indent=4)
    with open("tmp3.json", "w") as outfile1:
        outfile1.write(json_object1)
    api.upload_file(
        path_or_fileobj="tmp3.json",
        path_in_repo=f"/rss1/0-phrase-index.json",
        repo_id=reponame,
        #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
        token=token_self,
        repo_type="dataset",
    )
    json_object = json.dumps(out_box, indent=4)
    with open("tmp2.json", "w") as outfile:
        outfile.write(json_object)
    api.upload_file(
        path_or_fileobj="tmp2.json",
        path_in_repo=f"/rss1/{timename}.json",
        repo_id=reponame,
        #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
        token=token_self,
        repo_type="dataset",
    )
    yield out_box, [(None, f'Source is current as of:\n{timestamp} UTC\n\nThe current Date and Time is:\n{timestamp} UTC')], error_box
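
# Shapes of the records find_rss persists (values illustrative):
#   phrase_dict entry: "climate change": [["2024-01-01--12-00-00-000000", 17], ...]
#   hash_list entry:   str(hash(str({"title": ..., "date": ..., "description": ..., "link": ...})))
#   out_box entry:     {"<channel title>": [{"num": 17, "title": ..., "date": ...,
#                       "description": ..., "link": ..., "noun_phrases": [...]}, ...]}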
def load_data(rss_url=None):
    timestamp = str(datetime.datetime.now())
    yield None, None, None, [(None, f'Loading data source, please wait')]
    if rss_url:
        yield None, None, None, [(None, f'Loading data from {rss_url}, please wait')]
        r = requests.get(f'{rss_url}')
        if r.status_code == 200:
            lod = {}
            try:
                if ".json" in rss_url:
                    lod = json.loads(r.text)
                elif ".xml" in rss_url:
                    lod = xmltodict.parse(r.content)
                elif ".rss" in rss_url:
                    lod = xmltodict.parse(r.content)
                else:
                    try:
                        lod = xmltodict.parse(r.content)
                    except Exception as e:
                        yield None, None, None, [(None, f'{rss_url} ::ERROR:: {e}')]
            except Exception as e:
                yield None, None, None, [(None, f'{rss_url} ::ERROR:: {e}')]
            yield lod, None, None, [(None, f'Source is current as of:\n{timestamp} UTC')]
        else:
            yield None, None, None, [(None, f'{rss_url} ::ERROR::COULD NOT CONNECT:: {r.status_code}')]
    if not rss_url:
        yield None, None, None, [(None, f'Loading data from database, please wait')]
        r1 = requests.get(f'{save_data}rss1/0-phrase-index.json')
        if r1.status_code == 200:
            print("STATUS GOOD")
            lod1 = json.loads(r1.text)
            lod2 = list(lod1.keys())
            mes = "Index: Loaded"
            print(mes)
        else:
            lod1 = None
            lod2 = None
            mes = "Index: Not Found"
        f_ist = api.list_repo_files(repo_id=f'{reponame}', repo_type="dataset")
        f_ist.sort(reverse=True)
        #print(f_ist)
        r = requests.get(f'{save_data}{f_ist[0]}')
        lod = json.loads(r.text)
        filename = f_ist[0].split("/")[1].split(".json")[0].replace("--", " ")
        print(filename)
        filename_start = filename.split(" ")[0]
        filename_end = filename.split(" ")[1]
        # Turn the first two dashes back into ":" and the third into "." so the
        # saved timename reads as a normal timestamp again.
        filename_end = filename_end.replace("-", ":", 1).replace("-", ":", 1).replace("-", ".", 1)
        #filename_end_far = filename_end.split(":")[2]
        print(filename)
        #yield lod, lod1, gr.update(choices=[z for z in lod2]), [(None, f'Source is current as of:\n{filename_start} {filename_end} UTC\n\nThe current Date and Time is:\n{timestamp} UTC\n{mes}')]
        yield lod, lod1, lod2, [(None, f'Source is current as of:\n{filename_start} {filename_end} UTC\n\nThe current Date and Time is:\n{timestamp} UTC\n{mes}')]
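
# Filename convention assumed when reading back from the dataset: find_rss saves each
# crawl as rss1/<timename>.json, where <timename> is the timestamp with " " -> "--",
# ":" -> "-" and "." -> "-", e.g. rss1/2024-01-01--12-00-00-000000.json; load_data
# reverses that mapping to display "2024-01-01 12:00:00.000000 UTC".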
def load_html(conv, ind=None, files=None):
    # `files` maps a saved timename to the article numbers selected by search_fn;
    # default to an empty selection so the "Load HTML" button (which only passes
    # `conv`) does not raise.
    ht = f"""<div class="div_box">"""
    #print(files)
    key_list = list(files.keys()) if files else []
    print(key_list)
    for second in key_list:
        r = requests.get(f'{save_data}rss1/{second}.json')
        if r.status_code == 200:
            file_json = json.loads(r.text)
            print("LOADED")
            #print(file_json)
            for th in file_json:
                #print(th)
                for dd in th:
                    print(dd)
                    '''
                    for vv in files[second]:
                        print(vv)
                        try:
                            if th[dd][vv]:
                                #tok = file_json[[th][dd]]
                                tok1 = th[dd][vv]
                                print(tok1)
                                ht += f"""<pre class="bpost"><div class="bhead"><h1>{dd}</h1><h2>{tok1['title']}</h2><br><h5>{tok1['description']}</h5><br>{tok1['date']}<br>{tok1['link']}<br>{tok1['noun_phrases']}</div></pre>"""
                            else:
                                print("PASSING")
                        except Exception as e:
                            print(e)
                    '''
                    for bb in th[dd]:
                        print(bb)
                        print(files[second])
                        #if bb['num'] in files[second]:
                        if bb['num'] in files[second]:
                            print("YES")
                            #for ea in files[second]:
                            #    print(ea)
                            ht += f"""<pre class="bpost"><div class="bhead"><h1>{dd}</h1><h2>{bb['title']}</h2><br><h5>{bb['description']}</h5><br>{bb['date']}<br><a href='{bb['link']}'>{bb['link']}</a></div></pre>"""
    '''
    print("LOAD HTML")
    #print(conv)
    ht = ""
    ht += f"""<div class="div_box">"""
    #this = list(conv.keys())
    for th in conv[:5]:
        print(th)
        for ba in list(th.keys()):
            print(ba)
            for ea in th[ba]:
                print(ea)
                ht += f"""<pre class="bpost"><div class="bhead"><h1>{ba}</h1><h2>{ea['title']}</h2><br><h5>{ea['description']}</h5><br>{ea['date']}<br>{ea['link']}<br>{ea['noun_phrases']}</div></pre>"""
    '''
    ht += f"""</div>"""
    with open('index.html', 'r') as h:
        html = h.read()
        html = html.replace("$body", f"{ht}")
    return html
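
# load_html assumes the Space repo ships an index.html containing a "$body"
# placeholder and the .div_box / .bpost / .bhead styles used above; the generated
# article cards are substituted into that placeholder.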
def search_fn(inp, data, data_keys):
    collected = []
    out_dict = {}
    cnt = 0
    mes = f'{cnt} Found'
    print(data_keys)
    for ea in data_keys:
        if inp.lower() in ea.lower():
            print("DATA")
            print(data[ea])
            print(ea)
            for zz in data[ea]:
                #print(zz)
                print(zz[0])
                print(zz[1])
                # zz is a [timename, article_num] pair from the phrase index;
                # group the article numbers by capture file, skipping repeats.
                if out_dict.get(zz[0]) is not None:
                    if zz[1] not in out_dict[zz[0]]:
                        print("YES")
                        out_dict[zz[0]].append(zz[1])
                else:
                    out_dict[zz[0]] = [zz[1]]
                cnt += 1
            #collected.append(data[ea])
    #cnt = len(list(out_dict.keys()))
    mes = f'{cnt} Found'
    yield mes, out_dict, out_dict
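
# Illustrative search flow (assuming the phrase-index shape noted near the top):
#   data_keys = ["solar power", "wind energy"], inp = "solar" matches "solar power";
#   each [timename, article_num] pair under that key is grouped into out_dict as
#   {timename: [article_num, ...]}, which load_html uses to pick articles out of
#   rss1/<timename>.json.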
with gr.Blocks() as app:
    loaded_data = gr.State()
    loaded_index = gr.State()
    loaded_keys = gr.State()
    selected = gr.State()
    out_html = gr.HTML()
    cb = gr.Chatbot(height=600, show_share_button=True, show_copy_button=True)
    with gr.Row():
        phrase_dd = gr.Dropdown(label="Search", choices=[])
        go_btn = gr.Button("Get News")
    with gr.Row():
        inst = gr.Textbox(label="Search")
        sub_btn = gr.Button("Search")
    mes_html = gr.HTML()
    with gr.Row():
        rss_custom = gr.Textbox(label="URL for RSS feed (.rss,.xml)")
        load_btn = gr.Button("Load RSS")
    with gr.Row():
        load_page = gr.Button("Load HTML")
        seed_slider = gr.Slider(label="Seed", step=1, minimum=1, maximum=9999999999999999999, value=1, interactive=True)
    with gr.Accordion(open=False):
        u_btn = gr.Button("Update [RSS Data]")
        keyw = gr.Button("Use Keyword [Experimental]")
    with gr.Row():
        out_json = gr.JSON()
        #error_box = gr.JSON()
        error_box = gr.JSON()
    fil = gr.Textbox()

    go_btn.click(load_html, [loaded_data, loaded_index, selected], out_html)
    sub_btn.click(search_fn, [inst, loaded_index, loaded_keys], [mes_html, selected, error_box])
    #inst.change(search_fn, [inst, loaded_index, loaded_keys], [mes_html, selected])
    load_page.click(load_html, loaded_data, out_html)
    #keyw.click(get_records, [inst, out_json], [inst, cb])
    app.load(load_data, rss_custom, [loaded_data, loaded_index, loaded_keys, cb])
    #load_btn.click(load_data, rss_custom, [loaded_data, cb])
    u_btn.click(find_rss, None, [out_json, cb, error_box])
    #sub_btn.click(summarize, [inst, cb, seed_slider, out_json], [inst, cb, error_box])

app.queue(default_concurrency_limit=20).launch()