Context-News

Sleeping

File size: 20,750 Bytes

51bd6f0
 
 
 
 
 
 
 
 
 
38e60db
a156ae0
38e60db
07422cb
51bd6f0
9cf40df
51bd6f0
 
 
f590ea3
51bd6f0
 
536027d
eabc351
 
38e60db
536027d
 
51bd6f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9835c49
51bd6f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f590ea3
51bd6f0
 
 
 
 
 
 
 
 
 
 
 
 
fc8e812
f590ea3
 
 
 
 
 
fcf4b1e
f590ea3
51bd6f0
 
29191b2
51bd6f0
 
 
 
 
 
 
 
 
 
126dfaf
6f68250
d127b48
6f68250
 
 
 
 
5396dfd
 
ce9f05c
126dfaf
93ba80e
2e5cb72
041f01f
126dfaf
 
 
 
 
 
 
 
 
 
 
2e5cb72
 
126dfaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b08b82d
 
c08d20c
126dfaf
 
 
 
 
 
 
58f6aaf
126dfaf
 
 
 
 
 
 
 
 
 
 
 
20807ab
5d542ef
041f01f
 
 
5d542ef
 
 
126dfaf
 
 
 
 
 
 
 
 
 
 
041f01f
 
126dfaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d542ef
126dfaf
 
13671d6
 
970e214
5d542ef
 
 
 
 
 
 
 
 
126dfaf
 
 
5189ca7
 
49631f7
 
 
 
 
 
 
 
041f01f
126dfaf
 
 
 
 
 
5d542ef
 
 
34d1ca8
 
 
126dfaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d542ef
 
 
 
 
 
 
 
 
 
 
 
 
1fb2d29
041f01f
 
 
 
 
90c09b5
041f01f
 
 
 
 
126dfaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f929215
126dfaf
f929215
126dfaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b7a16f2
126dfaf
b7a16f2
f929215
126dfaf
f929215
126dfaf
f929215
20807ab
43622dd
e0e4e8a
43622dd
f929215
43622dd
e0e4e8a
43622dd
f929215
 
43622dd
126dfaf
 
 
 
 
 
 
 
 
 
 
 
3383f94
 
206c944
0ad7db1
3383f94
89f1822
6406a7a
d5af420
 
cda2258
 
70e8d7d
 
f923e8a
9580629
89f1822
f923e8a
73c93f6
8133d64
d64db1b
8133d64
1fbf151
cf03a2e
757132d
 
 
 
 
 
 
 
 
 
 
 
89f1822
 
2e4e181
090e7a4
2e4e181
89f1822
d64db1b
 
963b1bb
d64db1b
cda2258
9b8267e
2b0e32d
206c944
 
9f8bfc7
13671d6
2b0e32d
9d1ad21
9edbf2b
1386158
9edbf2b
13671d6
cda2258
206c944
 
 
 
 
 
 
f929215
8d70b86
0ad7db1
7b53cb1
 
fe33986
3383f94
8d70b86
88c0db0
 
f3380fe
5286c84
1eca8e0
74db455
f3380fe
1eca8e0
757132d
88c0db0
d72e6ed
d5af420
 
73c93f6
0ad7db1
73c93f6
0ad7db1
80bfcb7
3383f94
51bd6f0
4e1f6da
43622dd
f929215
3383f94
4e1f6da
561efb9
cda2258
 
6b908be
51bd6f0
3383f94
 
7b53cb1
51bd6f0
853f578
eabc351
4e1f6da
 
 
853f578
9cf40df
 
51bd6f0
 
729d28e
b610a83
54c821f
cda2258
3383f94
80bfcb7
b703140
4e1f6da
 
3383f94
4e1f6da
54c821f
4f3d971
3cfee22

import gradio as gr
import requests
import bs4
import lxml
import os
from huggingface_hub import InferenceClient,HfApi
import random
import json
import datetime
import xmltodict
from textblob import TextBlob
os.system("python -m textblob.download_corpora")


from prompts import (
    GET_KEYWORD,
    COMPRESS_HISTORY_PROMPT,
    COMPRESS_DATA_PROMPT,
    COMPRESS_DATA_PROMPT_SMALL,
    PREFIX_ALT,
    PREFIX,
)
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
reponame="Omnibus/tmp"
save_data=f'https://huggingface.co/datasets/{reponame}/raw/main/'
token_self = os.environ['HF_TOKEN2']
api=HfApi(token=token_self)


def parse_action(string: str):
    print("PARSING:")
    print(string)
    assert string.startswith("action:")
    idx = string.find("action_input=")
    print(idx)
    if idx == -1:
        print ("idx == -1")
        print (string[8:])
        return string[8:], None

    print ("last return:")
    print (string[8 : idx - 1])
    print (string[idx + 13 :].strip("'").strip('"'))
    return string[8 : idx - 1], string[idx + 13 :].strip("'").strip('"')

MAX_HISTORY = 100
MAX_DATA = 40000

def format_prompt(message, history):
  prompt = "<s>"
  for user_prompt, bot_response in history:
    prompt += f"[INST] {user_prompt} [/INST]"
    prompt += f" {bot_response}</s> "
  prompt += f"[INST] {message} [/INST]"
  return prompt

def run_gpt(
    prompt_template,
    stop_tokens,
    max_tokens,
    seed,
    purpose,
    prefix_tog,
    **prompt_kwargs,
):
    timestamp=datetime.datetime.now()

    print(seed)
    generate_kwargs = dict(
        temperature=0.9,
        max_new_tokens=max_tokens,
        top_p=0.95,
        repetition_penalty=1.0,
        do_sample=True,
        seed=seed,
    )
    print(f'prefix_tog:: {prefix_tog}')
    if prefix_tog == "normal":
        content = PREFIX.format(
            timestamp=timestamp,
            purpose=purpose,
        ) + prompt_template.format(**prompt_kwargs)
    if prefix_tog == "alternate":
        content = PREFIX_ALT + prompt_template.format(**prompt_kwargs)
              
    
    #formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history)
    formatted_prompt = format_prompt(f'{content}', **prompt_kwargs['history'])

    stream = client.text_generation(content, **generate_kwargs, stream=True, details=True, return_full_text=False)
    resp = ""
    for response in stream:
        resp += response.token.text
        #yield resp

    return resp



NEWS_REPORTER="""You are attempting to complete the task
task: Concatenate the input News Articles into a more concise JSON file.  Keep all relevant data points.
Data:
{new_data}
Compile the data above into a JSON formatted output that contains all data relevant to the task
Include datapoints that will provide greater accuracy in completing the task
Include all relevant information in great detail 
Return ONLY the JSON data
output format:
{output_format}
"""
output_format="""{"title": "title of the first article","description": "description of the article","article": "your custom written article","links": "all source links that have contributed to the article"},{"title": "title of the second article","description": "description of the article","article": "your custom written article","links": "all source links that have contributed to the article"}"""
def summarize(inp,history,seed_slider,data=None):
    
    json_box=[]
    if inp == "":
        inp = "Process this data"
    #inp = format_prompt(inp,history)
    task = "Compile a detailed report"
    
    history.clear()
    yield "",[(inp,"Working on it...")],None

    if data != "Error" and data != "":
        timestamp=datetime.datetime.now()
        #seed=random.randint(1,1000000000)
        seed=seed_slider
        print(seed)
        generate_kwargs = dict(
            temperature=0.9,
            max_new_tokens=10240,
            top_p=0.95,
            repetition_penalty=1.0,
            do_sample=True,
            seed=seed,
        )
        out = str(data)
        rl = len(out)
        print(f'rl:: {rl}')
        c=1
        for i in str(out):
            if i == " " or i=="," or i=="\n" or i=="/" or i=="." or i=="<":
                c +=1
        print (f'c:: {c}')
        divr=int(c)/MAX_DATA
        divi=int(divr)+1 if divr != int(divr) else int(divr)
        chunk = int(int(c)/divr)
        print(f'chunk:: {chunk}')
        print(f'divr:: {divr}')
        print (f'divi:: {divi}')
        s=0
        e=chunk
        print(f'e:: {e}')
        out_box =[]
        resp = ""
        for z in range(divi):
            print(f's:e :: {s}:{e}')
            mes= f'Working on data chunk: {s}:{e}'
            new_data = out[s:e]
            #yield "", [(inp,f'{mes}\n{new_history}')]
            
            #content = NEWS_REPORTER.format(output_format=output_format,new_data=str(new_data.replace("{","").replace("}","")))
            content = NEWS_REPORTER.format(output_format=output_format,new_data=str(new_data))
            formatted=format_prompt(content,history)
            stream = client.text_generation(content, **generate_kwargs, stream=True, details=True, return_full_text=False)
            for response in stream:
                resp += response.token.text            
                yield "", [(inp,resp)],None
            
            out_json=resp.replace("\n","").replace("```","")
            out_box.append(out_json.strip("</s>"))
            #out_box=eval(out_box)
            print("ADDING")
            yield "", [(inp,resp)],out_box
                           
            e=e+chunk
            s=s+chunk
    else:
        rawp = "Provide a valid data source"
        history.append((inp,rawp))
        yield "", history,None


def find_rss():
    r = requests.get(f'{save_data}rss1/0-phrase-index.json')
    r2 = requests.get(f'{save_data}rss1/0-hash-list.json')
    if r.status_code==200:
        phrase_dict = json.loads(r.text)    
    else: phrase_dict={}
    if r2.status_code==200:
        hash_list = json.loads(r2.text)    
    else: hash_list=[]        
    timestamp=str(datetime.datetime.now())
    timename=timestamp.replace(" ","--").replace(":","-").replace(".","-")
    error_box=[]
    error_box_schema={"Name":"","Error":"","Keys":"","Other":""}
    lod=""
    out_box=[]
    valid_box=[]
    yield [],[(None,"loading sources")],None
    with open ('valid_feeds.json','r') as j:
        cont = json.loads(j.read())
        #print(cont)
    j.close() 
    cnt=0
    for ea in cont:
        try:
            #lod=""
            #print (ea['link'])
            if ea.get('link') is not None:
                rss_url=ea['link']
            else:
                rss_url=ea['URL']
            link_box=[]
            r = requests.get(f'{rss_url}') 
            if r.status_code == 200:
                try:
                    if ".json" in rss_url:
                        lod = json.loads(r.text)
                    if ".xml" in rss_url:
                        lod = xmltodict.parse(r.content)
                    if ".rss" in rss_url:
                        lod = xmltodict.parse(r.content)
                    else:
                        try:
                            lod = xmltodict.parse(r.content)
                        except Exception as e:
                            lod=f'{rss_url} ::ERROR:: {e}'
                            error_box.append({"Name":rss_url,"Error":e,"Error Code":1})
                    if ea.get('section') is not None:
                        section = ea['section']
                    else: section = ""
                    valid_box.append({"source":ea['source'],"link":ea['link'],"section":section,"description":''})
                except Exception as e:
                    lod=f'{rss_url} ::ERROR:: {e}'
                    error_box.append({"Name":rss_url,"Error":e,"Error Code":2})
                    
            else: 
                lod = f'{rss_url} ::ERROR::COULD NOT CONNECT:: {r.status_code}'
                error_box.append({"Name":rss_url,"Error":f'Status Code:{r.status_code}',"Error Code":3})
                pass
            try:
                #print(lod['rss']['channel']['item'][0].keys())
                #print(lod['rss'].keys())
                print("##############")
                #print(lod['rss'].keys())
                print("##############")
                
                for i,ea in enumerate(lod['rss']['channel']['item']):
                    try:
                        r_link = ea['link']
                        if ea.get('title') != None:
                            r_title = ea['title']
                        else: r_title= ea['source']

                        if ea.get('description') != None:
                            r_description = ea['description']
                        else: r_description = ""
                        if ea.get('pubDate') != None:
                            r_date = ea['pubDate']
                        else: r_date = ""      
                        if ea.get('media') != None:
                            r_media = ea['media']
                            print("MEDIA")
                            print(r_media)
                        else: r_media = ""  
                            
                        tt = TextBlob(r_title)
                        tt_phrases=tt.noun_phrases 
                        td = TextBlob(r_description)
                        td_phrases=td.noun_phrases
                        phrases=tt_phrases+td_phrases
                        lod_hash = {"title":r_title, "date":r_date, "description":r_description,"link":r_link}
                        hash_val=str(hash(str(lod_hash)))
                        if not hash_val in hash_list:
                            
                            for phrase in phrases:
                                if phrase_dict.get(phrase)!=None:
                                    phrase_dict[phrase.lower()].append([timename,cnt])
                                else:
                                    phrase_dict[phrase.lower()]=[[timename,cnt]]
                        lods = {"num":cnt,"title":r_title, "date":r_date, "description":r_description,"link":r_link, "noun_phrases":phrases}
                        cnt+=1
                    except Exception as e:
                        print(f"Exception::{ea}")
                        error_box.append({"Name":rss_url,"Keys":lod['rss']['channel']['item'],"Error":e,"Error Code":4})
                        print(e)
                        pass
                        #lods = {"title":"ERROR", "description":{e},"link":"ERROR"}
                    if not hash_val in hash_list:
                        link_box.append(lods)
                        hash_list.append(hash_val)
                if link_box:
                    lod={lod['rss']['channel']['title']:link_box}
                    out_box.append(lod)
                
            except Exception as e:
                error_box.append({"Name":rss_url,"Error":e,"Error Code":5})
                print(f'Exception::{e}')
        except Exception as e:
            error_box.append({"Name":rss_url,"Error":e,"Error Code":6})
            print(f'Exception::{e}')
            pass
        '''
        json_object_valid = json.dumps(valid_box, indent=4)
        with open("tmp3.json", "w") as outfile3:
            outfile3.write(json_object_valid)
        api.upload_file(
        path_or_fileobj="tmp3.json",
        path_in_repo=f"/rss/valid-{timename}.json",
        repo_id=reponame,
        #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
        token=token_self,
        repo_type="dataset",
        )   
        yield out_box,[(None,'')],error_box
        '''
    print("DONE")

    json_object2 = json.dumps(hash_list, indent=4)
    #json_object = json.dumps(out_box,indent=4)
    with open("tmp32.json", "w") as outfile2:
        outfile2.write(json_object2)
    api.upload_file(
    path_or_fileobj="tmp32.json",
    path_in_repo=f"/rss1/0-hash-list.json",
    repo_id=reponame,
    #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
    token=token_self,
    repo_type="dataset",
    )
    json_object1 = json.dumps(phrase_dict, indent=4)
    #json_object = json.dumps(out_box,indent=4)
    with open("tmp3.json", "w") as outfile1:
        outfile1.write(json_object1)
    api.upload_file(
    path_or_fileobj="tmp3.json",
    path_in_repo=f"/rss1/0-phrase-index.json",
    repo_id=reponame,
    #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
    token=token_self,
    repo_type="dataset",
    )
    json_object = json.dumps(out_box, indent=4)
    #json_object = json.dumps(out_box,indent=4)
    with open("tmp2.json", "w") as outfile:
        outfile.write(json_object)
    api.upload_file(
    path_or_fileobj="tmp2.json",
    path_in_repo=f"/rss1/{timename}.json",
    repo_id=reponame,
    #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
    token=token_self,
    repo_type="dataset",
    )
 
    yield out_box,[(None,f'Source is current as of:\n{timestamp} UTC\n\nThe current Date and Time is:\n{timestamp} UTC')],error_box

    
def load_data(rss_url=None):
    timestamp=str(datetime.datetime.now())
    yield None,None,None,[(None,f'Loading data source, please wait')]
    if rss_url:
        yield None,None,None,[(None,f'Loading data from {rss_url}, please wait')]
        
        r = requests.get(f'{rss_url}') 
        if r.status_code == 200:
            lod={}
            try:
                if ".json" in rss_url:
                    lod = json.loads(r.text)
                if ".xml" in rss_url:
                    lod = xmltodict.parse(r.content)
                if ".rss" in rss_url:
                    lod = xmltodict.parse(r.content)
                else:
                    try:
                        lod = xmltodict.parse(r.content)
                    except Exception as e:
                        yield None,None,None, [(None, f'{rss_url} ::ERROR:: {e}')]
            except Exception as e:
                yield None, None,None,[(None, f'{rss_url} ::ERROR:: {e}')]
            yield lod,None,None,[(None,f'Source is current as of:\n{timestamp} UTC')]
        else: 
            yield None,None,None, [(None, f'{rss_url} ::ERROR::COULD NOT CONNECT:: {r.status_code}')]
    if not rss_url:
        yield None,None,None,[(None,f'Loading data from database, please wait')]
        r1 = requests.get(f'{save_data}rss1/0-phrase-index.json') 
        if r1.status_code==200:
            print("STATUS GOOD")
            lod1 = json.loads(r1.text)
            lod2 =list(lod1.keys())
            mes="Index: Loaded"
            print(mes)
        else:
            lod1=None
            lod2=None
            mes="Index: Not Found"
        f_ist = (api.list_repo_files(repo_id=f'{reponame}', repo_type="dataset"))
        f_ist.sort(reverse=True)
        #print(f_ist)
        r = requests.get(f'{save_data}{f_ist[0]}') 
        lod = json.loads(r.text)
        filename=f_ist[0].split("/")[1].split(".json")[0].replace("--"," ")
        print (filename)
        filename_start = filename.split(" ")[0]
        filename_end = filename.split(" ")[1]
        filename_end = filename_end.replace("-"[0],":").replace("-"[0],":").replace("-"[0],".")
        #filename_end_far=filename_end.split(":")[2]
        print (filename)
        #yield lod,lod1,gr.update(choices=[z for z in lod2]),[(None,f'Source is current as of:\n{filename_start} {filename_end} UTC\n\nThe current Date and Time is:\n{timestamp} UTC\n{mes}')]
        yield lod,lod1,lod2,[(None,f'Source is current as of:\n{filename_start} {filename_end} UTC\n\nThe current Date and Time is:\n{timestamp} UTC\n{mes}')]

def load_html(conv,ind,files):
    ht=f"""<div class="div_box">"""
    #print(files)
    key_list=list(files.keys())
    print(key_list)
    for second in key_list:
        r = requests.get(f'{save_data}rss1/{second}.json') 
        if r.status_code==200:
            file_json=json.loads(r.text)
            print("LOADED")
            #print(file_json)
            for th in file_json:
                #print(th)
                for dd in th:
                    print(dd)
                    
                    
                    '''
                    for vv in files[second]:
                        print(vv)
                        try:
                            if th[dd][vv]:
                                #tok=file_json[[th][dd]]
                                tok1=th[dd][vv]
                                print(tok1)
                                
                                ht+=f"""<pre class="bpost"><div class="bhead"><h1>{dd}</h1><h2>{tok1['title']}</h2><br><h5>{tok1['description']}</h5><br>{tok1['date']}<br>{tok1['link']}<br>{tok1['noun_phrases']}</div></pre>"""
                            else:
                                print("PASSING")
                        except Exception as e:
                            print(e)
                    '''
                    for bb in th[dd]:
                        print(bb)
                        print(files[second])
                        #if bb['num'] in files[second]:
                        if bb['num'] in files[second]:
                            print("YES")                
                            #for ea in files[second]:
                            #    print(ea)
                            ht+=f"""<pre class="bpost"><div class="bhead"><h1>{dd}</h1><h2>{bb['title']}</h2><br><h5>{bb['description']}</h5><br>{bb['date']}<br><a href='{bb['link']}'>{bb['link']}</a></div></pre>"""

    '''
    print("LOAD HTML")
    #print(conv)
    ht=""
    ht+=f"""<div class="div_box">"""
    #this = list(conv.keys())
    for th in conv[:5]:
        print(th)
        for ba in list(th.keys()):
            print(ba)
            for ea in th[ba]:
                print(ea)
                ht+=f"""<pre class="bpost"><div class="bhead"><h1>{ba}</h1><h2>{ea['title']}</h2><br><h5>{ea['description']}</h5><br>{ea['date']}<br>{ea['link']}<br>{ea['noun_phrases']}</div></pre>"""
    '''                
    ht+=f"""</div>"""                
 
    with open('index.html','r') as h:
        html=h.read()
        html = html.replace("$body",f"{ht}")
    h.close()
    return html
def search_fn(inp,data,data_keys):
    collected=[]
    out_dict={}
    cnt = 0
    mes=f'{cnt} Found'
    print(data_keys)
    for ea in data_keys:
        if inp.lower() in ea.lower():
            print("DATA")
            print(data[ea])
            print(ea)
            for zz in data[ea]:
                
                #print(zz)
                print(zz[0])
                print(zz[1])
                if out_dict.get(zz[0]) != None and not zz[1] in out_dict:
                    print("YES")
                    out_dict[zz[0]].append(zz[1])
                else:
                    out_dict[zz[0]]=[zz[1]]
                cnt+=1
            #collected.append(data[ea])
    #cnt=len(list(out_dict.keys()))
    mes=f'{cnt} Found'
    yield mes,out_dict,out_dict
    
with gr.Blocks() as app:
    loaded_data=gr.State()
    loaded_index=gr.State()
    loaded_keys=gr.State()
    selected=gr.State()
    out_html=gr.HTML()
    cb = gr.Chatbot(height=600, show_share_button=True, show_copy_button=True)
    with gr.Row():
        phrase_dd=gr.Dropdown(label="Search",choices=[])
        go_btn=gr.Button("Get News")
    with gr.Row():
        inst = gr.Textbox(label="Search")
        sub_btn=gr.Button("Search")
    mes_html=gr.HTML()
    with gr.Row():
        rss_custom=gr.Textbox(label="URL for RSS feed (.rss,.xml)")
        load_btn = gr.Button("Load RSS")
    with gr.Row():
        load_page=gr.Button("Load HTML")
    seed_slider=gr.Slider(label="Seed",step=1,minimum=1,maximum=9999999999999999999,value=1,interactive=True)        
    with gr.Accordion(open=False):
        u_btn=gr.Button("Update [RSS Data]")
        keyw = gr.Button("Use Keyword [Experimental]")
    with gr.Row():
        out_json = gr.JSON()
        #error_box=gr.JSON()
        error_box=gr.JSON()
    fil = gr.Textbox()

    go_btn.click(load_html,[loaded_data,loaded_index,selected],out_html)
    sub_btn.click(search_fn,[inst,loaded_index,loaded_keys],[mes_html,selected,error_box])
    #inst.change(search_fn,[inst,loaded_index,loaded_keys],[mes_html,selected])
    load_page.click(load_html,loaded_data,out_html)
    #keyw.click(get_records,[inst,out_json],[inst,cb])
    app.load(load_data,rss_custom,[loaded_data,loaded_index,loaded_keys,cb])
    #load_btn.click(load_data,rss_custom,[loaded_data,cb])
    u_btn.click(find_rss,None,[out_json,cb,error_box])
    #sub_btn.click(summarize,[inst,cb,seed_slider,out_json],[inst,cb,error_box])
app.queue(default_concurrency_limit=20).launch()