Spaces:
Running
Running
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| """ | |
| Sample application to demo the `TextGraphs` library. | |
| see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md | |
| """ | |
| import asyncio | |
| import sys # pylint: disable=W0611 | |
| import traceback | |
| import time | |
| import typing | |
| from icecream import ic # pylint: disable=E0401 | |
| from pyinstrument import Profiler # pylint: disable=E0401 | |
| import matplotlib.pyplot as plt # pylint: disable=E0401 | |
| import pandas as pd # pylint: disable=E0401 | |
| import textgraphs | |
if __name__ == "__main__":
    # sample text to run through the full TextGraphs pipeline
    SRC_TEXT: str = """
Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog.
After the war, Werner fled to America to become famous.
    """

    ## set up
    ## NB: profiler raises handler exceptions when `concur = False`
    debug: bool = False  # True
    concur: bool = True  # False
    profile: bool = True  # False

    if profile:
        profiler: Profiler = Profiler()
        profiler.start()

    try:
        start_time: float = time.time()

        # construct the pipeline: spaCy model, optional NER, Wikimedia KG
        # endpoints, and two relation-inference back-ends (OpenNRE, mREBEL)
        tg: textgraphs.TextGraphs = textgraphs.TextGraphs(
            factory = textgraphs.PipelineFactory(
                spacy_model = textgraphs.SPACY_MODEL,
                ner = None,  #textgraphs.NERSpanMarker(),
                kg = textgraphs.KGWikiMedia(
                    spotlight_api = textgraphs.DBPEDIA_SPOTLIGHT_API,
                    dbpedia_search_api = textgraphs.DBPEDIA_SEARCH_API,
                    dbpedia_sparql_api = textgraphs.DBPEDIA_SPARQL_API,
                    wikidata_api = textgraphs.WIKIDATA_API,
                ),
                infer_rels = [
                    textgraphs.InferRel_OpenNRE(
                        model = textgraphs.OPENNRE_MODEL,
                        max_skip = textgraphs.MAX_SKIP,
                        min_prob = textgraphs.OPENNRE_MIN_PROB,
                    ),
                    textgraphs.InferRel_Rebel(
                        lang = "en_XX",
                        mrebel_model = textgraphs.MREBEL_MODEL,
                    ),
                ],
            ),
        )

        duration: float = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: set up")

        ## NLP parse
        start_time = time.time()

        pipe: textgraphs.Pipeline = tg.create_pipeline(
            SRC_TEXT.strip(),
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: parse text")

        ## collect graph elements from the parse
        start_time = time.time()

        tg.collect_graph_elements(
            pipe,
            debug = debug,
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: collect elements")

        ## perform entity linking
        start_time = time.time()

        tg.perform_entity_linking(
            pipe,
            debug = debug,
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: entity linking")

        ## perform concurrent relation extraction
        start_time = time.time()

        if concur:
            # reuse an already-running loop (e.g. in a notebook/Spaces host);
            # otherwise create one -- and close it afterwards, since a loop
            # built with `new_event_loop()` is never cleaned up automatically
            owns_loop: bool = False

            try:
                loop = asyncio.get_running_loop()
            except RuntimeError:
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)
                owns_loop = True

            try:
                inferred_edges: list = loop.run_until_complete(
                    tg.infer_relations_async(
                        pipe,
                        debug = debug,
                    )
                )
            finally:
                if owns_loop:
                    loop.close()
        else:
            inferred_edges = tg.infer_relations(
                pipe,
                debug = debug,
            )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: relation extraction")

        # tabulate the inferred edges; `pd.DataFrame` (not `from_dict`) is the
        # documented constructor for a list of record dicts
        n_list: list = list(tg.nodes.values())

        df_rel: pd.DataFrame = pd.DataFrame([
            {
                "src": n_list[edge.src_node].text,
                "dst": n_list[edge.dst_node].text,
                "rel": pipe.kg.normalize_prefix(edge.rel),
                "weight": edge.prob,
            }
            for edge in inferred_edges
        ])

        ic(df_rel)

        ## construct the _lemma graph_
        start_time = time.time()

        tg.construct_lemma_graph(
            debug = debug,
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: construct graph")

        ## rank the extracted phrases
        start_time = time.time()

        tg.calc_phrase_ranks(
            pr_alpha = textgraphs.PAGERANK_ALPHA,
            debug = debug,
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: rank phrases")

        ## show the extracted phrase results
        ic(tg.get_phrases_as_df())

        if debug:  # pylint: disable=W0101
            for key, node in tg.nodes.items():
                print(key, node)

            for key, edge in tg.edges.items():
                print(key, edge)

    except Exception as ex:  # pylint: disable=W0718
        ic(ex)
        traceback.print_exc()

    ## transform graph data to a _graph of relations_
    ## NOTE(review): `tg` (and the bindings above) are assigned inside the
    ## `try` block -- if setup fails, the code below raises NameError; the
    ## demo accepts that since the traceback was already printed
    start_time = time.time()

    gor: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations(
        tg,
    )

    gor.seeds(
        debug = False,  # True
    )

    gor.construct_gor(
        debug = False,  # True
    )

    _scores: typing.Dict[ tuple, float ] = gor.get_affinity_scores(
        debug = False,  # True
    )

    duration = round(time.time() - start_time, 3)
    print(f"{duration:7.3f} sec: graph of relations")

    gor.render_gor_plt(_scores)
    plt.show()
    #sys.exit(0)

    ######################################################################
    ## stack profiler report

    if profile:
        profiler.stop()
        profiler.print()

    ## output lemma graph as JSON
    with open("lemma.json", "w", encoding = "utf-8") as fp:
        fp.write(tg.dump_lemma_graph())