Spaces:
Running
Running
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| """ | |
| Sample application to demo the `TextGraphs` library. | |
| see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md | |
| """ | |
| import asyncio | |
| import sys # pylint: disable=W0611 | |
| import traceback | |
| import time | |
| import typing | |
| from icecream import ic # pylint: disable=E0401 | |
| from pyinstrument import Profiler # pylint: disable=E0401 | |
| import matplotlib.pyplot as plt # pylint: disable=E0401 | |
| import pandas as pd # pylint: disable=E0401 | |
| import textgraphs | |
if __name__ == "__main__":
    # sample text to run through the full TextGraphs pipeline
    SRC_TEXT: str = """
Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog.
After the war, Werner fled to America to become famous.
    """

    ## set up
    ## NB: profiler raises handler exceptions when `concur = False`
    debug: bool = False  # True
    concur: bool = True  # False
    profile: bool = True  # False

    if profile:
        profiler: Profiler = Profiler()
        profiler.start()

    try:
        start_time: float = time.time()

        # construct the pipeline: spaCy model, optional NER, Wikimedia KG
        # endpoints, and two relation-inference back-ends (OpenNRE, mREBEL)
        tg: textgraphs.TextGraphs = textgraphs.TextGraphs(
            factory = textgraphs.PipelineFactory(
                spacy_model = textgraphs.SPACY_MODEL,
                ner = None,  #textgraphs.NERSpanMarker(),
                kg = textgraphs.KGWikiMedia(
                    spotlight_api = textgraphs.DBPEDIA_SPOTLIGHT_API,
                    dbpedia_search_api = textgraphs.DBPEDIA_SEARCH_API,
                    dbpedia_sparql_api = textgraphs.DBPEDIA_SPARQL_API,
                    wikidata_api = textgraphs.WIKIDATA_API,
                ),
                infer_rels = [
                    textgraphs.InferRel_OpenNRE(
                        model = textgraphs.OPENNRE_MODEL,
                        max_skip = textgraphs.MAX_SKIP,
                        min_prob = textgraphs.OPENNRE_MIN_PROB,
                    ),
                    textgraphs.InferRel_Rebel(
                        lang = "en_XX",
                        mrebel_model = textgraphs.MREBEL_MODEL,
                    ),
                ],
            ),
        )

        duration: float = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: set up")

        ## NLP parse
        start_time = time.time()

        pipe: textgraphs.Pipeline = tg.create_pipeline(
            SRC_TEXT.strip(),
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: parse text")

        ## collect graph elements from the parse
        start_time = time.time()

        tg.collect_graph_elements(
            pipe,
            debug = debug,
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: collect elements")

        ## perform entity linking
        start_time = time.time()

        tg.perform_entity_linking(
            pipe,
            debug = debug,
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: entity linking")

        ## perform concurrent relation extraction
        start_time = time.time()

        if concur:
            # reuse an already-running loop (e.g. in a notebook/Spaces host);
            # otherwise create one -- and close it afterwards, since a loop
            # built with `new_event_loop()` is never cleaned up automatically
            owns_loop: bool = False

            try:
                loop = asyncio.get_running_loop()
            except RuntimeError:
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)
                owns_loop = True

            try:
                inferred_edges: list = loop.run_until_complete(
                    tg.infer_relations_async(
                        pipe,
                        debug = debug,
                    )
                )
            finally:
                if owns_loop:
                    loop.close()
        else:
            inferred_edges = tg.infer_relations(
                pipe,
                debug = debug,
            )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: relation extraction")

        # tabulate the inferred edges; `pd.DataFrame` (not `from_dict`) is the
        # documented constructor for a list of record dicts
        n_list: list = list(tg.nodes.values())

        df_rel: pd.DataFrame = pd.DataFrame([
            {
                "src": n_list[edge.src_node].text,
                "dst": n_list[edge.dst_node].text,
                "rel": pipe.kg.normalize_prefix(edge.rel),
                "weight": edge.prob,
            }
            for edge in inferred_edges
        ])

        ic(df_rel)

        ## construct the _lemma graph_
        start_time = time.time()

        tg.construct_lemma_graph(
            debug = debug,
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: construct graph")

        ## rank the extracted phrases
        start_time = time.time()

        tg.calc_phrase_ranks(
            pr_alpha = textgraphs.PAGERANK_ALPHA,
            debug = debug,
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: rank phrases")

        ## show the extracted phrase results
        ic(tg.get_phrases_as_df())

        if debug:  # pylint: disable=W0101
            for key, node in tg.nodes.items():
                print(key, node)

            for key, edge in tg.edges.items():
                print(key, edge)

    except Exception as ex:  # pylint: disable=W0718
        ic(ex)
        traceback.print_exc()

    ## transform graph data to a _graph of relations_
    ## NOTE(review): `tg` (and the bindings above) are assigned inside the
    ## `try` block -- if setup fails, the code below raises NameError; the
    ## demo accepts that since the traceback was already printed
    start_time = time.time()

    gor: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations(
        tg,
    )

    gor.seeds(
        debug = False,  # True
    )

    gor.construct_gor(
        debug = False,  # True
    )

    _scores: typing.Dict[ tuple, float ] = gor.get_affinity_scores(
        debug = False,  # True
    )

    duration = round(time.time() - start_time, 3)
    print(f"{duration:7.3f} sec: graph of relations")

    gor.render_gor_plt(_scores)
    plt.show()
    #sys.exit(0)

    ######################################################################
    ## stack profiler report

    if profile:
        profiler.stop()
        profiler.print()

    ## output lemma graph as JSON
    with open("lemma.json", "w", encoding = "utf-8") as fp:
        fp.write(tg.dump_lemma_graph())