Spaces:

RyanTietjen
/

Paper-Fragmentation

Build error

App Files Files Community

Paper-Fragmentation / app.py

RyanTietjen

Upload 4 files

d85b3ec verified about 1 year ago

raw

history blame

8.77 kB

	"""
	Ryan Tietjen
	Sep 2024
	Demo application for paper abstract fragmentation
	"""
	import gradio as gr
	import tensorflow as tf
	from tensorflow import keras
	from keras import layers
	from timeit import default_timer as timer
	from process_input import split_abstract
	from process_input import split_abstract_original
	from process_input import split_sentences_by_characters
	import pandas as pd
	import tensorflow_hub as hub
	from model import EmbeddingLayer
	from process_input import encode_labels


	# Sample abstract shown as a clickable example in the demo UI.
	# Each sentence sits on its own line, which is the input format the app asks for.
	example1 = """The aim of this study was to describe the electrocardiographic ( ECG ) evolutionary changes after an acute myocardial infarction ( AMI ) and to evaluate their correlation with left ventricular function and remodeling.
	The QRS complex changes after AMI have been correlated with infarct size and left ventricular function.
	By contrast , the significance of T wave changes is controversial.
	We studied 536 patients enrolled in the GISSI-3-Echo substudy who underwent ECG and echocardiographic studies at 24 to 48 h ( S1 ) , at hospital discharge ( S2 ) , at six weeks ( S3 ) and six months ( S4 ) after AMI.
	The number of Qwaves ( nQ ) and QRS quantitative score ( QRSs ) did not change over time.
	From S2 to S4 , the number of negative T waves ( nT NEG ) decreased ( p < 0.0001 ) , wall motion abnormalities ( % WMA ) improved ( p < 0.001 ) , ventricular volumes increased ( p < 0.0001 ) while ejection fraction remained stable.
	According to the T wave changes after hospital discharge , patients were divided into four groups : stable positive T waves ( group 1 , n = 35 ) , patients who showed a decrease > or = 1 in nT NEG ( group 2 , n = 361 ) , patients with no change in nT NEG ( group 3 , n = 64 ) and those with an increase > or = 1 in nT NEG ( group 4 , n = 76 ).
	The QRSs and nQ remained stable in all groups.
	Groups 3 and 4 showed less recovery in % WMA , more pronounced ventricular enlargement and progressive decline in ejection fraction than groups 1 and 2 ( interaction time x groups p < 0.0001 ).
	The analysis of serial ECG can predict postinfarct left ventricular remodeling.
	Normalization of negative T waves during the follow-up appears more strictly related to recovery of regional dysfunction than QRS changes.
	Lack of resolution and late appearance of new negative T predict unfavorable remodeling with progressive deterioration of ventricular function."""

	# Example inputs handed to the Gradio interface; only one abstract for now.
	sample_list = [example1]

	def format_non_empty_lists(objective, background, methods, results, conclusion):
	    """
	    Render the classified sentences as one text report, skipping empty categories.

	    Categories appear in the fixed order Objective, Background, Methods, Results,
	    Conclusion. Each non-empty category becomes a "<Name>:" heading followed by one
	    " - <sentence>" line per entry and a blank separator line.

	    Parameters:
	    - objective (list): Sentences classified as 'Objective'.
	    - background (list): Sentences classified as 'Background'.
	    - methods (list): Sentences classified as 'Methods'.
	    - results (list): Sentences classified as 'Results'.
	    - conclusion (list): Sentences classified as 'Conclusion'.

	    Returns:
	    - str: The formatted report with leading/trailing whitespace stripped
	      (an empty string when every list is empty).
	    """
	    sections = (
	        ('Objective', objective),
	        ('Background', background),
	        ('Methods', methods),
	        ('Results', results),
	        ('Conclusion', conclusion),
	    )

	    rendered = []
	    for heading, sentences in sections:
	        if not sentences:
	            # Empty categories contribute nothing, not even a heading.
	            continue
	        bullets = "".join(f" - {sentence}\n" for sentence in sentences)
	        # The extra newline leaves a blank line between consecutive categories.
	        rendered.append(f"{heading}:\n{bullets}\n")

	    # strip() drops the separator left after the final category.
	    return "".join(rendered).strip()

	# Process-wide cache: the Keras model is read from disk once and then reused by
	# every request instead of being reloaded on each call.
	_cached_model = None


	def _get_model():
	    """Return the sentence classifier, loading "20k_5_epochs.keras" on first use only."""
	    global _cached_model
	    if _cached_model is None:
	        # NOTE(review): an EmbeddingLayer *instance* is registered as the custom
	        # object, exactly as before; confirm whether the class itself is expected.
	        embed_layer = EmbeddingLayer()
	        _cached_model = tf.keras.models.load_model(
	            "20k_5_epochs.keras", custom_objects={'EmbeddingLayer': embed_layer}
	        )
	    return _cached_model


	def fragment_single_abstract(abstract):
	    """
	    Classify every sentence of an abstract and return the sentences grouped by category.

	    Steps:
	    1. Split the abstract twice: `split_abstract_original` supplies the sentences
	       that are shown to the user, `split_abstract` supplies the rows fed to the
	       model (columns "text", "target", "line_number", "total_lines").
	    2. Build the model inputs: one-hot line number (depth 15), one-hot total line
	       count (depth 20), the sentence text, and the per-character form produced by
	       `split_sentences_by_characters`.
	    3. Predict a class index per sentence and file the matching original sentence
	       under Objective, Methods, Results, Conclusion or Background.
	    4. Format the non-empty categories with `format_non_empty_lists`.

	    Parameters:
	    abstract (str): The abstract text to process.

	    Returns:
	    tuple: A tuple containing two elements:
	    - str: The formatted, categorised abstract (only non-empty categories are
	      listed). Empty string when `abstract` is None or blank.
	    - float: Seconds elapsed while processing, measured with `timeit.default_timer`.

	    Example:
	    ```python
	    abstract_text = "This study aims to evaluate the effectiveness of..."
	    categorized_abstract, processing_time = fragment_single_abstract(abstract_text)
	    print("Categorized Abstract:", categorized_abstract)
	    print("Processing Time:", processing_time)
	    ```

	    Note:
	    - Requires the Keras model file "20k_5_epochs.keras" and the custom
	      `EmbeddingLayer` to be loadable. The model is loaded on the first call and
	      cached, so the first call's reported time includes the load.
	    """
	    start_time = timer()

	    # Nothing to classify. Presumably the splitters yield no rows for blank input,
	    # in which case the column lookups below would fail; return an empty result.
	    if not abstract or not abstract.strip():
	        return "", timer() - start_time

	    original_abstract = split_abstract_original(abstract)
	    df_original = pd.DataFrame(original_abstract)
	    sentences_original = df_original["text"].tolist()

	    abstract_split = split_abstract(abstract)
	    df = pd.DataFrame(abstract_split)
	    sentences = df["text"].tolist()
	    labels = encode_labels(df["target"])

	    objective = []
	    background = []
	    methods = []
	    results = []
	    conclusion = []

	    model = _get_model()

	    data_by_character = split_sentences_by_characters(sentences)
	    line_numbers = tf.one_hot(df["line_number"].to_numpy(), depth=15)
	    total_line_numbers = tf.one_hot(df["total_lines"].to_numpy(), depth=20)

	    sentences_dataset = tf.data.Dataset.from_tensor_slices((line_numbers, total_line_numbers, sentences, data_by_character))
	    labels_dataset = tf.data.Dataset.from_tensor_slices(labels)
	    dataset = tf.data.Dataset.zip((sentences_dataset, labels_dataset)).batch(32).prefetch(tf.data.AUTOTUNE)

	    predictions = tf.argmax(model.predict(dataset), axis=1)

	    # Class index -> destination list.
	    # NOTE(review): this order must match the label encoding used in training;
	    # confirm against encode_labels.
	    category_by_class = {
	        0: objective,
	        1: methods,
	        2: results,
	        3: conclusion,
	        4: background,
	    }
	    for i, prediction in enumerate(predictions):
	        # Indices outside 0-4 are ignored, as before.
	        bucket = category_by_class.get(int(prediction))
	        if bucket is not None:
	            bucket.append(sentences_original[i])

	    end_time = timer()

	    return format_non_empty_lists(objective, background, methods, results, conclusion), end_time - start_time



	# Heading and Markdown description rendered at the top of the Gradio page.
	title = "Paper Abstract Fragmentation With TensorFlow by Ryan Tietjen"
	# The PubMed 200k RCT link destination is written without surrounding quotes:
	# quotes inside the parentheses are not a valid Markdown link target.
	description = """
	This app will take the abstract of a paper and break it down into five categories: objective, background, methods, results, and conclusion.
	The dataset used can be found in the [PubMed 200k RCT](https://arxiv.org/abs/1710.06071) and in [this repo](https://github.com/Franck-Dernoncourt/pubmed-rct). The model architecture
	was based off of ["Neural Networks for Joint Sentence Classification in Medical Paper Abstracts."](https://arxiv.org/pdf/1612.05251)

	This project achieved a testing accuracy of 88.12% and a F1 score of 87.92%. For the whole project, please visit [my GitHub](https://github.com/RyanTietjen/Paper-Fragmentation).

	How to use:

	-Paste the given abstract into the box below.

	-Make sure to separate each sentence by a new line (this helps avoid ambiguity).

	-Click submit, and allow the model to run!
	"""

	# Input widget: a multi-line box the user pastes the abstract into.
	abstract_input = gr.Textbox(lines=10, placeholder="Enter abstract here...")

	# Output widgets, in the order fragment_single_abstract returns its values:
	# the formatted text first, then the elapsed seconds.
	result_outputs = [
	    gr.Textbox(label="Fragmented Abstract"),
	    gr.Number(label="Time to process (s)"),
	]

	# Wire the classifier function to the widgets and page text.
	demo = gr.Interface(
	    fn=fragment_single_abstract,
	    inputs=abstract_input,
	    outputs=result_outputs,
	    examples=sample_list,
	    title=title,
	    description=description,
	)

	# Start the web app without requesting a public share link.
	demo.launch(share=False)