Spaces:
Build error
Build error
| from transformers import LongformerTokenizer, EncoderDecoderModel | |
| from .base_single_doc_model import SingleDocSummModel | |
class LongformerModel(SingleDocSummModel):
    """Single-document summarizer built on a Longformer2Roberta checkpoint.

    Instantiating the class loads the
    ``patrickvonplaten/longformer2roberta-cnn_dailymail-fp16`` encoder-decoder
    weights and the ``allenai/longformer-base-4096`` tokenizer.
    """

    # static variables
    model_name = "Longformer"
    is_extractive = False
    is_neural = True

    def __init__(self):
        """Load the pretrained encoder-decoder model and its tokenizer."""
        super().__init__()
        self.model = EncoderDecoderModel.from_pretrained(
            "patrickvonplaten/longformer2roberta-cnn_dailymail-fp16"
        )
        self.tokenizer = LongformerTokenizer.from_pretrained(
            "allenai/longformer-base-4096"
        )

    def summarize(self, corpus, queries=None):
        """Summarize every document in ``corpus``.

        Args:
            corpus: Iterable of documents; each item is passed to
                ``summarize_single``.
            queries: Only forwarded to ``assert_summ_input_type`` for input
                validation; it is not used when generating summaries here.

        Returns:
            A list holding one decoded summary per document, in input order.
        """
        self.assert_summ_input_type(corpus, queries)
        return [self.summarize_single(doc) for doc in corpus]

    def summarize_single(self, document):
        """Generate a summary for a single document.

        Args:
            document: Text handed to the tokenizer.

        Returns:
            The decoded summary string with special tokens removed.
        """
        # Encode as PyTorch tensors, truncating inputs longer than 4096 tokens.
        # `return_length=True` adds the `length` field reported below.
        tokenized_sequence = self.tokenizer(
            document,
            return_tensors="pt",
            return_length=True,
            truncation=True,
            max_length=4096,
        )
        print(
            f"Longformer model: processing document of {tokenized_sequence.length} tokens"
        )
        input_ids = tokenized_sequence.input_ids
        # Only one document is encoded per call, so the first generated
        # sequence (output_ids[0]) is the one decoded.
        output_ids = self.model.generate(input_ids)
        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)

    @classmethod
    def show_capability(cls) -> None:
        """Print the basic model description followed by Longformer details.

        Declared as a classmethod so it can be called on the class itself
        without loading the pretrained weights; calling it on an instance
        works as well.
        """
        # generate_basic_description is not defined in this class; presumably
        # it is inherited from SingleDocSummModel.
        basic_description = cls.generate_basic_description()
        # NOTE(review): the text below lists `corpus` under "Initialization
        # arguments", but __init__ in this class takes no arguments. The string
        # is left unchanged; confirm the intended wording.
        more_details = (
            "A Longformer2Roberta model finetuned on CNN-DM dataset for summarization.\n\n"
            "Strengths:\n - Correctly handles longer (> 2000 tokens) corpus.\n\n"
            "Weaknesses:\n - Less accurate on contexts outside training domain.\n\n"
            "Initialization arguments:\n "
            " - `corpus`: Unlabelled corpus of documents.\n"
        )
        print(f"{basic_description} \n {'#'*20} \n {more_details}")