Upload model
Browse files- README.md +3 -3
- config.json +4 -0
- create_section_files.py +150 -0
- modelling_cxrmate_ed.py +33 -9
- section_parser.py +281 -0
README.md
CHANGED
|
@@ -1,4 +1,7 @@
|
|
| 1 |
---
|
|
|
|
|
|
|
|
|
|
| 2 |
library_name: transformers
|
| 3 |
tags:
|
| 4 |
- chest X-ray report generation
|
|
@@ -16,9 +19,6 @@ tags:
|
|
| 16 |
- patient records
|
| 17 |
- mimic-cxr
|
| 18 |
- mimic-iv-ed
|
| 19 |
-
license: apache-2.0
|
| 20 |
-
language:
|
| 21 |
-
- en
|
| 22 |
---
|
| 23 |
|
| 24 |
# CXRMate-ED: The Impact of Auxiliary Patient Data on Automated Chest X-Ray Report Generation and How to Incorporate It
|
|
|
|
| 1 |
---
|
| 2 |
+
language:
|
| 3 |
+
- en
|
| 4 |
+
license: apache-2.0
|
| 5 |
library_name: transformers
|
| 6 |
tags:
|
| 7 |
- chest X-ray report generation
|
|
|
|
| 19 |
- patient records
|
| 20 |
- mimic-cxr
|
| 21 |
- mimic-iv-ed
|
|
|
|
|
|
|
|
|
|
| 22 |
---
|
| 23 |
|
| 24 |
# CXRMate-ED: The Impact of Auxiliary Patient Data on Automated Chest X-Ray Report Generation and How to Incorporate It
|
config.json
CHANGED
|
@@ -85,6 +85,10 @@
|
|
| 85 |
"rms_norm_eps": 1e-06,
|
| 86 |
"rope_scaling": null,
|
| 87 |
"rope_theta": 10000.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
"sep_token_id": null,
|
| 89 |
"suppress_tokens": null,
|
| 90 |
"task_specific_params": null,
|
|
|
|
| 85 |
"rms_norm_eps": 1e-06,
|
| 86 |
"rope_scaling": null,
|
| 87 |
"rope_theta": 10000.0,
|
| 88 |
+
"section_ids": [
|
| 89 |
+
12,
|
| 90 |
+
13
|
| 91 |
+
],
|
| 92 |
"sep_token_id": null,
|
| 93 |
"suppress_tokens": null,
|
| 94 |
"task_specific_params": null,
|
create_section_files.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
+
|
| 7 |
+
# local folder import
|
| 8 |
+
from .section_parser import custom_mimic_cxr_rules, section_text
|
| 9 |
+
|
| 10 |
+
|
def list_rindex(l, s):
    """Return the index of the *last* element of ``l`` equal to ``s``.

    Adapted from:
    https://github.com/MIT-LCP/mimic-cxr/blob/master/txt/create_section_files.py

    Args:
        l: list to search.
        s: element to locate.

    Returns:
        Index of the last occurrence of ``s`` in ``l``.

    Raises:
        ValueError: if ``s`` does not occur in ``l``.
    """
    # Find the first hit in the reversed list, then map it back to a
    # forward index: reversed position r corresponds to len(l) - 1 - r.
    return len(l) - 1 - l[::-1].index(s)
| 18 |
+
|
| 19 |
+
|
def create_section_files(reports_path, output_path, no_split):
    """Section MIMIC-CXR free-text reports and write the results to CSV.

    Modification of:
    https://github.com/MIT-LCP/mimic-cxr/blob/master/txt/create_section_files.py

    Args:
        reports_path: root of the MIMIC-CXR ``files`` directory, containing
            the ``p1x`` group folders.
        output_path: directory for the output CSV files; created (including
            parents) if it does not exist.
        no_split: if True, write every report to a single
            ``mimic_cxr_sections.csv``; otherwise write chunked files of
            ~10k reports each.
    """
    reports_path = Path(reports_path)
    output_path = Path(output_path)

    # Create missing parents too, and tolerate a pre-existing directory.
    output_path.mkdir(parents=True, exist_ok=True)

    # not all reports can be automatically sectioned
    # we load in some dictionaries which have manually determined sections
    custom_section_names, custom_indices = custom_mimic_cxr_rules()

    # get all higher up folders (p00, p01, etc)
    p_grp_folders = os.listdir(reports_path)
    p_grp_folders = [p for p in p_grp_folders
                     if p.startswith('p') and len(p) == 3]
    p_grp_folders.sort()

    # patient_studies will hold the text for use in NLP labeling
    patient_studies = []

    # study_sections will have an element for each study
    # this element will be a list, each element having text for a specific section
    study_sections = []
    for p_grp in p_grp_folders:
        # get patient folders, usually around ~6k per group folder
        cxr_path = reports_path / p_grp
        p_folders = os.listdir(cxr_path)
        p_folders = [p for p in p_folders if p.startswith('p')]
        p_folders.sort()

        # For each patient in this grouping folder
        print(p_grp)
        for p in tqdm(p_folders):
            patient_path = cxr_path / p

            # get the filename for all their free-text reports
            studies = os.listdir(patient_path)
            studies = [s for s in studies
                       if s.endswith('.txt') and s.startswith('s')]

            for s in studies:
                # load in the free-text report
                with open(patient_path / s, 'r') as fp:
                    text = ''.join(fp.readlines())

                # get study string name without the txt extension
                s_stem = s[0:-4]

                # custom rules for some poorly formatted reports
                if s_stem in custom_indices:
                    idx = custom_indices[s_stem]
                    patient_studies.append([s_stem, text[idx[0]:idx[1]]])
                    continue

                # split text into sections
                sections, section_names, section_idx = section_text(text)

                # check to see if this has mis-named sections
                # e.g. sometimes the impression is in the comparison section
                if s_stem in custom_section_names:
                    sn = custom_section_names[s_stem]
                    idx = list_rindex(section_names, sn)
                    patient_studies.append([s_stem, sections[idx].strip()])
                    continue

                # grab the *last* section with the given title
                # prioritizes impression > findings, etc.

                # "last_paragraph" is text up to the end of the report
                # many reports are simple, and have a single section
                # header followed by a few paragraphs
                # these paragraphs are grouped into section "last_paragraph"

                # note also comparison seems unusual but if no other sections
                # exist the radiologist has usually written the report
                # in the comparison section
                idx = -1
                for sn in ('impression', 'findings', 'indication', 'history', 'last_paragraph', 'comparison'):
                    if sn in section_names:
                        idx = list_rindex(section_names, sn)
                        break

                if idx == -1:
                    # we didn't find any sections we can use :(
                    patient_studies.append([s_stem, ''])
                    print(f'no impression/findings: {patient_path / s}')
                else:
                    # store the text of the conclusion section
                    patient_studies.append([s_stem, sections[idx].strip()])

                study_sectioned = [s_stem]
                for sn in ('impression', 'findings', 'indication', 'history', 'last_paragraph', 'comparison'):
                    if sn in section_names:
                        idx = list_rindex(section_names, sn)
                        study_sectioned.append(sections[idx].strip())
                    else:
                        study_sectioned.append(None)
                study_sections.append(study_sectioned)

    # write distinct files to facilitate modular processing
    if len(patient_studies) > 0:
        # write out a single CSV with the sections
        # newline='' is the documented way to open files for the csv module
        with open(output_path / 'mimic_cxr_sectioned.csv', 'w', newline='') as fp:
            csvwriter = csv.writer(fp)
            # write header
            csvwriter.writerow(['study', 'impression', 'findings', 'indication', 'history', 'last_paragraph', 'comparison'])
            for row in study_sections:
                csvwriter.writerow(row)

        if no_split:
            # write all the reports out to a single file
            with open(output_path / 'mimic_cxr_sections.csv', 'w', newline='') as fp:
                csvwriter = csv.writer(fp)
                for row in patient_studies:
                    csvwriter.writerow(row)
        else:
            # write ~22 files with ~10k reports each
            n = 0
            jmp = 10000

            while n < len(patient_studies):
                n_fn = n // jmp
                with open(output_path / f'mimic_cxr_{n_fn:02d}.csv', 'w', newline='') as fp:
                    csvwriter = csv.writer(fp)
                    for row in patient_studies[n:n+jmp]:
                        csvwriter.writerow(row)
                n += jmp
modelling_cxrmate_ed.py
CHANGED
|
@@ -1,12 +1,8 @@
|
|
| 1 |
-
import csv
|
| 2 |
-
import functools
|
| 3 |
import math
|
| 4 |
import os
|
| 5 |
-
import re
|
| 6 |
-
from collections import OrderedDict
|
| 7 |
from glob import glob
|
| 8 |
from pathlib import Path
|
| 9 |
-
from typing import
|
| 10 |
|
| 11 |
import duckdb
|
| 12 |
import pandas as pd
|
|
@@ -24,10 +20,11 @@ from transformers.models.vision_encoder_decoder.configuration_vision_encoder_dec
|
|
| 24 |
)
|
| 25 |
from transformers.utils import logging
|
| 26 |
|
|
|
|
| 27 |
from .dataset import StudyIDEDStayIDSubset
|
| 28 |
from .modelling_uniformer import MultiUniFormerWithProjectionHead
|
| 29 |
from .records import EDCXRSubjectRecords
|
| 30 |
-
from .tables import ed_module_tables
|
| 31 |
|
| 32 |
logger = logging.get_logger(__name__)
|
| 33 |
|
|
@@ -940,7 +937,7 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
|
|
| 940 |
"Please download them using wget -r -N -c -np --reject dcm --user <username> --ask-password https://physionet.org/files/mimic-cxr/2.0.0/"""
|
| 941 |
|
| 942 |
print('Extracting sections from reports...')
|
| 943 |
-
|
| 944 |
reports_path=os.path.join(physionet_dir, 'mimic-cxr', '2.0.0', 'files'),
|
| 945 |
output_path=sectioned_dir,
|
| 946 |
no_split=True,
|
|
@@ -1009,8 +1006,8 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
|
|
| 1009 |
|
| 1010 |
connect.sql(f"CREATE OR REPLACE TABLE mimic_cxr AS SELECT * FROM df")
|
| 1011 |
|
| 1012 |
-
# Create lookup tables
|
| 1013 |
-
for k, v in ed_module_tables.items():
|
| 1014 |
if v.load and v.index_columns:
|
| 1015 |
start_idx = 0
|
| 1016 |
for i in v.index_columns_source:
|
|
@@ -1127,3 +1124,30 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
|
|
| 1127 |
f'No. of training dicom_ids, study_ids, & subject_ids: {dataset.num_dicom_ids},',
|
| 1128 |
f'{dataset.num_study_ids}, & {dataset.num_subject_ids}.',
|
| 1129 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import math
|
| 2 |
import os
|
|
|
|
|
|
|
| 3 |
from glob import glob
|
| 4 |
from pathlib import Path
|
| 5 |
+
from typing import Optional, Tuple, Union
|
| 6 |
|
| 7 |
import duckdb
|
| 8 |
import pandas as pd
|
|
|
|
| 20 |
)
|
| 21 |
from transformers.utils import logging
|
| 22 |
|
| 23 |
+
from .create_section_files import create_section_files
|
| 24 |
from .dataset import StudyIDEDStayIDSubset
|
| 25 |
from .modelling_uniformer import MultiUniFormerWithProjectionHead
|
| 26 |
from .records import EDCXRSubjectRecords
|
| 27 |
+
from .tables import ed_module_tables, mimic_cxr_tables
|
| 28 |
|
| 29 |
logger = logging.get_logger(__name__)
|
| 30 |
|
|
|
|
| 937 |
"Please download them using wget -r -N -c -np --reject dcm --user <username> --ask-password https://physionet.org/files/mimic-cxr/2.0.0/"""
|
| 938 |
|
| 939 |
print('Extracting sections from reports...')
|
| 940 |
+
create_section_files(
|
| 941 |
reports_path=os.path.join(physionet_dir, 'mimic-cxr', '2.0.0', 'files'),
|
| 942 |
output_path=sectioned_dir,
|
| 943 |
no_split=True,
|
|
|
|
| 1006 |
|
| 1007 |
connect.sql(f"CREATE OR REPLACE TABLE mimic_cxr AS SELECT * FROM df")
|
| 1008 |
|
| 1009 |
+
# Create lookup tables:
|
| 1010 |
+
for k, v in (ed_module_tables | mimic_cxr_tables).items():
|
| 1011 |
if v.load and v.index_columns:
|
| 1012 |
start_idx = 0
|
| 1013 |
for i in v.index_columns_source:
|
|
|
|
| 1124 |
f'No. of training dicom_ids, study_ids, & subject_ids: {dataset.num_dicom_ids},',
|
| 1125 |
f'{dataset.num_study_ids}, & {dataset.num_subject_ids}.',
|
| 1126 |
)
|
| 1127 |
+
return dataset
|
| 1128 |
+
|
@staticmethod
def collate_fn(batch):
    """Collate a list of per-study dicts into a single padded batch dict.

    Every key that appears in any element is propagated to all elements
    (missing entries become None), then selected tensor-valued entries are
    padded to a common length along the sequence dimension.

    Args:
        batch: list of dicts; each dict maps feature names to tensors.
            'images' is assumed present in every element — TODO confirm.

    Returns:
        Dict mapping each key to a list, or to a padded tensor for
        'images', '*index_value_feats*', '*token_type_ids*', '*mask*',
        and index-value time-delta entries.
    """
    # Union of keys across all elements, so no feature is dropped.
    keys = set().union(*(d.keys() for d in batch))
    # Transpose list-of-dicts -> dict-of-lists; setdefault fills gaps
    # with None (and mutates the input dicts as a side effect).
    batch = {j: [i.setdefault(j, None) for i in batch] for j in keys}
    batch['images'] = torch.nn.utils.rnn.pad_sequence(batch['images'], batch_first=True, padding_value=0.0)

    for k in keys:
        if 'index_value_feats' in k:

            # Feature width taken from the first non-missing element;
            # assumes at least one element carries this key — TODO confirm.
            total_indices = next(i for i in batch[k] if i is not None).shape[-1]
            # Missing entries become empty (0, total_indices) tensors so
            # pad_sequence can stack them.
            batch[k] = [i if i is not None else torch.empty(0, total_indices) for i in batch[k]]
            batch[k] = torch.nn.utils.rnn.pad_sequence(batch[k], batch_first=True, padding_value=-1)  # Pad value of -1 is not ideal. Need to use something else.
            # Matching token-type-id entries are padded with 0.
            token_type_id_name = k.replace('_feats', '_token_type_ids')
            batch[token_type_id_name] = [i if i is not None else torch.empty(0, dtype=torch.long) for i in batch[token_type_id_name]]
            batch[token_type_id_name] = torch.nn.utils.rnn.pad_sequence(
                batch[token_type_id_name], batch_first=True, padding_value=0,
            )
            # Mask is 1 wherever any feature value differs from the -1 pad.
            mask_name = k.replace('_feats', '_mask')
            batch[mask_name] = (batch[k] != -1).any(dim=-1).int()

        if 'time_delta' in k and 'index_value' in k:
            # Time deltas are (seq, 1); missing entries become empty.
            batch[k] = [i if i is not None else torch.empty(0, 1) for i in batch[k]]
            batch[k] = torch.nn.utils.rnn.pad_sequence(batch[k], batch_first=True, padding_value=0)

    return batch
section_parser.py
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
|
def section_text(text):
    """Split a radiology report into sections.

    Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/LICENSE

    Assumes text is in a radiology report format, e.g.:

        COMPARISON: Chest radiograph dated XYZ.

        IMPRESSION: ABC...

    Given text like this, it will output text from each section,
    where the section type is determined by the all caps header.

    Returns a three element tuple:
        sections - list containing the text of each section
        section_names - a normalized version of the section name
        section_idx - list of start indices of the text in the section
    """
    # Header pattern: a newline + space, then an ALL-CAPS phrase ending
    # with ':' and whitespace.
    p_section = re.compile(
        r'\n ([A-Z ()/,-]+):\s', re.DOTALL)

    sections = list()
    section_names = list()
    section_idx = list()

    idx = 0
    s = p_section.search(text, idx)

    if s:
        # Everything before the first header is the preamble.
        sections.append(text[0:s.start(1)])
        section_names.append('preamble')
        section_idx.append(0)

        while s:
            current_section = s.group(1).lower()
            # get the start of the text for this section
            idx_start = s.end()
            # skip past the first newline to avoid some bad parses
            idx_skip = text[idx_start:].find('\n')
            if idx_skip == -1:
                idx_skip = 0

            s = p_section.search(text, idx_start + idx_skip)

            if s is None:
                idx_end = len(text)
            else:
                idx_end = s.start()

            sections.append(text[idx_start:idx_end])
            section_names.append(current_section)
            section_idx.append(idx_start)

    else:
        # No headers at all: treat the whole text as one section.
        sections.append(text)
        section_names.append('full report')
        section_idx.append(0)

    section_names = normalize_section_names(section_names)

    # remove empty sections
    # this handles when the report starts with a finding-like statement
    # .. but this statement is not a section, more like a report title
    # e.g. p10/p10103318/s57408307
    # CHEST, PA LATERAL:
    #
    # INDICATION: This is the actual section ....
    # it also helps when there are multiple findings sections
    # usually one is empty
    # (iterate in reverse so pops do not shift pending indices)
    for i in reversed(range(len(section_names))):
        if section_names[i] in ('impression', 'findings'):
            if sections[i].strip() == '':
                sections.pop(i)
                section_names.pop(i)
                section_idx.pop(i)

    if ('impression' not in section_names) & ('findings' not in section_names):
        # create a new section for the final paragraph
        # ('\n \n' is the paragraph separator used in these reports)
        if '\n \n' in sections[-1]:
            sections.append('\n \n'.join(sections[-1].split('\n \n')[1:]))
            sections[-2] = sections[-2].split('\n \n')[0]
            section_names.append('last_paragraph')
            section_idx.append(section_idx[-1] + len(sections[-2]))

    return sections, section_names, section_idx
| 92 |
+
|
| 93 |
+
|
def normalize_section_names(section_names):
    """Map raw report headers onto a canonical set of section names.

    Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/LICENSE

    Matching proceeds in three stages per header: an exact lookup in a
    frequency table (which also fixes common typos), a substring match
    against the main canonical names, and finally a regex match for
    headers that merely describe the imaging views (treated as findings).
    Headers that match nothing are left as-is (lower-cased and stripped).
    Returns a new list; the input list is not modified.
    """
    # Case-fold and trim every header before matching.
    section_names = [s.lower().strip() for s in section_names]

    # Exact-match table: raw header -> canonical name.
    # Trailing comments give the approximate corpus frequency.
    frequent_sections = {
        "preamble": "preamble",  # 227885
        "impression": "impression",  # 187759
        "comparison": "comparison",  # 154647
        "indication": "indication",  # 153730
        "findings": "findings",  # 149842
        "examination": "examination",  # 94094
        "technique": "technique",  # 81402
        "history": "history",  # 45624
        "comparisons": "comparison",  # 8686
        "clinical history": "history",  # 7121
        "reason for examination": "indication",  # 5845
        "notification": "notification",  # 5749
        "reason for exam": "indication",  # 4430
        "clinical information": "history",  # 4024
        "exam": "examination",  # 3907
        "clinical indication": "indication",  # 1945
        "conclusion": "impression",  # 1802
        "chest, two views": "findings",  # 1735
        "recommendation(s)": "recommendations",  # 1700
        "type of examination": "examination",  # 1678
        "reference exam": "comparison",  # 347
        "patient history": "history",  # 251
        "addendum": "addendum",  # 183
        "comparison exam": "comparison",  # 163
        "date": "date",  # 108
        "comment": "comment",  # 88
        "findings and impression": "impression",  # 87
        "wet read": "wet read",  # 83
        "comparison film": "comparison",  # 79
        "recommendations": "recommendations",  # 72
        "findings/impression": "impression",  # 47
        "pfi": "history",
        'recommendation': 'recommendations',
        'wetread': 'wet read',
        'ndication': 'impression',  # 1
        'impresson': 'impression',  # 2
        'imprression': 'impression',  # 1
        'imoression': 'impression',  # 1
        'impressoin': 'impression',  # 1
        'imprssion': 'impression',  # 1
        'impresion': 'impression',  # 1
        'imperssion': 'impression',  # 1
        'mpression': 'impression',  # 1
        'impession': 'impression',  # 3
        'findings/ impression': 'impression',  # ,1
        'finding': 'findings',  # ,8
        'findins': 'findings',
        'findindgs': 'findings',  # ,1
        'findgings': 'findings',  # ,1
        'findngs': 'findings',  # ,1
        'findnings': 'findings',  # ,1
        'finidngs': 'findings',  # ,2
        'idication': 'indication',  # ,1
        'reference findings': 'findings',  # ,1
        'comparision': 'comparison',  # ,2
        'comparsion': 'comparison',  # ,1
        'comparrison': 'comparison',  # ,1
        'comparisions': 'comparison',  # ,1
    }

    # Phrases describing the imaging views; such headers are really titles
    # for the whole study, i.e. equivalent to a findings section.
    view_phrases = [
        'chest',
        'portable',
        'pa and lateral',
        'lateral and pa',
        'ap and lateral',
        'lateral and ap',
        'frontal and',
        'two views',
        'frontal view',
        'pa view',
        'ap view',
        'one view',
        'lateral view',
        'bone window',
        'frontal upright',
        'frontal semi-upright',
        'ribs',
        'pa and lat',
    ]
    view_pattern = re.compile('({})'.format('|'.join(view_phrases)))

    canonical = [
        'impression', 'findings', 'history', 'comparison',
        'addendum',
    ]

    for pos, header in enumerate(section_names):
        # Stage 1: exact lookup (also catches frequent typos).
        if header in frequent_sections:
            section_names[pos] = frequent_sections[header]
            continue

        # Stage 2: canonical name appearing anywhere inside the header.
        hit = next((name for name in canonical if name in header), None)
        if hit is not None:
            section_names[pos] = hit
            continue

        # Stage 3: headers that just describe the views count as findings.
        if view_pattern.search(header) is not None:
            section_names[pos] = 'findings'

    return section_names
| 212 |
+
|
| 213 |
+
|
def custom_mimic_cxr_rules():
    """Return manual overrides for reports that cannot be auto-sectioned.

    Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/LICENSE

    Returns:
        A two-element tuple:
            custom_section_names - dict mapping a study id (e.g. 's50913680')
                to the name of the section that actually holds the report's
                conclusion (its header is mislabeled in the original text).
            custom_indices - dict mapping a study id to a [start, end]
                character span of the report text to use directly;
                [0, 0] marks reports with no usable findings at all.
    """
    # Studies whose conclusion lives under a mislabeled section header.
    custom_section_names = {
        's50913680': 'recommendations',  # files/p11/p11851243/s50913680.txt
        's59363654': 'examination',  # files/p12/p12128253/s59363654.txt
        's59279892': 'technique',  # files/p13/p13150370/s59279892.txt
        's59768032': 'recommendations',  # files/p13/p13249077/s59768032.txt
        's57936451': 'indication',  # files/p14/p14325424/s57936451.txt
        's50058765': 'indication',  # files/p14/p14731346/s50058765.txt
        's53356173': 'examination',  # files/p15/p15898350/s53356173.txt
        's53202765': 'technique',  # files/p16/p16076182/s53202765.txt
        's50808053': 'technique',  # files/p16/p16631485/s50808053.txt
        's51966317': 'indication',  # files/p10/p10817099/s51966317.txt
        's50743547': 'examination',  # files/p11/p11388341/s50743547.txt
        's56451190': 'note',  # files/p11/p11842879/s56451190.txt
        's59067458': 'recommendations',  # files/p11/p11984647/s59067458.txt
        's59215320': 'examination',  # files/p12/p12408912/s59215320.txt
        's55124749': 'indication',  # files/p12/p12428492/s55124749.txt
        's54365831': 'indication',  # files/p13/p13876470/s54365831.txt
        's59087630': 'recommendations',  # files/p14/p14267880/s59087630.txt
        's58157373': 'recommendations',  # files/p15/p15032392/s58157373.txt
        's56482935': 'recommendations',  # files/p15/p15388421/s56482935.txt
        's58375018': 'recommendations',  # files/p15/p15505556/s58375018.txt
        's54654948': 'indication',  # files/p17/p17090359/s54654948.txt
        's55157853': 'examination',  # files/p18/p18975498/s55157853.txt
        's51491012': 'history',  # files/p19/p19314266/s51491012.txt

    }

    # Studies whose usable text is a fixed [start, end] character span.
    custom_indices = {
        's50525523': [201, 349],  # files/p10/p10602608/s50525523.txt
        's57564132': [233, 554],  # files/p10/p10637168/s57564132.txt
        's59982525': [313, 717],  # files/p11/p11989982/s59982525.txt
        's53488209': [149, 475],  # files/p12/p12458657/s53488209.txt
        's54875119': [234, 988],  # files/p13/p13687044/s54875119.txt
        's50196495': [59, 399],  # files/p13/p13894879/s50196495.txt
        's56579911': [59, 218],  # files/p15/p15394326/s56579911.txt
        's52648681': [292, 631],  # files/p15/p15666238/s52648681.txt
        's59889364': [172, 453],  # files/p15/p15835529/s59889364.txt
        's53514462': [73, 377],  # files/p16/p16297706/s53514462.txt
        's59505494': [59, 450],  # files/p16/p16730991/s59505494.txt
        's53182247': [59, 412],  # files/p16/p16770442/s53182247.txt
        's51410602': [47, 320],  # files/p17/p17069955/s51410602.txt
        's56412866': [522, 822],  # files/p17/p17612000/s56412866.txt
        's54986978': [59, 306],  # files/p17/p17912487/s54986978.txt
        's59003148': [262, 505],  # files/p17/p17916384/s59003148.txt
        's57150433': [61, 394],  # files/p18/p18335791/s57150433.txt
        's56760320': [219, 457],  # files/p18/p18418794/s56760320.txt
        's59562049': [158, 348],  # files/p18/p18502016/s59562049.txt
        's52674888': [145, 296],  # files/p19/p19381919/s52674888.txt
        's55258338': [192, 568],  # files/p13/p13719117/s55258338.txt
        's59330497': [140, 655],  # files/p15/p15479218/s59330497.txt
        's52119491': [179, 454],  # files/p17/p17959278/s52119491.txt
        # below have no findings at all in the entire report
        's58235663': [0, 0],  # files/p11/p11573679/s58235663.txt
        's50798377': [0, 0],  # files/p12/p12632853/s50798377.txt
        's54168089': [0, 0],  # files/p14/p14463099/s54168089.txt
        's53071062': [0, 0],  # files/p15/p15774521/s53071062.txt
        's56724958': [0, 0],  # files/p16/p16175671/s56724958.txt
        's54231141': [0, 0],  # files/p16/p16312859/s54231141.txt
        's53607029': [0, 0],  # files/p17/p17603668/s53607029.txt
        's52035334': [0, 0],  # files/p19/p19349312/s52035334.txt
    }

    return custom_section_names, custom_indices