Upload model
Browse files- README.md +3 -3
- config.json +4 -0
- create_section_files.py +150 -0
- modelling_cxrmate_ed.py +33 -9
- section_parser.py +281 -0
README.md
CHANGED
|
@@ -1,4 +1,7 @@
|
|
| 1 |
---
|
|
|
|
|
|
|
|
|
|
| 2 |
library_name: transformers
|
| 3 |
tags:
|
| 4 |
- chest X-ray report generation
|
|
@@ -16,9 +19,6 @@ tags:
|
|
| 16 |
- patient records
|
| 17 |
- mimic-cxr
|
| 18 |
- mimic-iv-ed
|
| 19 |
-
license: apache-2.0
|
| 20 |
-
language:
|
| 21 |
-
- en
|
| 22 |
---
|
| 23 |
|
| 24 |
# CXRMate-ED: The Impact of Auxiliary Patient Data on Automated Chest X-Ray Report Generation and How to Incorporate It
|
|
|
|
| 1 |
---
|
| 2 |
+
language:
|
| 3 |
+
- en
|
| 4 |
+
license: apache-2.0
|
| 5 |
library_name: transformers
|
| 6 |
tags:
|
| 7 |
- chest X-ray report generation
|
|
|
|
| 19 |
- patient records
|
| 20 |
- mimic-cxr
|
| 21 |
- mimic-iv-ed
|
|
|
|
|
|
|
|
|
|
| 22 |
---
|
| 23 |
|
| 24 |
# CXRMate-ED: The Impact of Auxiliary Patient Data on Automated Chest X-Ray Report Generation and How to Incorporate It
|
config.json
CHANGED
|
@@ -85,6 +85,10 @@
|
|
| 85 |
"rms_norm_eps": 1e-06,
|
| 86 |
"rope_scaling": null,
|
| 87 |
"rope_theta": 10000.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
"sep_token_id": null,
|
| 89 |
"suppress_tokens": null,
|
| 90 |
"task_specific_params": null,
|
|
|
|
| 85 |
"rms_norm_eps": 1e-06,
|
| 86 |
"rope_scaling": null,
|
| 87 |
"rope_theta": 10000.0,
|
| 88 |
+
"section_ids": [
|
| 89 |
+
12,
|
| 90 |
+
13
|
| 91 |
+
],
|
| 92 |
"sep_token_id": null,
|
| 93 |
"suppress_tokens": null,
|
| 94 |
"task_specific_params": null,
|
create_section_files.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
+
|
| 7 |
+
# local folder import
|
| 8 |
+
from .section_parser import custom_mimic_cxr_rules, section_text
|
| 9 |
+
|
| 10 |
+
|
def list_rindex(l, s):
    """Return the index of the *last* element of ``l`` equal to ``s``.

    Adapted from:
    https://github.com/MIT-LCP/mimic-cxr/blob/master/txt/create_section_files.py

    Args:
        l: list to search.
        s: element to locate.

    Returns:
        Index of the last occurrence of ``s`` in ``l``.

    Raises:
        ValueError: if ``s`` does not occur in ``l``.
    """
    # Find the first hit in the reversed list, then map it back to a
    # forward index: reversed position r corresponds to len(l) - 1 - r.
    return len(l) - 1 - l[::-1].index(s)
| 18 |
+
|
| 19 |
+
|
def create_section_files(reports_path, output_path, no_split):
    """Section MIMIC-CXR free-text reports and write the results to CSV.

    Modification of:
    https://github.com/MIT-LCP/mimic-cxr/blob/master/txt/create_section_files.py

    Args:
        reports_path: root of the MIMIC-CXR ``files`` directory, containing
            the ``p1x`` group folders.
        output_path: directory for the output CSV files; created (including
            parents) if it does not exist.
        no_split: if True, write every report to a single
            ``mimic_cxr_sections.csv``; otherwise write chunked files of
            ~10k reports each.
    """
    reports_path = Path(reports_path)
    output_path = Path(output_path)

    # Create missing parents too, and tolerate a pre-existing directory.
    output_path.mkdir(parents=True, exist_ok=True)

    # not all reports can be automatically sectioned
    # we load in some dictionaries which have manually determined sections
    custom_section_names, custom_indices = custom_mimic_cxr_rules()

    # get all higher up folders (p00, p01, etc)
    p_grp_folders = os.listdir(reports_path)
    p_grp_folders = [p for p in p_grp_folders
                     if p.startswith('p') and len(p) == 3]
    p_grp_folders.sort()

    # patient_studies will hold the text for use in NLP labeling
    patient_studies = []

    # study_sections will have an element for each study
    # this element will be a list, each element having text for a specific section
    study_sections = []
    for p_grp in p_grp_folders:
        # get patient folders, usually around ~6k per group folder
        cxr_path = reports_path / p_grp
        p_folders = os.listdir(cxr_path)
        p_folders = [p for p in p_folders if p.startswith('p')]
        p_folders.sort()

        # For each patient in this grouping folder
        print(p_grp)
        for p in tqdm(p_folders):
            patient_path = cxr_path / p

            # get the filename for all their free-text reports
            studies = os.listdir(patient_path)
            studies = [s for s in studies
                       if s.endswith('.txt') and s.startswith('s')]

            for s in studies:
                # load in the free-text report
                with open(patient_path / s, 'r') as fp:
                    text = ''.join(fp.readlines())

                # get study string name without the txt extension
                s_stem = s[0:-4]

                # custom rules for some poorly formatted reports
                if s_stem in custom_indices:
                    idx = custom_indices[s_stem]
                    patient_studies.append([s_stem, text[idx[0]:idx[1]]])
                    continue

                # split text into sections
                sections, section_names, section_idx = section_text(text)

                # check to see if this has mis-named sections
                # e.g. sometimes the impression is in the comparison section
                if s_stem in custom_section_names:
                    sn = custom_section_names[s_stem]
                    idx = list_rindex(section_names, sn)
                    patient_studies.append([s_stem, sections[idx].strip()])
                    continue

                # grab the *last* section with the given title
                # prioritizes impression > findings, etc.

                # "last_paragraph" is text up to the end of the report
                # many reports are simple, and have a single section
                # header followed by a few paragraphs
                # these paragraphs are grouped into section "last_paragraph"

                # note also comparison seems unusual but if no other sections
                # exist the radiologist has usually written the report
                # in the comparison section
                idx = -1
                for sn in ('impression', 'findings', 'indication', 'history', 'last_paragraph', 'comparison'):
                    if sn in section_names:
                        idx = list_rindex(section_names, sn)
                        break

                if idx == -1:
                    # we didn't find any sections we can use :(
                    patient_studies.append([s_stem, ''])
                    print(f'no impression/findings: {patient_path / s}')
                else:
                    # store the text of the conclusion section
                    patient_studies.append([s_stem, sections[idx].strip()])

                study_sectioned = [s_stem]
                for sn in ('impression', 'findings', 'indication', 'history', 'last_paragraph', 'comparison'):
                    if sn in section_names:
                        idx = list_rindex(section_names, sn)
                        study_sectioned.append(sections[idx].strip())
                    else:
                        study_sectioned.append(None)
                study_sections.append(study_sectioned)

    # write distinct files to facilitate modular processing
    if len(patient_studies) > 0:
        # write out a single CSV with the sections
        # newline='' is the documented way to open files for the csv module
        with open(output_path / 'mimic_cxr_sectioned.csv', 'w', newline='') as fp:
            csvwriter = csv.writer(fp)
            # write header
            csvwriter.writerow(['study', 'impression', 'findings', 'indication', 'history', 'last_paragraph', 'comparison'])
            for row in study_sections:
                csvwriter.writerow(row)

        if no_split:
            # write all the reports out to a single file
            with open(output_path / 'mimic_cxr_sections.csv', 'w', newline='') as fp:
                csvwriter = csv.writer(fp)
                for row in patient_studies:
                    csvwriter.writerow(row)
        else:
            # write ~22 files with ~10k reports each
            n = 0
            jmp = 10000

            while n < len(patient_studies):
                n_fn = n // jmp
                with open(output_path / f'mimic_cxr_{n_fn:02d}.csv', 'w', newline='') as fp:
                    csvwriter = csv.writer(fp)
                    for row in patient_studies[n:n+jmp]:
                        csvwriter.writerow(row)
                n += jmp
modelling_cxrmate_ed.py
CHANGED
|
@@ -1,12 +1,8 @@
|
|
| 1 |
-
import csv
|
| 2 |
-
import functools
|
| 3 |
import math
|
| 4 |
import os
|
| 5 |
-
import re
|
| 6 |
-
from collections import OrderedDict
|
| 7 |
from glob import glob
|
| 8 |
from pathlib import Path
|
| 9 |
-
from typing import
|
| 10 |
|
| 11 |
import duckdb
|
| 12 |
import pandas as pd
|
|
@@ -24,10 +20,11 @@ from transformers.models.vision_encoder_decoder.configuration_vision_encoder_dec
|
|
| 24 |
)
|
| 25 |
from transformers.utils import logging
|
| 26 |
|
|
|
|
| 27 |
from .dataset import StudyIDEDStayIDSubset
|
| 28 |
from .modelling_uniformer import MultiUniFormerWithProjectionHead
|
| 29 |
from .records import EDCXRSubjectRecords
|
| 30 |
-
from .tables import ed_module_tables
|
| 31 |
|
| 32 |
logger = logging.get_logger(__name__)
|
| 33 |
|
|
@@ -940,7 +937,7 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
|
|
| 940 |
"Please download them using wget -r -N -c -np --reject dcm --user <username> --ask-password https://physionet.org/files/mimic-cxr/2.0.0/"""
|
| 941 |
|
| 942 |
print('Extracting sections from reports...')
|
| 943 |
-
|
| 944 |
reports_path=os.path.join(physionet_dir, 'mimic-cxr', '2.0.0', 'files'),
|
| 945 |
output_path=sectioned_dir,
|
| 946 |
no_split=True,
|
|
@@ -1009,8 +1006,8 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
|
|
| 1009 |
|
| 1010 |
connect.sql(f"CREATE OR REPLACE TABLE mimic_cxr AS SELECT * FROM df")
|
| 1011 |
|
| 1012 |
-
# Create lookup tables
|
| 1013 |
-
for k, v in ed_module_tables.items():
|
| 1014 |
if v.load and v.index_columns:
|
| 1015 |
start_idx = 0
|
| 1016 |
for i in v.index_columns_source:
|
|
@@ -1127,3 +1124,30 @@ class MIMICIVEDCXRMultimodalModel(VisionEncoderDecoderModel):
|
|
| 1127 |
f'No. of training dicom_ids, study_ids, & subject_ids: {dataset.num_dicom_ids},',
|
| 1128 |
f'{dataset.num_study_ids}, & {dataset.num_subject_ids}.',
|
| 1129 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import math
|
| 2 |
import os
|
|
|
|
|
|
|
| 3 |
from glob import glob
|
| 4 |
from pathlib import Path
|
| 5 |
+
from typing import Optional, Tuple, Union
|
| 6 |
|
| 7 |
import duckdb
|
| 8 |
import pandas as pd
|
|
|
|
| 20 |
)
|
| 21 |
from transformers.utils import logging
|
| 22 |
|
| 23 |
+
from .create_section_files import create_section_files
|
| 24 |
from .dataset import StudyIDEDStayIDSubset
|
| 25 |
from .modelling_uniformer import MultiUniFormerWithProjectionHead
|
| 26 |
from .records import EDCXRSubjectRecords
|
| 27 |
+
from .tables import ed_module_tables, mimic_cxr_tables
|
| 28 |
|
| 29 |
logger = logging.get_logger(__name__)
|
| 30 |
|
|
|
|
| 937 |
"Please download them using wget -r -N -c -np --reject dcm --user <username> --ask-password https://physionet.org/files/mimic-cxr/2.0.0/"""
|
| 938 |
|
| 939 |
print('Extracting sections from reports...')
|
| 940 |
+
create_section_files(
|
| 941 |
reports_path=os.path.join(physionet_dir, 'mimic-cxr', '2.0.0', 'files'),
|
| 942 |
output_path=sectioned_dir,
|
| 943 |
no_split=True,
|
|
|
|
| 1006 |
|
| 1007 |
connect.sql(f"CREATE OR REPLACE TABLE mimic_cxr AS SELECT * FROM df")
|
| 1008 |
|
| 1009 |
+
# Create lookup tables:
|
| 1010 |
+
for k, v in (ed_module_tables | mimic_cxr_tables).items():
|
| 1011 |
if v.load and v.index_columns:
|
| 1012 |
start_idx = 0
|
| 1013 |
for i in v.index_columns_source:
|
|
|
|
| 1124 |
f'No. of training dicom_ids, study_ids, & subject_ids: {dataset.num_dicom_ids},',
|
| 1125 |
f'{dataset.num_study_ids}, & {dataset.num_subject_ids}.',
|
| 1126 |
)
|
| 1127 |
+
return dataset
|
| 1128 |
+
|
@staticmethod
def collate_fn(batch):
    """Collate a list of per-study dicts into a single padded batch dict.

    Every key that appears in any element is propagated to all elements
    (missing entries become None), then selected tensor-valued entries are
    padded to a common length along the sequence dimension.

    Args:
        batch: list of dicts; each dict maps feature names to tensors.
            'images' is assumed present in every element — TODO confirm.

    Returns:
        Dict mapping each key to a list, or to a padded tensor for
        'images', '*index_value_feats*', '*token_type_ids*', '*mask*',
        and index-value time-delta entries.
    """
    # Union of keys across all elements, so no feature is dropped.
    keys = set().union(*(d.keys() for d in batch))
    # Transpose list-of-dicts -> dict-of-lists; setdefault fills gaps
    # with None (and mutates the input dicts as a side effect).
    batch = {j: [i.setdefault(j, None) for i in batch] for j in keys}
    batch['images'] = torch.nn.utils.rnn.pad_sequence(batch['images'], batch_first=True, padding_value=0.0)

    for k in keys:
        if 'index_value_feats' in k:

            # Feature width taken from the first non-missing element;
            # assumes at least one element carries this key — TODO confirm.
            total_indices = next(i for i in batch[k] if i is not None).shape[-1]
            # Missing entries become empty (0, total_indices) tensors so
            # pad_sequence can stack them.
            batch[k] = [i if i is not None else torch.empty(0, total_indices) for i in batch[k]]
            batch[k] = torch.nn.utils.rnn.pad_sequence(batch[k], batch_first=True, padding_value=-1)  # Pad value of -1 is not ideal. Need to use something else.
            # Matching token-type-id entries are padded with 0.
            token_type_id_name = k.replace('_feats', '_token_type_ids')
            batch[token_type_id_name] = [i if i is not None else torch.empty(0, dtype=torch.long) for i in batch[token_type_id_name]]
            batch[token_type_id_name] = torch.nn.utils.rnn.pad_sequence(
                batch[token_type_id_name], batch_first=True, padding_value=0,
            )
            # Mask is 1 wherever any feature value differs from the -1 pad.
            mask_name = k.replace('_feats', '_mask')
            batch[mask_name] = (batch[k] != -1).any(dim=-1).int()

        if 'time_delta' in k and 'index_value' in k:
            # Time deltas are (seq, 1); missing entries become empty.
            batch[k] = [i if i is not None else torch.empty(0, 1) for i in batch[k]]
            batch[k] = torch.nn.utils.rnn.pad_sequence(batch[k], batch_first=True, padding_value=0)

    return batch
section_parser.py
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
|
def section_text(text):
    """Split a radiology report into sections.

    Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/LICENSE

    Assumes text is in a radiology report format, e.g.:

        COMPARISON: Chest radiograph dated XYZ.

        IMPRESSION: ABC...

    Given text like this, it will output text from each section,
    where the section type is determined by the all caps header.

    Returns a three element tuple:
        sections - list containing the text of each section
        section_names - a normalized version of the section name
        section_idx - list of start indices of the text in the section
    """
    # Header pattern: a newline + space, then an ALL-CAPS phrase ending
    # with ':' and whitespace.
    p_section = re.compile(
        r'\n ([A-Z ()/,-]+):\s', re.DOTALL)

    sections = list()
    section_names = list()
    section_idx = list()

    idx = 0
    s = p_section.search(text, idx)

    if s:
        # Everything before the first header is the preamble.
        sections.append(text[0:s.start(1)])
        section_names.append('preamble')
        section_idx.append(0)

        while s:
            current_section = s.group(1).lower()
            # get the start of the text for this section
            idx_start = s.end()
            # skip past the first newline to avoid some bad parses
            idx_skip = text[idx_start:].find('\n')
            if idx_skip == -1:
                idx_skip = 0

            s = p_section.search(text, idx_start + idx_skip)

            if s is None:
                idx_end = len(text)
            else:
                idx_end = s.start()

            sections.append(text[idx_start:idx_end])
            section_names.append(current_section)
            section_idx.append(idx_start)

    else:
        # No headers at all: treat the whole text as one section.
        sections.append(text)
        section_names.append('full report')
        section_idx.append(0)

    section_names = normalize_section_names(section_names)

    # remove empty sections
    # this handles when the report starts with a finding-like statement
    # .. but this statement is not a section, more like a report title
    # e.g. p10/p10103318/s57408307
    # CHEST, PA LATERAL:
    #
    # INDICATION: This is the actual section ....
    # it also helps when there are multiple findings sections
    # usually one is empty
    # (iterate in reverse so pops do not shift pending indices)
    for i in reversed(range(len(section_names))):
        if section_names[i] in ('impression', 'findings'):
            if sections[i].strip() == '':
                sections.pop(i)
                section_names.pop(i)
                section_idx.pop(i)

    if ('impression' not in section_names) & ('findings' not in section_names):
        # create a new section for the final paragraph
        # ('\n \n' is the paragraph separator used in these reports)
        if '\n \n' in sections[-1]:
            sections.append('\n \n'.join(sections[-1].split('\n \n')[1:]))
            sections[-2] = sections[-2].split('\n \n')[0]
            section_names.append('last_paragraph')
            section_idx.append(section_idx[-1] + len(sections[-2]))

    return sections, section_names, section_idx
| 92 |
+
|
| 93 |
+
|
def normalize_section_names(section_names):
    """Map raw report headers onto a canonical set of section names.

    Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/LICENSE

    Matching proceeds in three stages per header: an exact lookup in a
    frequency table (which also fixes common typos), a substring match
    against the main canonical names, and finally a regex match for
    headers that merely describe the imaging views (treated as findings).
    Headers that match nothing are left as-is (lower-cased and stripped).
    Returns a new list; the input list is not modified.
    """
    # Case-fold and trim every header before matching.
    section_names = [s.lower().strip() for s in section_names]

    # Exact-match table: raw header -> canonical name.
    # Trailing comments give the approximate corpus frequency.
    frequent_sections = {
        "preamble": "preamble",  # 227885
        "impression": "impression",  # 187759
        "comparison": "comparison",  # 154647
        "indication": "indication",  # 153730
        "findings": "findings",  # 149842
        "examination": "examination",  # 94094
        "technique": "technique",  # 81402
        "history": "history",  # 45624
        "comparisons": "comparison",  # 8686
        "clinical history": "history",  # 7121
        "reason for examination": "indication",  # 5845
        "notification": "notification",  # 5749
        "reason for exam": "indication",  # 4430
        "clinical information": "history",  # 4024
        "exam": "examination",  # 3907
        "clinical indication": "indication",  # 1945
        "conclusion": "impression",  # 1802
        "chest, two views": "findings",  # 1735
        "recommendation(s)": "recommendations",  # 1700
        "type of examination": "examination",  # 1678
        "reference exam": "comparison",  # 347
        "patient history": "history",  # 251
        "addendum": "addendum",  # 183
        "comparison exam": "comparison",  # 163
        "date": "date",  # 108
        "comment": "comment",  # 88
        "findings and impression": "impression",  # 87
        "wet read": "wet read",  # 83
        "comparison film": "comparison",  # 79
        "recommendations": "recommendations",  # 72
        "findings/impression": "impression",  # 47
        "pfi": "history",
        'recommendation': 'recommendations',
        'wetread': 'wet read',
        'ndication': 'impression',  # 1
        'impresson': 'impression',  # 2
        'imprression': 'impression',  # 1
        'imoression': 'impression',  # 1
        'impressoin': 'impression',  # 1
        'imprssion': 'impression',  # 1
        'impresion': 'impression',  # 1
        'imperssion': 'impression',  # 1
        'mpression': 'impression',  # 1
        'impession': 'impression',  # 3
        'findings/ impression': 'impression',  # ,1
        'finding': 'findings',  # ,8
        'findins': 'findings',
        'findindgs': 'findings',  # ,1
        'findgings': 'findings',  # ,1
        'findngs': 'findings',  # ,1
        'findnings': 'findings',  # ,1
        'finidngs': 'findings',  # ,2
        'idication': 'indication',  # ,1
        'reference findings': 'findings',  # ,1
        'comparision': 'comparison',  # ,2
        'comparsion': 'comparison',  # ,1
        'comparrison': 'comparison',  # ,1
        'comparisions': 'comparison',  # ,1
    }

    # Phrases describing the imaging views; such headers are really titles
    # for the whole study, i.e. equivalent to a findings section.
    view_phrases = [
        'chest',
        'portable',
        'pa and lateral',
        'lateral and pa',
        'ap and lateral',
        'lateral and ap',
        'frontal and',
        'two views',
        'frontal view',
        'pa view',
        'ap view',
        'one view',
        'lateral view',
        'bone window',
        'frontal upright',
        'frontal semi-upright',
        'ribs',
        'pa and lat',
    ]
    view_pattern = re.compile('({})'.format('|'.join(view_phrases)))

    canonical = [
        'impression', 'findings', 'history', 'comparison',
        'addendum',
    ]

    for pos, header in enumerate(section_names):
        # Stage 1: exact lookup (also catches frequent typos).
        if header in frequent_sections:
            section_names[pos] = frequent_sections[header]
            continue

        # Stage 2: canonical name appearing anywhere inside the header.
        hit = next((name for name in canonical if name in header), None)
        if hit is not None:
            section_names[pos] = hit
            continue

        # Stage 3: headers that just describe the views count as findings.
        if view_pattern.search(header) is not None:
            section_names[pos] = 'findings'

    return section_names
| 212 |
+
|
| 213 |
+
|
def custom_mimic_cxr_rules():
    """Return manual overrides for reports that cannot be auto-sectioned.

    Source: https://github.com/MIT-LCP/mimic-cxr/blob/master/LICENSE

    Returns:
        A two-element tuple:
            custom_section_names - dict mapping a study id (e.g. 's50913680')
                to the name of the section that actually holds the report's
                conclusion (its header is mislabeled in the original text).
            custom_indices - dict mapping a study id to a [start, end]
                character span of the report text to use directly;
                [0, 0] marks reports with no usable findings at all.
    """
    # Studies whose conclusion lives under a mislabeled section header.
    custom_section_names = {
        's50913680': 'recommendations',  # files/p11/p11851243/s50913680.txt
        's59363654': 'examination',  # files/p12/p12128253/s59363654.txt
        's59279892': 'technique',  # files/p13/p13150370/s59279892.txt
        's59768032': 'recommendations',  # files/p13/p13249077/s59768032.txt
        's57936451': 'indication',  # files/p14/p14325424/s57936451.txt
        's50058765': 'indication',  # files/p14/p14731346/s50058765.txt
        's53356173': 'examination',  # files/p15/p15898350/s53356173.txt
        's53202765': 'technique',  # files/p16/p16076182/s53202765.txt
        's50808053': 'technique',  # files/p16/p16631485/s50808053.txt
        's51966317': 'indication',  # files/p10/p10817099/s51966317.txt
        's50743547': 'examination',  # files/p11/p11388341/s50743547.txt
        's56451190': 'note',  # files/p11/p11842879/s56451190.txt
        's59067458': 'recommendations',  # files/p11/p11984647/s59067458.txt
        's59215320': 'examination',  # files/p12/p12408912/s59215320.txt
        's55124749': 'indication',  # files/p12/p12428492/s55124749.txt
        's54365831': 'indication',  # files/p13/p13876470/s54365831.txt
        's59087630': 'recommendations',  # files/p14/p14267880/s59087630.txt
        's58157373': 'recommendations',  # files/p15/p15032392/s58157373.txt
        's56482935': 'recommendations',  # files/p15/p15388421/s56482935.txt
        's58375018': 'recommendations',  # files/p15/p15505556/s58375018.txt
        's54654948': 'indication',  # files/p17/p17090359/s54654948.txt
        's55157853': 'examination',  # files/p18/p18975498/s55157853.txt
        's51491012': 'history',  # files/p19/p19314266/s51491012.txt

    }

    # Studies whose usable text is a fixed [start, end] character span.
    custom_indices = {
        's50525523': [201, 349],  # files/p10/p10602608/s50525523.txt
        's57564132': [233, 554],  # files/p10/p10637168/s57564132.txt
        's59982525': [313, 717],  # files/p11/p11989982/s59982525.txt
        's53488209': [149, 475],  # files/p12/p12458657/s53488209.txt
        's54875119': [234, 988],  # files/p13/p13687044/s54875119.txt
        's50196495': [59, 399],  # files/p13/p13894879/s50196495.txt
        's56579911': [59, 218],  # files/p15/p15394326/s56579911.txt
        's52648681': [292, 631],  # files/p15/p15666238/s52648681.txt
        's59889364': [172, 453],  # files/p15/p15835529/s59889364.txt
        's53514462': [73, 377],  # files/p16/p16297706/s53514462.txt
        's59505494': [59, 450],  # files/p16/p16730991/s59505494.txt
        's53182247': [59, 412],  # files/p16/p16770442/s53182247.txt
        's51410602': [47, 320],  # files/p17/p17069955/s51410602.txt
        's56412866': [522, 822],  # files/p17/p17612000/s56412866.txt
        's54986978': [59, 306],  # files/p17/p17912487/s54986978.txt
        's59003148': [262, 505],  # files/p17/p17916384/s59003148.txt
        's57150433': [61, 394],  # files/p18/p18335791/s57150433.txt
        's56760320': [219, 457],  # files/p18/p18418794/s56760320.txt
        's59562049': [158, 348],  # files/p18/p18502016/s59562049.txt
        's52674888': [145, 296],  # files/p19/p19381919/s52674888.txt
        's55258338': [192, 568],  # files/p13/p13719117/s55258338.txt
        's59330497': [140, 655],  # files/p15/p15479218/s59330497.txt
        's52119491': [179, 454],  # files/p17/p17959278/s52119491.txt
        # below have no findings at all in the entire report
        's58235663': [0, 0],  # files/p11/p11573679/s58235663.txt
        's50798377': [0, 0],  # files/p12/p12632853/s50798377.txt
        's54168089': [0, 0],  # files/p14/p14463099/s54168089.txt
        's53071062': [0, 0],  # files/p15/p15774521/s53071062.txt
        's56724958': [0, 0],  # files/p16/p16175671/s56724958.txt
        's54231141': [0, 0],  # files/p16/p16312859/s54231141.txt
        's53607029': [0, 0],  # files/p17/p17603668/s53607029.txt
        's52035334': [0, 0],  # files/p19/p19349312/s52035334.txt
    }

    return custom_section_names, custom_indices