Skip to content
Snippets Groups Projects
Commit 52025320 authored by Cooper Mellema's avatar Cooper Mellema
Browse files

Readme and license update

parent a0143a85
No related merge requests found
LICENSE 0 → 100644
Copyright (c) 2018 [IH1] The University of Texas Southwestern Medical Center.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted for academic and research use only (subject to the limitations in the disclaimer below) provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holders nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
# Parkinson's Code Readme
## Purpose
Code to process and predict prognosis for the PDBP dataset, PPMI dataset, and internally acquired UTSW prospective dataset
\ No newline at end of file
This diff is collapsed.
......@@ -109,7 +109,7 @@ def fColumnSort(pdMeds):
pdMeds_Prior_OtherMeds = pdMeds[lsPrior_CommonCols + lsPrior_OtherMeds]
pdMeds_Park_NeuroMeds = pdMeds[lsPark_CommonCols + lsPark_NeuroMeds]
pdMeds_Park_OtherMeds = pdMeds[lsPark_CommonCols + lsPark_OtherMeds]
#rename columns
lsNewCols = ['Site', 'Visit', 'Date', 'GUID', 'Age', 'Med', 'Dose', 'Unit', 'Frequency']
pdMeds_Prior_NeuroMeds.columns = lsNewCols
......@@ -162,7 +162,7 @@ def fSelectPDMeds(pdMeds):
Return:
pdMeds (pd dataframe): medication table with only PD meds included, all renamed to common naming convention
"""
dMedicationLookup = ledd.fLoadMedicationLookup('/project/bioinformatics/DLLab/Cooper/Code/Parkinsons/PDBP_MedicationSynonyms.json')
dMedicationLookup = ledd.fLoadMedicationLookup('/archive/bioinformatics/DLLab/CooperMellema/src/parkinsons/medication/PDBP_MedicationSynonyms.json')
# rename medications, putting np.nan in for non-pd medications
pdMeds[['Med']] = \
......@@ -184,9 +184,9 @@ def fDoseCorrect(pdMeds):
# Load the doses/day lookup table
# Note: PRN medications are ignored, as we have no way of knowing how much was taken
# Note 2: Some medications were given as a range, the average value was taken for these
dDosesPerDay = ledd.fLoadJson(sJsonPath='/project/bioinformatics/DLLab/Cooper/Code/Parkinsons/PDBP_DosesPerDay.json')
dDoseLookup = ledd.fLoadJson(sJsonPath='/project/bioinformatics/DLLab/Cooper/Code/Parkinsons/PDBP_DoseStrings.json')
dDosesPerDay = ledd.fLoadJson(sJsonPath='/archive/bioinformatics/DLLab/CooperMellema/src/parkinsons/medication/PDBP_DosesPerDay.json')
dDoseLookup = ledd.fLoadJson(sJsonPath='/archive/bioinformatics/DLLab/CooperMellema/src/parkinsons/medication/PDBP_DoseStrings.json')
# Fill in frequency strings
# where there are defined values in the concomitant meds field, copy that value to the corresponding Parkinsonism_meds field
# the concom encoded tables used string frequencies, so we need to use another lookup table
......@@ -243,7 +243,7 @@ def fUnitCorrect(pdMeds):
Returns:
pdMeds (dataframe): dataframe corrected to all mg units
"""
dUnitConversions = ledd.fLoadJson(sJsonPath='/project/bioinformatics/DLLab/Cooper/Code/Parkinsons/PDBP_UnitConvert.json')
dUnitConversions = ledd.fLoadJson(sJsonPath='/archive/bioinformatics/DLLab/CooperMellema/src/parkinsons/medication/PDBP_UnitConvert.json')
for sKey in [x for x in dUnitConversions.keys() if not '__' in x]:
if sKey in pdMeds['Unit'].values:
if type(dUnitConversions[sKey])==list:
......@@ -266,9 +266,9 @@ def fDEPRECIATEDMedicationSort(pdMeds):
# Load the doses/day lookup table
# Note: PRN medications are ignored, as we have no way of knowing how much was taken
# Note 2: Some medications were given as a range, the average value was taken for these
dDosesPerDay = ledd.fLoadJson(sJsonPath='/project/bioinformatics/DLLab/Cooper/Code/Parkinsons/PDBP_DosesPerDay.json')
dDoseLookup = ledd.fLoadJson(sJsonPath='/project/bioinformatics/DLLab/Cooper/Code/Parkinsons/PDBP_DoseStrings.json')
dDosesPerDay = ledd.fLoadJson(sJsonPath='/archive/bioinformatics/DLLab/CooperMellema/src/parkinsons/medication/PDBP_DosesPerDay.json')
dDoseLookup = ledd.fLoadJson(sJsonPath='/archive/bioinformatics/DLLab/CooperMellema/src/parkinsons/medication/PDBP_DoseStrings.json')
# Fill in Parkinsons meds from the PriorAndConcomitantMeds fields (naming conventions are off, rewriting everything in Parkinsonism_Meds fields)
for sConcom, sPD in [('Required Fields.GUID', 'Required.GUID'),\
......@@ -320,7 +320,7 @@ def fDEPRECIATEDMedicationSort(pdMeds):
else:
pdMeds.loc[~pdMeds[f'PriorAndConcomitantMeds.{sConcom}'].isnull(), [f'Parkinsonism_Meds.{sPD}']] = \
pdMeds.loc[~pdMeds[f'PriorAndConcomitantMeds.{sConcom}'].isnull(), [f'PriorAndConcomitantMeds.{sConcom}']].values
# fill in cols that should have been duplicated (only first row filled for
# a given patient, i.e pt 1 has 3 entries, but info only in the first)
lsFillCols=['Study ID',
......@@ -331,7 +331,6 @@ def fDEPRECIATEDMedicationSort(pdMeds):
'Parkinsonism_Meds.Required.VisitDate',
'Parkinsonism_Meds.Required.AgeYrs',
'Parkinsonism_Meds.Required.AgeRemaindrMonths',
'Parkinsonism_Meds.Required.AgeVal']
pdMeds[lsFillCols]=pdMeds[lsFillCols].fillna(method='ffill')
pdMeds=pdMeds.set_index(['Parkinsonism_Meds.Required.GUID'])
......@@ -385,7 +384,7 @@ def fManualFixes(pdMeds):
lsSuspectEntries.append(pdMeds.loc[(pdMeds['Med']=='Rotigotine').values & (pdMeds['Dose']>=20).values])
pdMeds.loc[(pdMeds['Med']=='Rotigotine').values & (pdMeds['Dose']>=20).values, 'Dose'] = \
pdMeds.loc[(pdMeds['Med']=='Rotigotine').values & (pdMeds['Dose']>=20).values, 'Dose'].values/10.0
# Patient PDZJ672JCJ, PDCU336WW9, 'PDWF428BU7', 'PDPP163NDH','PDHE978YX5', 'PDDM149NRN','PDWG274ZZ4' and 'NIHTD106ZTUEA'
# have dosage for carb/levodopa recorded at 25,000 mg. I corrected these all to 25 mg based on the other doses nearby in
# the table (e.g. the patient wouldn't be taking 13 mg pills).
......@@ -393,7 +392,7 @@ def fManualFixes(pdMeds):
lsSuspectEntries.append(pdMeds.loc[pdMeds['Dose']>1000])
pdMeds.loc[(pdMeds['Med']=='Carbidopa-Levodopa IR').values & (pdMeds['Dose']>1000).values, 'Dose'] = \
pdMeds.loc[(pdMeds['Med']=='Carbidopa-Levodopa IR').values & (pdMeds['Dose']>1000).values, 'Dose'].values/1000.0
# Note: patient PDXM370FA1 is suspect as well, they appear to have gone from 0 LEDD at baseline to the highest at 12 months and
# look like they have duplicated rows in the medication table
......@@ -402,7 +401,7 @@ def fManualFixes(pdMeds):
lsSuspectEntries.append(pdMeds.loc[(pdMeds['Med']=='Carbidopa-Levodopa IR').values & (pdMeds['Dose']<1).values])
pdMeds.loc[(pdMeds['Med']=='Carbidopa-Levodopa IR').values & (pdMeds['Dose']<1).values, 'Dose'] = \
pdMeds.loc[(pdMeds['Med']=='Carbidopa-Levodopa IR').values & (pdMeds['Dose']<1).values, 'Dose'].values*10.0
# For Selegiline, patient PDRR084TGM is receiving '50 mg' per dose. Based on typical selegiline levels,
# this is likely 5 mg. (recommended dose for PD is in 5mg pills)
lsSuspectEntries.append(pdMeds.loc[(pdMeds['Med']=='Selegiline').values & (pdMeds['Dose']>10).values])
......@@ -602,7 +601,7 @@ def fCollectSuspect(pdErrors, pdLEDD_PerSub, pdMeds, pdManuallyAltered):
}
)
], ignore_index=True)
# collect all of the high delta LEDD in one visit
# (change greater than 500 mg of LEDD) from one visit to next
for i in range(len(pdLEDD_PerSub.index)):
......@@ -672,12 +671,11 @@ if __name__ == "__main__":
'24 months','30 months','36 months','42 months','60 months']]
# save
pdLEDD_PerSub.to_csv('/project/bioinformatics/DLLab/Cooper/Code/Parkinsons/PDBP_Metadata/PDBP_LEDD.csv')
pdLEDD_PerSub.to_csv('/archive/bioinformatics/DLLab/CooperMellema/src/parkinsons/index_files/PDBP_LEDD.csv')
# plot
ledd.fPlotErrors(pdErrors)
# collect suspect subjects and save
pdSuspect = fCollectSuspect(pdErrors, pdLEDD_PerSub, pdMeds, pdManuallyAltered)
pdSuspect.to_csv('/project/bioinformatics/DLLab/Cooper/Code/Parkinsons/PDBP_Metadata/PDBP_Suspect_Patients.csv')
#pdSuspect=pd.read_csv('/project/bioinformatics/DLLab/Cooper/Code/Parkinsons/PDBP_Metadata/PDBP_Suspect_Patients.csv', index_col=0)
\ No newline at end of file
pdSuspect.to_csv('/archive/bioinformatics/DLLab/CooperMellema/src/parkinsons/index_files/PDBP_LEDD_Suspect.csv')
......@@ -431,7 +431,7 @@ if __name__ == '__main__':
# loop through each experiment:
for sTarget in lsTargets:
sSaveRoot=f'/project/bioinformatics/DLLab/Cooper/Code/Parkinsons/TrainedModels/{sTarget}Regression'
iModel=int(sIniFile.split('Dense_')[-1].split('.ini')[0])
sSelection=None
if 'UnivariateSelected' in sTarget:
......@@ -460,7 +460,7 @@ if __name__ == '__main__':
for iInner in dSplits[iOuter].keys():
if not os.path.isfile(f'{sSaveRoot}/{sAtlas}/{sMetric}/{sLoss}_Optimized/Dense_{iModel}_Inner{iInner}_Outer{iOuter}.p'):
idxTr, idxVal, idxTe = dSplits[iOuter][iInner]
# separate data into tr, val, te
aTr, aVal, aTe, aTrY, aValY, aTeY = fFetchData(pdData, idxTr, idxVal, idxTe, sCol, sTarget, sSelection=sSelection)
......
{
"AAHead_Scout_64ch-head-coil": "anat_T1w_acq-scout",
"t1_mprage_sag_p3_1mm_iso": "anat_T1w_acq-sag",
"ABCD_fMRI_DistortionMap_PA": "fmap_epi_intendedfor_func_dir-PA",
"ABCD_fMRI_DistortionMap_AP": "fmap_epi_intendedfor_func_dir-AP",
"ABCD_fMRI_rest_Fixation": "func_bold_task-rest",
"ME_4e_2-2mm_G2M4_TR1500_42ms": "func_bold_task-rest_acq-multiband_echos-4_res-22mm_TR-1500",
"ME_4e_2.4mm_G2M4_TR1598": "func_bold_task-rest_acq-multiband_echos-4_res-24mm_TR-1598",
"ME_4e_2.4mm_G2M4_TR1598_PA": "func_bold_task-rest_acq-multiband_echos-4_res-24mm_TR-1598_dir-PA",
"ME_5e_2.4mm_G2M4_TR1598_AP": "func_bold_task-rest_acq-multiband_echos-5_res-24mm_TR-1598_dir-AP",
"ME_5e_2.4mm_G2M4_TR1598_PA": "func_bold_task-rest_acq-multiband_echos-5_res-24mm_TR-1598_dir-PA",
"ME_3e_2.4mm_G2M4_TR1274_MB4_AP":"func_bold_task-rest_acq-multiband_echos-3_res-24mm_TR-1274_dir-AP",
"ME_3e_2.4mm_G2M4_TR1274_MB4_PA": "func_bold_task-rest_acq-multiband_echos-3_res-24mm_TR-1274_dir-PA",
"ME_3e_2.0mm_G2M4_TR1500_MB6_AP": "func_bold_task-rest_acq-multiband_echos-3_res-2mm_TR-1500_dir-AP",
"ME_3e_2.0mm_G2M4_TR1500_MB6_PA": "func_bold_task-rest_acq-multiband_echos-3_res-2mm_TR-1500_dir-PA",
"ME_5e_2.4mm_G2M4_TR1355_MB6_AP": "func_bold_task-rest_acq-multiband_echos-5_res-24mm_TR-1355_dir-AP",
"ME_5e_2.4mm_G2M4_TR1355_MB6_PA": "func_bold_task-rest_acq-multiband_echos-5_res-24mm_TR-1355_dir-PA",
"3D_T2_Sagittal": "anat_T2w_acq-sag",
"qsm_Gradient mode Fast_0.8x0.8x0.8": "anat_QSM",
"ABCD_Task_Car vs Dog": "func_bold_task-gonogocardog",
"ABCD_Task_Cars vs Dogs": "func_bold_task-gonogocarsdogs",
"ABCD_Task_Living vs NonLiving": "func_bold_task-gonogoliving",
"ABCD_Task_Finger Tap": "func_bold_task-tap",
"ABCD_dMRI_DistortionMap_PA": "fmap_epi_intendedfor_dwi_dir-PA",,
"ABCD_dMRI_DistortionMap_AP": "fmap_epi_intendedfor_dwi_dir-AP",
"ABCD_dMRI_Changed_Rise_Time": "dwi",
"AMRI_GRE_STE_ref_2D": "anat_qsm",
"AMRI_GRE_STE_ref_2D_BDC": "anat_qsm_acq-BDC",
"AMRI_GRE_STE_0p8_S2x2": "anat_qsm_acq-S2x2",
"R2*_GRE": "anat_R2star",
"t2_space_dark-fluid_sag_c4_iso_fast": "anat_T2flair",
"MT_TR30_88slices": "anat_neuromelanin"
}
\ No newline at end of file
......@@ -20,6 +20,7 @@ import os
import pandas as pd
import numpy as np
import json
from pathlib import Path
def fGetData():
"""fetches the old LME data and the new LEDD scores to replace the old with
......@@ -28,10 +29,10 @@ def fGetData():
pdLME_Prepped_Data, pdNew_LEDD: dataframes with old and new data
"""
# set paths to LEDD data
sRoot = os.getcwd()
sRoot = Path(os.getcwd()).parent
# First, the pre-prepped LME data
sLME_Prepped_Data = os.path.join(sRoot, 'PDBP_Metadata', 'prepared_lme_data.csv')
sLME_Prepped_Data = os.path.join(sRoot, 'index_files', 'prepared_lme_data.csv')
pdLME_Prepped_Data = pd.read_csv(sLME_Prepped_Data, index_col=1).drop(['Unnamed: 0'], axis=1)
# Then, the UTSW data
......@@ -39,7 +40,7 @@ def fGetData():
pdUTSW_LEDD = pdUTSW_LEDD[['Site', 'Visit Description', 'Visit#', 'Total_LED']]
# Then, the new LEDD data
sNew_LEDD = os.path.join(sRoot, 'PDBP_Metadata', 'PDBP_LEDD.csv')
sNew_LEDD = os.path.join(sRoot, 'index_files', 'PDBP_LEDD.csv')
pdNew_LEDD = pd.read_csv(sNew_LEDD, index_col=0)
#Then, the Metadata for the new LEDD data
......@@ -47,7 +48,7 @@ def fGetData():
pdMetadata=pd.read_csv(sMetadata)
# then, the demographics metadata:
with open('./PDBP_Demographics_Paths.json', 'rb') as f:
with open(os.path.join(sRoot, 'configs', 'PDBP_Demographics_Paths.json'), 'rb') as f:
dFiles=json.load(f)
pdDemographics = pd.concat([pd.read_csv(x) for x in dFiles.values()])
pdDemographics.set_index('Demographics.Required Fields.GUID', inplace=True)
......@@ -63,7 +64,7 @@ def fGetData():
]]
# numerically encode ethnicity and Female
with open('./PDBP_Ethnicity_Codes.json', 'rb') as f:
with open(os.path.join(sRoot, 'configs', 'PDBP_Ethnicity_Codes.json'), 'rb') as f:
dEthnicities=json.load(f)
pdDemographics['Ethnicity'] = [dEthnicities[x] for x in pdDemographics['Demographics.Demographics.RaceExpndCatPDBP']]
pdDemographics.loc[pdDemographics['Demographics.Demographics.EthnUSACat']=='Hispanic or Latino', 'Ethnicity'] = 'Latino'
......@@ -83,7 +84,6 @@ def fGetData():
return pdLME_Prepped_Data, pdNew_LEDD, pdUTSW_LEDD, pdMetadata, pdDemographics
def fReformatData(pdLME_Prepped_Data, pdNew_LEDD, pdUTSW_LEDD, pdMetadata, pdDemographics, bTotalUPDRS=False):
# flatten out Table
pdNew_LEDD['GUID'] = pdNew_LEDD.index
pdNew_LEDD = pdNew_LEDD.melt(['GUID','Site'], var_name='Visit', value_name='LEDD')
......@@ -170,7 +170,7 @@ def fReformatData(pdLME_Prepped_Data, pdNew_LEDD, pdUTSW_LEDD, pdMetadata, pdDem
# assign ID to each subject
pdData['GUID'] = pdData.index
pdData['ID'] = pdData['GUID'].astype('category').cat.codes + 1 + pdLME_Prepped_Data['ID'].max()
# select out proper columns
pdData = pdData[['ID', 'Age', 'First_Score', 'First_LEDD', 'Months', 'Female',\
'Ethnicity', 'Total.Score', 'LEDD', 'LEDD_2', 'PDQ39', 'MoCA', 'Schwab',\
......@@ -186,7 +186,7 @@ def fReformatData(pdLME_Prepped_Data, pdNew_LEDD, pdUTSW_LEDD, pdMetadata, pdDem
pdData['Age'] = pdData['Age2']
dFirstLEDD = pdData.sort_values(by='Months').groupby(pdData.index).first()['LEDD'].to_dict()
pdData['First_LEDD'] = [dFirstLEDD[x] if x in dFirstLEDD.keys() else np.nan for x in pdData.index]
# z-score necessary columns
for sCol in ['First_LEDD', 'Age', 'GCO']:
pdData[sCol] = (pdData[sCol] - pdData[sCol].mean())/pdData[sCol].std(ddof=0)
......@@ -201,6 +201,6 @@ if __name__ == "__main__":
pdLME_Formatted_Data = fReformatData(pdLME_Prepped_Data, pdNew_LEDD, pdUTSW_LEDD, pdMetadata, pdDemographics)
# save trainable data
sRoot = os.getcwd()
sRoot = Path(os.getcwd()).parent
pdLME_Formatted_Data.dropna(subset=['Total.Score', 'LEDD']).sort_values(by=['GUID', 'Months'])\
.to_csv(os.path.join(sRoot, 'PPMI_Metadata', 'PDBP_Prepped_For_LME.csv'))
.to_csv(os.path.join(sRoot, 'index_files', 'PDBP_Prepped_For_LME.csv'))
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment