Merge develop into align branch

7c677cf1 · Gervaise Henry · fa22c087 · 1c3a166b · 7c677cf1 · 7c677cf1
Commit 7c677cf1 authored 5 years ago by Gervaise Henry
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -26,6 +26,15 @@ getData:
  - singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' sh ./workflow/scripts/bdbagFetch.sh Replicate_16-1ZX4 16-1ZX4
  - pytest -m getData

+parseMetadata:
+  stage: unit
+  script:
+  - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID
+  - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p ends
+  - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual
+  - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded
+  - singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p specie
+
 trimData:
  stage: unit
  script:

--- a/workflow/conf/aws_ondemand.config
+++ b/workflow/conf/aws_ondemand.config
@@ -13,14 +13,7 @@ process {
  cpus = 1
  memory = '1 GB'

-  withName:getBag {
-    container = 'bicf/gudmaprbkfilexfer:1.3'
-  }
-  withName:getData {
-    container = 'bicf/gudmaprbkfilexfer:1.3'
-  }
  withName:trimData {
-    container = 'bicf/trimgalore:1.1'
    cpus = 15
  }
 }
\ No newline at end of file
--- a/workflow/conf/aws_spot.config
+++ b/workflow/conf/aws_spot.config
@@ -13,14 +13,7 @@ process {
  cpus = 1
  memory = '1 GB'

-  withName:getBag {
-    container = 'bicf/gudmaprbkfilexfer:1.3'
-  }
-  withName:getData {
-    container = 'bicf/gudmaprbkfilexfer:1.3'
-  }
  withName:trimData {
-    container = 'bicf/trimgalore:1.1'
    cpus = 15
  }
 }
--- a/workflow/conf/biohpc.config
+++ b/workflow/conf/biohpc.config
@@ -9,6 +9,9 @@ process {
  withName:getData {
    executor = 'local'
  }
+  withName:parseMetadata {
+    executor = 'local'
+  }
  withName:trimData {
    queue = '256GB,256GBv1,384GB'
  }

--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@@ -17,6 +17,9 @@ process {
  withName:getData {
    container = 'bicf/gudmaprbkfilexfer:1.3'
  }
+  withName:parseMetadata {
+    container = 'bicf/python3:1.3'
+  }
  withName:trimData {
    container = 'bicf/trimgalore:1.1'
  }

--- a/workflow/rna-seq.nf
+++ b/workflow/rna-seq.nf
 #!/usr/bin/env nextflow

 // Define input variables
-params.deriva = "${baseDir}/../test_data/credential.json"
-params.bdbag = "${baseDir}/../test_data/cookies.txt"
+params.deriva = "${baseDir}/../test_data/auth/credential.json"
+params.bdbag = "${baseDir}/../test_data/auth/cookies.txt"
 //params.repRID = "16-1ZX4"
 params.repRID = "Q-Y5JA"
-params.spikein = "false"
-params.reference = "mouse"

 params.outDir = "${baseDir}/../output"

@@ -17,60 +15,23 @@ deriva = Channel
 bdbag = Channel
  .fromPath(params.bdbag)
  .ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" }
-
 repRID = params.repRID
 outDir = params.outDir
 logsDir = "${outDir}/Logs"
-reference = params.reference
-
-if (params.spikein) {
-  if (params.reference == "human") {
-    reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12-S/hisat2")
-  } else if (params.reference == "mouse") {
-    reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6-S/hisat2")
-  } 
-} else if (params.reference == "mouse") {
-  reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2")
-} else if (params.reference == "human") {
-  reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12/hisat2")
-} else {
-  print ("Warning: Reference genome not specified, defaulting to GRCm38.P6 with NO spike-in")
-  reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2")
-}

 // Define fixed files
 derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")

-/*
- * Pass in programs from the scripts directory, and any default files for later
- */
-
-
-/*
- *Checking the species and spike-in status
- */
-if (params.spikein) {
-  if (params.species == "human") {
-    reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12-S/hisat2")
-  } else if (params.species == "mouse") {
-    reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6-S/hisat2")
-  } 
-} else if (params.species == "mouse") {
-  reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2")
-} else if (params.species == "human") {
-  reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12/hisat2")
-} else {
-  print ("Warning: Reference genome not specified, defaulting to GRCm38.P6 with NO spike-in")
-  reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2")
-}
+// Define script files
+script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
+script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py")

 /*
 * splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid
 */
 process getBag {
-  executor 'local'
  tag "${repRID}"
-  publishDir "${logsDir}/getBag", mode: 'copy', pattern: "${repRID}.getBag.err"
+  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.getBag.err"

  input:
    path credential, stageAs: 'credential.json' from deriva
@@ -85,8 +46,12 @@ process getBag {
    hostname >>${repRID}.getBag.err
    ulimit -a >>${repRID}.getBag.err
    export https_proxy=\${http_proxy}
+
+    # link credential file for authentication
    ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID}.getBag.err
    echo "LOG: deriva credentials linked" >>${repRID}.getBag.err
+
+    # deriva-download replicate RID
    deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID} 2>>${repRID}.getBag.err
    """
 }
@@ -96,10 +61,10 @@ process getBag {
 */
 process getData {
  tag "${repRID}"
-  publishDir "${logsDir}/getData", mode: 'copy', pattern: "${repRID}.getData.err"
+  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.getData.err"

  input:
-    executor 'local'
+    path script_bdbagFetch
    path cookies, stageAs: 'deriva-cookies.txt' from bdbag
    path bagit

@@ -110,49 +75,149 @@ process getData {
    file("**/Experiment.csv") into experimentMeta
    file ("${repRID}.getData.err")

-
  script:
    """
    hostname >>${repRID}.getData.err
    ulimit -a >>${repRID}.getData.err
    export https_proxy=\${http_proxy}
+    
+    # link deriva cookie for authentication
    ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt >>${repRID}.getData.err
    echo "LOG: deriva cookie linked" >>${repRID}.getData.err
+    
+    # get bagit basename
    replicate=\$(basename "${bagit}" | cut -d '.' -f1)
    echo "LOG: \${replicate}" >>${repRID}.getData.err
+    
+    # unzip bagit
    unzip ${bagit} 2>>${repRID}.getData.err
    echo "LOG: replicate bdbag unzipped" >>${repRID}.getData.err
-    sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} ${repRID} 2>>${repRID}.getData.err
+    
+    # bagit fetch fastq's only and rename by repRID
+    sh ${script_bdbagFetch} \${replicate} ${repRID} 2>>${repRID}.getData.err
    echo "LOG: replicate bdbag fetched" >>${repRID}.getData.err
    """
 }

+/*
+ * parseMetadata: parses metadata to extract experiment parameters
+*/
+process parseMetadata {
+  tag "${repRID}"
+  publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.parseMetadata.err"
+
+  input:
+    path script_parseMeta
+    val repRID
+    path fileMeta
+    path experimentSettingsMeta
+    path experimentMeta
+
+  output:
+    path 'design.csv' into metadata
+
+  script:
+    """
+    hostname >>${repRID}.parseMetadata.err
+    ulimit -a >>${repRID}.parseMetadata.err
+
+    # Check replicate RID metadata
+    rep=\$(python3 ${script_parseMeta} -r ${repRID} -m "${fileMeta}" -p repRID)
+    echo "LOG: replicate RID metadata parsed: \${rep}" >>${repRID}.parseMetadata.err
+    
+    # Get endedness metadata
+    endsMeta=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p endsMeta)
+    echo "LOG: endedness metadata parsed: \${endsMeta}" >>${repRID}.parseMetadata.err
+    
+    # Manually get endness
+    endsManual=\$(python3 ${script_parseMeta} -r ${repRID} -m "${fileMeta}" -p endsManual)
+    echo "LOG: endedness manually detected: \${endsManual}" >>${repRID}.parseMetadata.err
+
+    # Get strandedness metadata
+    stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p stranded)
+    echo "LOG: strandedness metadata parsed: \${stranded}" >>${repRID}.parseMetadata.err
+    
+    # Get spike-in metadata
+    spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p spike)
+    echo "LOG: spike-in metadata parsed: \${spike}" >>${repRID}.parseMetadata.err
+    
+    # Get species metadata
+    species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentMeta}" -p species)
+    echo "LOG: species metadata parsed: \${species}" >>${repRID}.parseMetadata.err
+
+    # Save design file
+    echo "\${rep},\${endsMeta},\${endsManual},\${stranded},\${spike},\${species}" > design.csv
+    """
+}
+
+// Split metadata into separate channels
+rep = Channel.create()
+endsMeta = Channel.create()
+endsManual = Channel.create()
+stranded = Channel.create()
+spike = Channel.create()
+species = Channel.create()
+metadata.splitCsv(sep: ',', header: false).separate(
+  rep,
+  endsMeta,
+  endsManual,
+  stranded,
+  spike,
+  species
+)
+
+// Exit with no species
+if (species == "") {
+  print ("ERROR: Reference genome not specified")
+  exit 1
+}
+
+// Setting references
+if (spike) {
+  if (species == "Homo sapiens") {
+    reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12-S/hisat2")
+  } else if (species == "Mus musculus") {
+    reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6-S/hisat2")
+  }
+} else {
+  if (species == "Homo sapiens") {
+    reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12/hisat2")
+  } else if (species == "Mus musculus") {
+    reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2")
+  }
+}
+
+
+
+
+
 /*
 * trimData: trims any adapter or non-host sequences from the data
 */
 process trimData {
  tag "${repRID}"
-  publishDir "${logsDir}/trimData", mode: 'copy', pattern: "\${repRID}.trimData.*"
+  publishDir "${logsDir}", mode: 'copy', pattern: "\${repRID}.trimData.*"

  input:
    file(fastq) from fastqs

  output:
-    tuple file ("params.csv"), file ("*.fq.gz") into fastqs_trimmed
+    path ("*.fq.gz") into fastqs_trimmed
    file ("${repRID}.trimData.log")
    file ("${repRID}.trimData.err")

  script:
    """
-    if [ '${fastq[1]}' == 'null' ]
+    hostname >>${repRID}.trimData.err
+    ulimit -a >>${repRID}.trimData.err
+
+    # trim fastqs
+    if [ '${endsManual}' == 'se' ]
    then
-      ends='se'
      trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err;
    else
-      ends='pe'
      trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err;
    fi
-    echo \${ends} > params.csv;
    """
 }

@@ -161,23 +226,21 @@ process trimData {
 */
 process alignReads {
  tag "align-${repRID}"
-  publishDir "${outDir}/tempOut/aligned", mode: "copy"
+  publishDir "${outDir}/aligned", mode: "copy"

  input:
-    set ends, fqs from fastqs_trimmed
-    file reference
+    path fqs from fastqs_trimmed

  output:
    set repRID, file ("${repRID}.unal.gz"), file ("${repRID}.bam"), file ("${repRID}.bai")

  script:
    """
-    ends=`cat ${ends}`;
-    if [ "\${ends}" == 'pe' ]; then
+    if [ "${endsManual}" == 'pe' ]; then
    hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${reference}/genome -1 ${fqs[0]} -2 ${fqs[1]} 1>${repRID}.align.out 2> ${repRID}.align.err;
    else hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${reference}/genome -U ${fqs[0]} 1>${repRID}.align.out 2> ${repRID}.align.err;
    fi;
    samtools view -1 --threads `nproc` -o ${repRID}.bam ${repRID}.sam 1>${repRID}.align.out 2> ${repRID}.align.err;
    samtools sort -@ `nproc` -O BAM ${repRID}.bam 1>${repRID}.align.out 2> ${repRID}.align.err;
    """
-}
+}
\ No newline at end of file
--- a/workflow/scripts/modifyFetch.py
+++ b/workflow/scripts/modifyFetch.py
@@ -6,20 +6,20 @@ import re

 def get_args():
    parser = argparse.ArgumentParser()
-    parser.add_argument('-f', '--fetchFile',help="The fetch file from bdgap.zip.",required=True)
+    parser.add_argument('-f', '--files',help="The fetch file from bdgap.zip.",required=True)
    args = parser.parse_args()
    return args

 def main():
    args = get_args()
-    fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None)
-    fileFile = pd.read_csv(args.fetchFile+"/data/File.csv",sep=",",header=0)
+    fetchFile = pd.read_csv(args.files+"/fetch.txt",sep="\t",header=None)
+    fileFile = pd.read_csv(args.files+"/data/File.csv",sep=",",header=0)
    fileFile_filtered = fileFile[fileFile["File_Type"]=="FastQ"]
    fetchFile_filtered = fetchFile[fetchFile[2].str[-9:]==".fastq.gz"]
    fetchFile_filtered_renamed = fetchFile_filtered
    for i in fileFile_filtered["File_Name"]:
        fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i,regex=False)] = fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i,regex=False)].values[0].replace(re.sub("\.R.\.fastq\.gz","",i),fileFile_filtered["Replicate_RID"][fileFile_filtered["File_Name"]==i].values[0])
-    fetchFile_filtered_renamed.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False)
+    fetchFile_filtered_renamed.to_csv(args.files+"/fetch.txt",sep="\t",header=False,index=False)

 if __name__ == '__main__':
    main()
\ No newline at end of file
--- a/workflow/scripts/parseMeta.py
+++ b/workflow/scripts/parseMeta.py
+#!/usr/bin/env python3
+
+import argparse
+import pandas as pd
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', '--repRID',help="The replicate RID.",required=True)
+    parser.add_argument('-m', '--metaFile',help="The metadata file to extract.",required=True)
+    parser.add_argument('-p', '--parameter',help="The parameter to extract.",required=True)
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+    metaFile = pd.read_csv(args.metaFile,sep=",",header=0)
+
+    # Check replicate RID metadata from 'File.csv'
+    if (args.parameter == "repRID"):
+        if (len(metaFile.Replicate_RID.unique()) > 1):
+            print("There are multiple replicate RID's in the metadata: " + " ".join(metaFile.Replicate_RID.unique()))
+            exit(1)
+        if not (metaFile.Replicate_RID.unique() == args.repRID):
+            print("Replicate RID in metadata does not match run parameters: " + metaFile.Replicate_RID.unique() + " vs " + args.repRID)
+            exit(1)
+        else:
+            rep=metaFile["Replicate_RID"].unique()[0]
+            print(rep)
+        if (len(metaFile[metaFile["File_Type"] == "FastQ"]) > 2):
+            print("There are more then 2 fastq's in the metadata: " + " ".join(metaFile[metaFile["File_Type"] == "FastQ"].RID))
+            exit(1)
+    
+    # Get endedness metadata from 'Experiment Settings.csv'
+    if (args.parameter == "endsMeta"):
+        if (metaFile.Paired_End.unique() == "Single End"):
+            endsMeta = "se"
+        elif (metaFile.Paired_End.unique() == "Paired End"):
+            endsMeta = "pe"
+        else:
+            endsMeta = "uk"
+        print(endsMeta)
+    
+    # Manually get endness count from 'File.csv'
+    if (args.parameter == "endsManual"):
+        if (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 1):
+            endsManual = "se"
+        elif (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 2):
+            endsManual = "pe"
+        print(endsManual)
+    
+    # Get strandedness metadata from 'Experiment Settings.csv'
+    if (args.parameter == "stranded"):
+        if (metaFile.Has_Strand_Specific_Information.unique() == "yes"):
+            stranded = "stranded"
+        elif (metaFile.Has_Strand_Specific_Information.unique() == "no"):
+            stranded = "unstranded"
+        else:
+            print("Stranded metadata not match expected options: " + metaFile.Has_Strand_Specific_Information.unique())
+            exit(1)
+        print(stranded)
+    
+    # Get spike-in metadata from 'Experiment Settings.csv'
+    if (args.parameter == "spike"):
+        if (metaFile.Used_Spike_Ins.unique() == "yes"):
+            spike = "yes"
+        elif (metaFile.Used_Spike_Ins.unique() == "no"):
+            spike = "no"
+        else:
+            print("Spike-ins metadata not match expected options: " + metaFile.Used_Spike_Ins.unique())
+            exit(1)
+        print(spike)
+
+    # Get species metadata from 'Experiment.csv'
+    if (args.parameter == "species"):
+        if (metaFile.Species.unique() == "Mus musculus"):
+            species = "Mus musculus"
+        elif (metaFile.Species.unique() == "Homo sapiens"):
+            species = "Homo sapiens"
+        else:
+            print("Species metadata not match expected options: " + metaFile.Species.unique())
+            exit(1)
+        print(species)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
--- a/workflow/scripts/splitFetch.py
+++ b/workflow/scripts/splitFetch.py
@@ -6,14 +6,14 @@ import os

 def get_args():
    parser = argparse.ArgumentParser()
-    parser.add_argument('-f', '--fetchFile',help="The fetch file from bdgap.zip.",required=True)
+    parser.add_argument('-f', '--files',help="The fetch file from bdgap.zip.",required=True)
    args = parser.parse_args()
    return args

 def main():
    args = get_args()
-    fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None)
-    fileFile = pd.read_csv(args.fetchFile+"/data/File.csv",sep=",",header=0)
+    fetchFile = pd.read_csv(args.files+"/fetch.txt",sep="\t",header=None)
+    fileFile = pd.read_csv(args.files+"/data/File.csv",sep=",",header=0)
    replicateRID = fileFile.Replicate_RID.unique()
    fetchArray = {i:fileFile.URI[(fileFile.Replicate_RID == i) & (fileFile.File_Type == "FastQ")] for i in replicateRID}
    for i in replicateRID:

--- a/workflow/tests/test_trimData.py
+++ b/workflow/tests/test_trimData.py
@@ -9,7 +9,10 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
                '/../../'

 @pytest.mark.trimData
-def test_trimData():
+def test_trimData_se():
    assert os.path.exists(os.path.join(test_output_path, '16-1ZX4_trimmed.fq.gz'))
+
+@pytest.mark.trimData
+def test_trimData_pe():
    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R1_val_1.fq.gz'))
    assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R2_val_2.fq.gz'))
\ No newline at end of file