Commit 474a1d3e authored by Venkat Malladi

Merge branch '108-samtools.mem' into 'develop'

Resolve "samtools sort: couldn't allocate memory for bam_mem"

Closes #109, #108, #107, #106, and #105

See merge request !64
parents adf2b8be e1d95064
Pipeline #9072 passed with stages in 4 minutes and 31 seconds
......@@ -109,7 +109,7 @@ parseMetadata:
- study=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p studyRID)
- endsRaw=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsMeta)
- endsMeta="uk"
- endsManual=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual)
- endsManual="se"
- stranded=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded)
- spike=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p spike)
- species=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species)
......@@ -750,6 +750,36 @@ failMismatchR1R2:
when:
- always
failUnexpectedMeta:
stage: integration
only: [merge_requests]
except:
variables:
- $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/
script:
- hostname
- ulimit -a
- nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 14-3R4R --source staging --upload true -with-dag dag.png --dev false --ci true
retry:
max: 0
when:
- always
failFileStructure:
stage: integration
only: [merge_requests]
except:
variables:
- $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/
script:
- hostname
- ulimit -a
- nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5HT --source staging --upload true -with-dag dag.png --dev false --ci true
retry:
max: 0
when:
- always
override_inputBag:
stage: integration
only: [merge_requests]
......
# v1.0.3 (in development)
**User Facing**
**Background**
* Add memory limit (75%) per thread for samtools sort (#108; see the sketch after this list)
* Remove parsing restrictions for submitted stranded/spike/species (#105, #106)
* Pass unidentified ends through instead of overwriting them as unknown
* Move fastqc process before trim to catch fastq errors (#107)
* Only use fastq's that match *[_.]R[1-2].fastq.gz naming convention (#107)
* Add error output when no fastq's are found
* Update input bag export config to only fetch fastq's that match *[_.]R[1-2].fastq.gz naming convention
* Remove multiple-fastq check from parse metadata (redundant and no longer valid)
* Handle blank submitted endedness better
* Don't use File.csv from the inputBag to parse manual endedness; use the count from getData
* Detect malformed fastq's (#107)
* Restrict sampled alignment process to use >32GB nodes on BioHPC (#108)
* Use nproc-1 for alignment processes (#108)
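A minimal standalone sketch of the #108 memory cap (assumes a Linux host with vmstat; input.bam/sorted.bam are placeholder names — the pipeline computes the same values inline before each samtools sort):

#!/bin/bash
# reserve one core for the system, then give each sort thread an even share
# of total RAM, scaled down to 75% for headroom
proc=$(( $(nproc) - 1 ))
mem=$(vmstat -s -S K | grep 'total memory' | grep -o '[0-9]*')  # total RAM in KiB
mem=$(( mem / proc * 75 / 100 ))                                # per-thread cap in KiB
samtools sort -@ "${proc}" -m "${mem}K" -O BAM -o sorted.bam input.bam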
*Known Bugs*
* Override params (inputBag, fastq, species) aren't checked for integrity
* Authentication files and tokens must be active (active auth client) for the duration of the pipeline run (until long-lived token utilization included)
<hr>
# v1.0.2
**User Facing**
......
docs/dag.png (binary image): replaced, 4.57 MiB → 5.27 MiB
......@@ -89,7 +89,7 @@
"processor": "fetch",
"processor_params": {
"output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/File_Type=FastQ/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none"
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/File_Type=FastQ/File_Name::ciregexp::%5B_.%5DR%5B12%5D%5C.fastq%5C.gz/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none"
}
}
]
......
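For reference, the URL-encoded ciregexp filter added above decodes to the case-insensitive pattern [_.]R[12]\.fastq\.gz, the same naming convention enforced elsewhere in this merge request. A rough bash sketch of what it keeps (file names are hypothetical; grep -i stands in for ERMrest's case-insensitive matching):

# prints: keep sample_R1.fastq.gz / keep sample.r2.fastq.gz / drop sample_R3.fastq.gz / drop notes.txt
for f in sample_R1.fastq.gz sample.r2.fastq.gz sample_R3.fastq.gz notes.txt
do
    echo "${f}" | grep -qiE '[_.]R[12]\.fastq\.gz' && echo "keep ${f}" || echo "drop ${f}"
done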
......@@ -116,6 +116,10 @@ process {
cpus = 1
memory = '1 GB'
}
withName:failPreExecutionRun_fastqFile {
cpus = 1
memory = '1 GB'
}
withName:failPreExecutionRun_species {
{
cpus = 1
......
......@@ -32,7 +32,7 @@ process {
executor = 'local'
}
withName:alignSampleData {
queue = 'super'
queue = '128GB,256GB,256GBv1,384GB'
}
withName:inferMetadata {
queue = 'super'
......@@ -85,6 +85,9 @@ process {
withName:failPreExecutionRun_fastq {
executor = 'local'
}
withName:failPreExecutionRun_fastqFile {
executor = 'local'
}
withName:failPreExecutionRun_species {
executor = 'local'
}
......
......@@ -91,6 +91,9 @@ process {
withName:failPreExecutionRun_fastq {
container = 'gudmaprbk/deriva1.4:1.0.0'
}
withName:failPreExecutionRun_fastqFile {
container = 'gudmaprbk/deriva1.4:1.0.0'
}
withName:failPreExecutionRun_species {
container = 'gudmaprbk/deriva1.4:1.0.0'
}
......@@ -125,6 +128,6 @@ manifest {
homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.'
mainScript = 'rna-seq.nf'
version = 'v1.0.2'
version = 'v1.0.3'
nextflowVersion = '>=19.09.0'
}
......@@ -48,6 +48,7 @@ deriva.into {
deriva_uploadOutputBag
deriva_finalizeExecutionRun
deriva_failPreExecutionRun_fastq
deriva_failPreExecutionRun_fastqFile
deriva_failPreExecutionRun_species
deriva_failExecutionRun
}
......@@ -100,6 +101,7 @@ script_uploadInputBag = Channel.fromPath("${baseDir}/scripts/upload_input_bag.py
script_uploadExecutionRun_uploadExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py")
script_uploadExecutionRun_finalizeExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py")
script_uploadExecutionRun_failPreExecutionRun_fastq = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py")
script_uploadExecutionRun_failPreExecutionRun_fastqFile = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py")
script_uploadExecutionRun_failPreExecutionRun_species = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py")
script_uploadExecutionRun_failExecutionRun = Channel.fromPath("${baseDir}/scripts/upload_execution_run.py")
script_uploadQC = Channel.fromPath("${baseDir}/scripts/upload_qc.py")
......@@ -267,6 +269,10 @@ process getData {
echo -e "LOG: fetched" >> ${repRID}.getData.log
fastqCount=\$(ls *.fastq.gz | wc -l)
if [ "\${fastqCount}" == "0" ]
then
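# no fastq matched the naming convention: create an empty placeholder so the
# fastq output channel is not empty; the recorded count of 0 still raises
# fastqCountError downstream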
touch dummy.R1.fastq.gz
fi
echo "\${fastqCount}" > fastqCount.csv
"""
}
......@@ -284,12 +290,12 @@ if (fastqsForce != "") {
.ifEmpty { exit 1, "override fastq file not found: ${fastqsForce}" }
.collect().into {
fastqs_parseMetadata
fastqs_trimData
fastqs_fastqc
}
} else {
fastqs.into {
fastqs.collect().into {
fastqs_parseMetadata
fastqs_trimData
fastqs_fastqc
}
}
......@@ -304,7 +310,7 @@ process parseMetadata {
path file from fileMeta
path experimentSettings, stageAs: "ExperimentSettings.csv" from experimentSettingsMeta
path experiment from experimentMeta
path (fastq) from fastqs_parseMetadata
path (fastq) from fastqs_parseMetadata.collect()
val fastqCount
output:
......@@ -337,17 +343,20 @@ process parseMetadata {
elif [ "\${endsRaw}" == "Paired End" ]
then
endsMeta="pe"
else
endsMeta="unknown"
fi
if [ "\${endsRaw}" == "" ]
elif [ "\${endsRaw}" == "nan" ]
then
endsRaw="_No value_"
endsMeta="NA"
fi
# manually get endedness
endsManual=\$(python3 ${script_parseMeta} -r ${repRID} -m "${file}" -p endsManual)
echo -e "LOG: endedness manually detected: \${endsManual}" >> ${repRID}.parseMetadata.log
if [ "${fastqCount}" == "1" ]
then
endsManual="se"
else
endsManual="pe"
fi
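# (one fastq that matched the naming convention implies single-end, two imply paired-end; any other count is flagged as fastqCountError below)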
echo -e "LOG: endedness manually detected: ${fastqCount}" >> ${repRID}.parseMetadata.log
# get strandedness metadata
stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettings}" -p stranded)
......@@ -376,6 +385,10 @@ process parseMetadata {
then
fastqCountError=true
fastqCountError_details="**Too many fastqs detected (>2)**"
elif [ "${fastqCount}" -eq "0" ]
then
fastqCountError=true
fastqCountError_details="**No valid fastqs detected \\(may not match .R{12}.fastq.gz convention\\)**"
elif [ "\${endsMeta}" == "se" ] && [ "${fastqCount}" -ne "1" ]
then
fastqCountError=true
......@@ -451,6 +464,7 @@ spikeMeta.into {
spikeMeta_checkMetadata
spikeMeta_aggrQC
spikeMeta_failPreExecutionRun_fastq
spikeMeta_failPreExecutionRun_fastqFile
spikeMeta_failPreExecutionRun_species
spikeMeta_failExecutionRun
}
......@@ -458,6 +472,7 @@ speciesMeta.into {
speciesMeta_checkMetadata
speciesMeta_aggrQC
speciesMeta_failPreExecutionRun_fastq
speciesMeta_failPreExecutionRun_fastqFile
speciesMeta_failPreExecutionRun_species
speciesMeta_failExecutionRun
}
......@@ -486,6 +501,7 @@ fastqError_fl.splitCsv(sep: ",", header: false).separate(
// Replicate errors for multiple process inputs
fastqCountError.into {
fastqCountError_fastqc
fastqCountError_trimData
fastqCountError_getRefInfer
fastqCountError_downsampleData
......@@ -498,7 +514,6 @@ fastqCountError.into {
fastqCountError_dedupData
fastqCountError_makeBigWig
fastqCountError_countData
fastqCountError_fastqc
fastqCountError_dataQC
fastqCountError_aggrQC
fastqCountError_uploadQC
......@@ -507,6 +522,7 @@ fastqCountError.into {
fastqCountError_failPreExecutionRun_fastq
}
fastqReadError.into {
fastqReadError_fastqc
fastqReadError_trimData
fastqReadError_getRefInfer
fastqReadError_downsampleData
......@@ -519,7 +535,6 @@ fastqReadError.into {
fastqReadError_dedupData
fastqReadError_makeBigWig
fastqReadError_countData
fastqReadError_fastqc
fastqReadError_dataQC
fastqReadError_aggrQC
fastqReadError_uploadQC
......@@ -528,6 +543,98 @@ fastqReadError.into {
fastqReadError_failPreExecutionRun_fastq
}
/*
*fastqc: run fastqc on untrimmed fastq's
*/
process fastqc {
tag "${repRID}"
input:
path (fastq) from fastqs_fastqc.collect()
val fastqCountError_fastqc
val fastqReadError_fastqc
output:
path ("*.R{1,2}.fastq.gz", includeInputs:true) into fastqs_trimData
path ("*_fastqc.zip") into fastqc
path ("rawReads.csv") into rawReadsInfer_fl
path "fastqFileError.csv" into fastqFileError_fl
when:
fastqCountError_fastqc == 'false' && fastqReadError_fastqc == 'false'
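// note: the error flags arrive as the strings 'true'/'false' parsed from the error CSVs, hence the string comparison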
script:
"""
hostname > ${repRID}.fastqc.log
ulimit -a >> ${repRID}.fastqc.log
# run fastqc
echo -e "LOG: running fastq on raw fastqs" >> ${repRID}.fastqc.log
fastqc *.fastq.gz -o . &> fastqc.out || true
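# a malformed fastq makes FastQC print 'Failed to process file'; '|| true' keeps
# the task alive so the error is recorded in fastqFileError.csv instead of killing the run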
fastqcErrorOut=\$(cat fastqc.out | grep -c 'Failed to process file') || fastqcErrorOut=0
fastqFileError=false
fastqFileError_details=""
if [ "\${fastqcErrorOut}" -ne "0" ]
then
fastqFileError=true
fastqFileError_details="**There is an error with the structure of the fastq**"
echo -e "LOG: There is an error with the structure of the fastq" >> ${repRID}.fastqc.log
touch dummy_fastqc.zip
else
echo -e "LOG: The structure of the fastq is correct" >> ${repRID}.fastqc.log
fi
# count raw reads
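# (a fastq record is 4 lines, so reads = line count / 4; counting R1 only counts each read/pair once)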
zcat *.R1.fastq.gz | echo \$((`wc -l`/4)) > rawReads.csv
# save fastq error file
echo "\${fastqFileError},\${fastqFileError_details}" > fastqFileError.csv
"""
}
// Extract number of raw reads metadata into channel
rawReadsInfer = Channel.create()
rawReadsInfer_fl.splitCsv(sep: ",", header: false).separate(
rawReadsInfer
)
// Replicate inferred raw reads for multiple process inputs
rawReadsInfer.into {
rawReadsInfer_aggrQC
rawReadsInfer_uploadQC
}
// Split fastq count error into separate channel
fastqFileError = Channel.create()
fastqFileError_details = Channel.create()
fastqFileError_fl.splitCsv(sep: ",", header: false).separate(
fastqFileError,
fastqFileError_details
)
// Replicate errors for multiple process inputs
fastqFileError.into {
fastqFileError_fastqc
fastqFileError_trimData
fastqFileError_getRefInfer
fastqFileError_downsampleData
fastqFileError_alignSampleData
fastqFileError_inferMetadata
fastqFileError_checkMetadata
fastqFileError_uploadExecutionRun
fastqFileError_getRef
fastqFileError_alignData
fastqFileError_dedupData
fastqFileError_makeBigWig
fastqFileError_countData
fastqFileError_dataQC
fastqFileError_aggrQC
fastqFileError_uploadQC
fastqFileError_uploadProcessedFile
fastqFileError_uploadOutputBag
fastqFileError_failPreExecutionRun_fastqFile
}
/*
* trimData: trims any adapter or non-host sequences from the data
*/
......@@ -539,16 +646,17 @@ process trimData {
val ends from endsManual_trimData
val fastqCountError_trimData
val fastqReadError_trimData
val fastqFileError_trimData
output:
path ("*.fq.gz") into fastqsTrim
path ("*.fastq.gz", includeInputs:true) into fastqs_fastqc
path ("*_trimming_report.txt") into trimQC
path ("readLength.csv") into readLengthInfer_fl
when:
fastqCountError_trimData == "false"
fastqReadError_trimData == "false"
fastqFileError_trimData == "false"
script:
"""
......@@ -592,7 +700,7 @@ fastqsTrim.into {
}
// Combine inputs of getRefInfer
getRefInferInput = referenceInfer.combine(deriva_getRefInfer.combine(script_refDataInfer.combine(fastqCountError_getRefInfer.combine(fastqReadError_getRefInfer))))
getRefInferInput = referenceInfer.combine(deriva_getRefInfer.combine(script_refDataInfer.combine(fastqCountError_getRefInfer.combine(fastqReadError_getRefInfer.combine(fastqFileError_getRefInfer)))))
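// the nested combine() calls fan the per-run error flags onto every reference tuple, so each getRefInfer task can gate on them in its when: block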
/*
* getRefInfer: downloads appropriate reference for metadata inference
......@@ -601,7 +709,7 @@ process getRefInfer {
tag "${refName}"
input:
tuple val (refName), path (credential, stageAs: "credential.json"), path (script_refDataInfer), val (fastqCountError), val (fastqReadError) from getRefInferInput
tuple val (refName), path (credential, stageAs: "credential.json"), path (script_refDataInfer), val (fastqCountError), val (fastqReadError), val (fastqFileError) from getRefInferInput
output:
tuple val (refName), path ("hisat2", type: 'dir'), path ("*.fna"), path ("*.gtf") into refInfer
......@@ -610,6 +718,7 @@ process getRefInfer {
when:
fastqCountError == "false"
fastqReadError == "false"
fastqFileError == "false"
script:
"""
......@@ -687,6 +796,7 @@ process downsampleData {
val ends from endsManual_downsampleData
val fastqCountError_downsampleData
val fastqReadError_downsampleData
val fastqFileError_downsampleData
output:
path ("sampled.1.fq") into fastqs1Sample
......@@ -695,6 +805,7 @@ process downsampleData {
when:
fastqCountError_downsampleData == "false"
fastqReadError_downsampleData == "false"
fastqFileError_downsampleData == "false"
script:
"""
......@@ -718,7 +829,7 @@ process downsampleData {
}
// Replicate the downsampled fastq's and attach them to the references
inferInput = endsManual_alignSampleData.combine(refInfer.combine(fastqs1Sample.collect().combine(fastqs2Sample.collect().combine(fastqCountError_alignSampleData.combine(fastqReadError_alignSampleData)))))
inferInput = endsManual_alignSampleData.combine(refInfer.combine(fastqs1Sample.collect().combine(fastqs2Sample.collect().combine(fastqCountError_alignSampleData.combine(fastqReadError_alignSampleData.combine(fastqFileError_alignSampleData))))))
/*
* alignSampleData: aligns the downsampled reads to a reference database
......@@ -727,7 +838,7 @@ process alignSampleData {
tag "${ref}"
input:
tuple val (ends), val (ref), path (hisat2), path (fna), path (gtf), path (fastq1), path (fastq2), val (fastqCountError), val (fastqReadError) from inferInput
tuple val (ends), val (ref), path (hisat2), path (fna), path (gtf), path (fastq1), path (fastq2), val (fastqCountError), val (fastqReadError), val (fastqFileError) from inferInput
output:
path ("${ref}.sampled.sorted.bam") into sampleBam
......@@ -737,6 +848,7 @@ process alignSampleData {
when:
fastqCountError == "false"
fastqReadError == "false"
fastqFileError == "false"
script:
"""
......@@ -761,7 +873,10 @@ process alignSampleData {
# sort the bam file using Samtools
echo -e "LOG: sorting the bam file" >> ${repRID}.${ref}.alignSampleData.log
samtools sort -@ `nproc` -O BAM -o ${ref}.sampled.sorted.bam ${ref}.sampled.bam
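# cap each sort thread at 85% of an even share of total RAM so samtools no
# longer over-allocates and fails with 'couldn't allocate memory for bam_mem'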
proc=\$(expr `nproc` - 1)
mem=\$(vmstat -s -S K | grep 'total memory' | grep -o '[0-9]*')
mem=\$(expr \${mem} / \${proc} \\* 85 / 100)
samtools sort -@ \${proc} -m \${mem}K -O BAM -o ${ref}.sampled.sorted.bam ${ref}.sampled.bam
# index the sorted bam using Samtools
echo -e "LOG: indexing sorted bam file" >> ${repRID}.${ref}.alignSampleData.log
......@@ -785,6 +900,7 @@ process inferMetadata {
path alignSummary from alignSampleQC_inferMetadata.collect()
val fastqCountError_inferMetadata
val fastqReadError_inferMetadata
val fastqFileError_inferMetadata
output:
path "infer.csv" into inferMetadata_fl
......@@ -794,6 +910,7 @@ process inferMetadata {
when:
fastqCountError_inferMetadata == "false"
fastqReadError_inferMetadata == "false"
fastqFileError_inferMetadata == "false"
script:
"""
......@@ -1011,6 +1128,7 @@ process checkMetadata {
val speciesInfer from speciesInfer_checkMetadata
val fastqCountError_checkMetadata
val fastqReadError_checkMetadata
val fastqFileError_checkMetadata
val speciesError_checkMetadata
output:
......@@ -1020,6 +1138,7 @@ process checkMetadata {
when:
fastqCountError_checkMetadata == "false"
fastqReadError_checkMetadata == "false"
fastqFileError_checkMetadata == "false"
speciesError_checkMetadata == "false"
script:
......@@ -1185,6 +1304,7 @@ inputBagRID.into {
inputBagRID_uploadExecutionRun
inputBagRID_finalizeExecutionRun
inputBagRID_failPreExecutionRun_fastq
inputBagRID_failPreExecutionRun_fastqFile
inputBagRID_failPreExecutionRun_species
inputBagRID_failExecutionRun
}
......@@ -1203,6 +1323,7 @@ process uploadExecutionRun {
val inputBagRID from inputBagRID_uploadExecutionRun
val fastqCountError_uploadExecutionRun
val fastqReadError_uploadExecutionRun
val fastqFileError_uploadExecutionRun
val speciesError_uploadExecutionRun
output:
......@@ -1212,6 +1333,7 @@ process uploadExecutionRun {
upload
fastqCountError_uploadExecutionRun == "false"
fastqReadError_uploadExecutionRun == "false"
fastqFileError_uploadExecutionRun == "false"
speciesError_uploadExecutionRun == "false"
script:
......@@ -1298,6 +1420,7 @@ process getRef {
val species from speciesInfer_getRef
val fastqCountError_getRef
val fastqReadError_getRef
val fastqFileError_getRef
val speciesError_getRef
val pipelineError_getRef
......@@ -1307,6 +1430,7 @@ process getRef {
when:
fastqCountError_getRef == "false"
fastqReadError_getRef == "false"
fastqFileError_getRef == "false"
speciesError_getRef == "false"
pipelineError_getRef == "false"
......@@ -1398,6 +1522,7 @@ process alignData {
val stranded from strandedInfer_alignData
val fastqCountError_alignData
val fastqReadError_alignData
val fastqFileError_alignData
val speciesError_alignData
val pipelineError_alignData
......@@ -1408,6 +1533,7 @@ process alignData {
when:
fastqCountError_alignData == "false"
fastqReadError_alignData == "false"
fastqFileError_alignData == "false"
speciesError_alignData == "false"
pipelineError_alignData == "false"
......@@ -1451,7 +1577,10 @@ process alignData {
# sort the bam file using Samtools
echo -e "LOG: sorting the bam file" >> ${repRID}.align.log
samtools sort -@ `nproc` -O BAM -o ${repRID}.sorted.bam ${repRID}.bam
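# same per-thread cap as alignSampleData, here scaled to 75% of the even share (#108)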
proc=\$(expr `nproc` - 1)
mem=\$(vmstat -s -S K | grep 'total memory' | grep -o '[0-9]*')
mem=\$(expr \${mem} / \${proc} \\* 75 / 100)
samtools sort -@ \${proc} -m \${mem}K -O BAM -o ${repRID}.sorted.bam ${repRID}.bam
# index the sorted bam using Samtools
echo -e "LOG: indexing sorted bam file" >> ${repRID}.align.log
......@@ -1475,6 +1604,7 @@ process dedupData {
tuple path (bam), path (bai) from rawBam_dedupData
val fastqCountError_dedupData
val fastqReadError_dedupData
val fastqFileError_dedupData
val speciesError_dedupData
val pipelineError_dedupData
......@@ -1486,6 +1616,7 @@ process dedupData {
when:
fastqCountError_dedupData == 'false'
fastqReadError_dedupData == 'false'
fastqFileError_dedupData == 'false'
speciesError_dedupData == 'false'
pipelineError_dedupData == 'false'
......@@ -1534,6 +1665,7 @@ process makeBigWig {
tuple path (bam), path (bai) from dedupBam_makeBigWig
val fastqCountError_makeBigWig
val fastqReadError_makeBigWig
val fastqFileError_makeBigWig
val speciesError_makeBigWig
val pipelineError_makeBigWig
......@@ -1543,6 +1675,7 @@ process makeBigWig {
when:
fastqCountError_makeBigWig == 'false'
fastqReadError_makeBigWig == 'false'
fastqFileError_makeBigWig == 'false'
speciesError_makeBigWig == 'false'
pipelineError_makeBigWig == 'false'
......@@ -1574,6 +1707,7 @@ process countData {
val stranded from strandedInfer_countData
val fastqCountError_countData
val fastqReadError_countData
val fastqFileError_countData
val speciesError_countData
val pipelineError_countData
......@@ -1585,6 +1719,7 @@ process countData {
when:
fastqCountError_countData == 'false'
fastqReadError_countData == 'false'
fastqFileError_countData == 'false'
speciesError_countData == 'false'
pipelineError_countData == 'false'
......@@ -1645,55 +1780,6 @@ assignedReadsInfer.into {
assignedReadsInfer_uploadQC
}
/*
*fastqc: run fastqc on untrimmed fastq's
*/
process fastqc {
tag "${repRID}"
input:
path (fastq) from fastqs_fastqc
val fastqCountError_fastqc
val fastqReadError_fastqc
val speciesError_fastqc
val pipelineError_fastqc
output:
path ("*_fastqc.zip") into fastqc
path ("rawReads.csv") into rawReadsInfer_fl
when:
fastqCountError_fastqc == 'false'
fastqReadError_fastqc == 'false'
speciesError_fastqc == 'false'
pipelineError_fastqc == 'false'
script:
"""
hostname > ${repRID}.fastqc.log
ulimit -a >> ${repRID}.fastqc.log
# run fastqc
echo -e "LOG: running fastq on raw fastqs" >> ${repRID}.fastqc.log
fastqc *.fastq.gz -o .
# count raw reads
zcat *.R1.fastq.gz | echo \$((`wc -l`/4)) > rawReads.csv
"""
}
// Extract number of raw reads metadata into channel
rawReadsInfer = Channel.create()
rawReadsInfer_fl.splitCsv(sep: ",", header: false).separate(
rawReadsInfer
)
// Replicate inferred raw reads for multiple process inputs
rawReadsInfer.into {
rawReadsInfer_aggrQC
rawReadsInfer_uploadQC
}
/*
*dataQC: calculate transcript integrity numbers (TIN) and bin as well as calculate innerdistance of PE replicates
*/
......@@ -1708,6 +1794,7 @@ process dataQC {
val ends from endsInfer_dataQC
val fastqCountError_dataQC
val fastqReadError_dataQC
val fastqFileError_dataQC
val speciesError_dataQC
val pipelineError_dataQC
......@@ -1719,6 +1806,7 @@ process dataQC {
when:
fastqCountError_dataQC == 'false'
fastqReadError_dataQC == 'false'
fastqFileError_dataQC == 'false'
speciesError_dataQC == 'false'
pipelineError_dataQC == 'false'
......@@ -1804,6 +1892,7 @@ process aggrQC {
val expRID from expRID_aggrQC
val fastqCountError_aggrQC
val fastqReadError_aggrQC
val fastqFileError_aggrQC
val speciesError_aggrQC
val pipelineError_aggrQC
......@@ -1814,6 +1903,7 @@ process aggrQC {
when:
fastqCountError_aggrQC == 'false'
fastqReadError_aggrQC == 'false'
fastqFileError_aggrQC == 'false'
speciesError_aggrQC == 'false'
pipelineError_aggrQC == 'false'
......@@ -1906,6 +1996,7 @@ process uploadQC {
val tinMed from tinMedInfer_uploadQC
val fastqCountError_uploadQC
val fastqReadError_uploadQC
val fastqFileError_uploadQC
val speciesError_uploadQC
val pipelineError_uploadQC
......@@ -1916,6 +2007,7 @@ process uploadQC {
upload
fastqCountError_uploadQC == 'false'
fastqReadError_uploadQC == 'false'
fastqFileError_uploadQC == 'false'
speciesError_uploadQC == 'false'
pipelineError_uploadQC == 'false'
......@@ -1982,6 +2074,7 @@ process uploadProcessedFile {
val executionRunRID from executionRunRID_uploadProcessedFile
val fastqCountError_uploadProcessedFile
val fastqReadError_uploadProcessedFile
val fastqFileError_uploadProcessedFile
val speciesError_uploadProcessedFile
val pipelineError_uploadProcessedFile
......@@ -1992,6 +2085,7 @@ process uploadProcessedFile {
upload
fastqCountError_uploadProcessedFile == 'false'
fastqReadError_uploadProcessedFile == 'false'
fastqFileError_uploadProcessedFile == 'false'
speciesError_uploadProcessedFile == 'false'
pipelineError_uploadProcessedFile == 'false'
......@@ -2074,6 +2168,7 @@ process uploadOutputBag {
val executionRunRID from executionRunRID_uploadOutputBag
val fastqCountError_uploadOutputBag
val fastqReadError_uploadOutputBag
val fastqFileError_uploadOutputBag
val speciesError_uploadOutputBag
val pipelineError_uploadOutputBag
......@@ -2084,6 +2179,7 @@ process uploadOutputBag {
upload
fastqCountError_uploadOutputBag == 'false'
fastqReadError_uploadOutputBag == 'false'
fastqFileError_uploadOutputBag == 'false'
speciesError_uploadOutputBag == 'false'
pipelineError_uploadOutputBag == 'false'
......@@ -2256,6 +2352,86 @@ process failPreExecutionRun_fastq {
"""
}
/*
* failPreExecutionRun_fastqFile: fail the execution run prematurely for fastqFile errors
*/
process failPreExecutionRun_fastqFile {
tag "${repRID}"
input:
path script_uploadExecutionRun from script_uploadExecutionRun_failPreExecutionRun_fastqFile
path credential, stageAs: "credential.json" from deriva_failPreExecutionRun_fastqFile
val spike from spikeMeta_failPreExecutionRun_fastqFile
val species from speciesMeta_failPreExecutionRun_fastqFile
val inputBagRID from inputBagRID_failPreExecutionRun_fastqFile
val fastqFileError from fastqFileError_failPreExecutionRun_fastqFile
val fastqFileError_details
when:
upload
fastqFileError == 'true'
script:
"""
hostname > ${repRID}.failPreExecutionRun_fastqfile.log
ulimit -a >> ${repRID}.failPreExecutionRun_fastqfile.log
errorDetails=""
if [ ${fastqFileError} == true ]
then
errorDetails=\$(echo \${errorDetails}${fastqFileError_details}"\\n")
fi
echo LOG: searching for workflow RID - BICF mRNA ${workflow.manifest.version} >> ${repRID}.failPreExecutionRun_fastqfile.log
workflow=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Workflow/Name=BICF%20mRNA%20Replicate/Version=${workflow.manifest.version})
workflow=\$(echo \${workflow} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
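# \${workflow:7:-6} strips the leading '"RID":"' (7 chars) and trailing '","RCT' (6 chars) kept by grep, leaving the bare RID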
workflow=\${workflow:7:-6}
echo LOG: workflow RID extracted - \${workflow} >> ${repRID}.failPreExecutionRun_fastqfile.log
if [ "${species}" == "Homo sapiens" ]
then
genomeName=\$(echo GRCh${refHuVersion})
elif [ "${species}" == "Mus musculus" ]
then
genomeName=\$(echo GRCm${refMoVersion})
fi
if [ "${spike}" == "yes" ]
then
genomeName=\$(echo \${genomeName}-S)
fi
echo LOG: searching for genome name - \${genomeName} >> ${repRID}.failPreExecutionRun_fastqfile.log
genome=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Name=\${genomeName})
genome=\$(echo \${genome} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
genome=\${genome:7:-6}
echo LOG: genome RID extracted - \${genome} >> ${repRID}.failPreExecutionRun_fastqfile.log
cookie=\$(cat credential.json | grep -A 1 '\\"${source}\\": {' | grep -o '\\"cookie\\": \\".*\\"')
cookie=\${cookie:11:-1}
exist=\$(curl -s https://${source}/ermrest/catalog/2/entity/RNASeq:Execution_Run/Workflow=\${workflow}/Replicate=${repRID}/Input_Bag=${inputBagRID})
echo \${exist} >> ${repRID}.failPreExecutionRun_fastqfile.log
if [ "\${exist}" == "[]" ]
then
rid=\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u F)
echo LOG: execution run RID uploaded - \${rid} >> ${repRID}.failPreExecutionRun_fastqfile.log
else
rid=\$(echo \${exist} | grep -o '\\"RID\\":\\".*\\",\\"RCT')
rid=\${rid:7:-6}
echo \${rid} >> ${repRID}.failPreExecutionRun_fastqfile.log
executionRun_rid=\$(python3 ${script_uploadExecutionRun} -r ${repRID} -w \${workflow} -g \${genome} -i ${inputBagRID} -s Error -d "\${errorDetails}" -o ${source} -c \${cookie} -u \${rid})
echo LOG: execution run RID updated - \${executionRun_rid} >> ${repRID}.failPreExecutionRun_fastqfile.log
fi
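# record the failure timestamp and execution run RID with the external tracking endpoint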
dt=`date +%FT%T.%3N%:z`
curl -H 'Content-Type: application/json' -X PUT -d \
'{ \
"ID": "${workflow.sessionId}", \
"ExecutionRunRID": "'\${rid}'", \
"Failure": "'\${dt}'" \
}' \
"https://9ouc12dkwb.execute-api.us-east-2.amazonaws.com/prod/db/track"
"""
}
/*
* failPreExecutionRun_species: fail the execution run prematurely for species error
......
......@@ -18,7 +18,7 @@ if [ "${validate}" != "is valid" ]
then
exit 1
fi
for i in $(find */ -name "*R*.fastq.gz")
for i in $(find */ -name "*[_.]R[1-2].fastq.gz")
do
path=${2}.$(echo ${i##*/} | grep -o "R[1,2].fastq.gz")
cp ${i} ./${path}
......
......@@ -35,15 +35,11 @@ def main():
else:
rep = metaFile["Replicate_RID"].unique()[0]
print(rep)
if (len(metaFile[metaFile["File_Type"] == "FastQ"]) > 2):
print("There are more then 2 fastq's in the metadata: " +
" ".join(metaFile[metaFile["File_Type"] == "FastQ"].RID))
exit(1)
# Check experiment RID metadata from 'Experiment.csv'
if (args.parameter == "expRID"):
if (len(metaFile.Experiment_RID.unique()) > 1):
print("There are multiple experoment RID's in the metadata: " +
print("There are multiple experiment RID's in the metadata: " +
" ".join(metaFile.Experiment_RID.unique()))
exit(1)
else:
......@@ -65,14 +61,6 @@ def main():
endsMeta = metaFile.Paired_End.unique()[0]
print(endsMeta)
# Manually get endedness from the fastq count in 'File.csv'
if (args.parameter == "endsManual"):
if (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 1):
endsManual = "se"
elif (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 2):
endsManual = "pe"
print(endsManual)
# Get strandedness metadata from 'Experiment Settings.csv'
if (args.parameter == "stranded"):
if (metaFile.Has_Strand_Specific_Information.unique() == "yes"):
......@@ -80,9 +68,7 @@ def main():
elif (metaFile.Has_Strand_Specific_Information.unique() == "no"):
stranded = "unstranded"
else:
print("Stranded metadata not match expected options: " +
metaFile.Has_Strand_Specific_Information.unique())
exit(1)
stranded = metaFile.Has_Strand_Specific_Information.unique()[0]
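# pass the submitted value through unchanged instead of exiting (#105, #106)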
print(stranded)
# Get spike-in metadata from 'Experiment Settings.csv'
......@@ -92,9 +78,7 @@ def main():
elif (metaFile.Used_Spike_Ins.unique() == "no"):
spike = "no"
else:
print("Spike-ins metadata not match expected options: " +
metaFile.Used_Spike_Ins.unique())
exit(1)
spike = metaFile.Used_Spike_Ins.unique()[0]
print(spike)
# Get species metadata from 'Experiment.csv'
......@@ -104,9 +88,7 @@ def main():
elif (metaFile.Species.unique() == "Homo sapiens"):
species = "Homo sapiens"
else:
print("Species metadata not match expected options: " +
metaFile.Species.unique())
exit(1)
species = metaFile.Species.unique()[0]
print(species)
# Get read length metadata from 'Experiment Settings.csv'
......