Commit 474a1d3e authored by Venkat Malladi

Merge branch '108-samtools.mem' into 'develop'

Resolve "samtools sort: couldn't allocate memory for bam_mem"

Closes #109, #108, #107, #106, and #105

See merge request !64
parents adf2b8be e1d95064
Part of 2 merge requests: !65 (Develop) and !64 (Resolve "samtools sort: couldn't allocate memory for bam_mem")
Pipeline #9072 passed with stages in 4 minutes and 31 seconds
@@ -109,7 +109,7 @@ parseMetadata:
   - study=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p studyRID)
   - endsRaw=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsMeta)
   - endsMeta="uk"
-  - endsManual=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual)
+  - endsManual="se"
   - stranded=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded)
   - spike=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p spike)
   - species=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species)
@@ -750,6 +750,36 @@ failMismatchR1R2:
     when:
       - always
 
+failUnexpectedMeta:
+  stage: integration
+  only: [merge_requests]
+  except:
+    variables:
+      - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/
+  script:
+    - hostname
+    - ulimit -a
+    - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 14-3R4R --source staging --upload true -with-dag dag.png --dev false --ci true
+  retry:
+    max: 0
+    when:
+      - always
+
+failFileStructure:
+  stage: integration
+  only: [merge_requests]
+  except:
+    variables:
+      - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/
+  script:
+    - hostname
+    - ulimit -a
+    - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5HT --source staging --upload true -with-dag dag.png --dev false --ci true
+  retry:
+    max: 0
+    when:
+      - always
+
 override_inputBag:
   stage: integration
   only: [merge_requests]
# v1.0.3 (in development)
**User Facing**
**Background**
* Add memory limit (75%) per thread for samtools sort (#108) (see the sketch below)
* Remove parsing restrictions for submitted stranded/spike/species (#105, #106)
* Pass unidentified ends through instead of overwriting them as unknown
* Move fastqc process before trim to catch fastq errors (#107)
* Only use fastqs that match the *[_.]R[1-2].fastq.gz naming convention (#107)
* Add error output when no fastqs are found
* Update input bag export config to only fetch fastqs that match the *[_.]R[1-2].fastq.gz naming convention
* Remove check for multiple fastqs in parse metadata (redundant and no longer valid)
* Handle blank submitted endness better
* Don't use File.csv from the inputBag to parse manual endness; use the count from getData instead
* Detect malformed fastqs (#107)
* Restrict sampled alignment process to use >32GB nodes on BioHPC (#108)
* Use nproc**-1** for alignment processes (#108)
*Known Bugs*
* Override params (inputBag, fastq, species) aren't checked for integrity
* Authentication files and tokens must remain active (active auth client) for the duration of the pipeline run (until long-lived token utilization is included)
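
For the samtools sort change (#108), here is a minimal sketch of how a 75% per-thread memory cap can be derived on a Linux node; the variable names and file names are illustrative, not the pipeline's actual code:

    # Sketch only: cap samtools sort at ~75% of available RAM, split per thread.
    # "input.bam"/"sorted.bam" are placeholder file names.
    threads=$(( $(nproc) - 1 ))                                   # nproc-1, per #108
    mem_kb=$(grep MemAvailable /proc/meminfo | awk '{print $2}')  # available RAM in kB
    per_thread_kb=$(( mem_kb * 75 / 100 / threads ))              # 75%, split across threads
    samtools sort -@ "${threads}" -m "${per_thread_kb}K" -o sorted.bam input.bam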
<hr>
# v1.0.2
**User Facing**
docs/dag.png: binary image replaced (4.57 MiB → 5.27 MiB)
@@ -89,7 +89,7 @@
       "processor": "fetch",
       "processor_params": {
         "output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}",
-        "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/File_Type=FastQ/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none"
+        "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/File_Type=FastQ/File_Name::ciregexp::%5B_.%5DR%5B12%5D%5C.fastq%5C.gz/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none"
       }
     }
   ]
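
For reference, the File_Name::ciregexp:: predicate added above carries a URL-encoded regex; decoding it (a one-off check, not pipeline code) shows it is the same pattern used elsewhere in this merge request:

    # Decode the URL-encoded filter from the new query_path.
    python3 -c "from urllib.parse import unquote; print(unquote('%5B_.%5DR%5B12%5D%5C.fastq%5C.gz'))"
    # -> [_.]R[12]\.fastq\.gz (matched case-insensitively against File_Name)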
@@ -116,6 +116,10 @@ process {
     cpus = 1
     memory = '1 GB'
   }
+  withName:failPreExecutionRun_fastqFile {
+    cpus = 1
+    memory = '1 GB'
+  }
   withName:failPreExecutionRun_species {
     cpus = 1
@@ -32,7 +32,7 @@ process {
     executor = 'local'
   }
   withName:alignSampleData {
-    queue = 'super'
+    queue = '128GB,256GB,256GBv1,384GB'
   }
   withName:inferMetadata {
     queue = 'super'
@@ -85,6 +85,9 @@ process {
   withName:failPreExecutionRun_fastq {
     executor = 'local'
   }
+  withName:failPreExecutionRun_fastqFile {
+    executor = 'local'
+  }
   withName:failPreExecutionRun_species {
     executor = 'local'
   }
@@ -91,6 +91,9 @@ process {
   withName:failPreExecutionRun_fastq {
     container = 'gudmaprbk/deriva1.4:1.0.0'
   }
+  withName:failPreExecutionRun_fastqFile {
+    container = 'gudmaprbk/deriva1.4:1.0.0'
+  }
   withName:failPreExecutionRun_species {
     container = 'gudmaprbk/deriva1.4:1.0.0'
   }
@@ -125,6 +128,6 @@ manifest {
   homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
   description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.'
   mainScript = 'rna-seq.nf'
-  version = 'v1.0.2'
+  version = 'v1.0.3'
   nextflowVersion = '>=19.09.0'
 }
@@ -18,7 +18,7 @@ if [ "${validate}" != "is valid" ]
 then
   exit 1
 fi
-for i in $(find */ -name "*R*.fastq.gz")
+for i in $(find */ -name "*[_.]R[1-2].fastq.gz")
 do
   path=${2}.$(echo ${i##*/} | grep -o "R[1,2].fastq.gz")
   cp ${i} ./${path}
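
To see which file names the tightened glob accepts, here is a quick shell check (the file names below are made up for illustration):

    # Only names with _R1/_R2 or .R1/.R2 immediately before .fastq.gz match.
    touch sample_R1.fastq.gz sample.R2.fastq.gz sampleR1.fastq.gz extra_R3.fastq.gz
    find . -name "*[_.]R[1-2].fastq.gz"
    # -> ./sample_R1.fastq.gz and ./sample.R2.fastq.gz
    #    (sampleR1.fastq.gz lacks the separator; extra_R3.fastq.gz is not R1/R2)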
@@ -35,15 +35,11 @@ def main():
         else:
             rep = metaFile["Replicate_RID"].unique()[0]
             print(rep)
-        if (len(metaFile[metaFile["File_Type"] == "FastQ"]) > 2):
-            print("There are more then 2 fastq's in the metadata: " +
-                  " ".join(metaFile[metaFile["File_Type"] == "FastQ"].RID))
-            exit(1)
     # Check experiment RID metadata from 'Experiment.csv'
     if (args.parameter == "expRID"):
         if (len(metaFile.Experiment_RID.unique()) > 1):
-            print("There are multiple experoment RID's in the metadata: " +
+            print("There are multiple experiment RID's in the metadata: " +
                   " ".join(metaFile.Experiment_RID.unique()))
             exit(1)
         else:
@@ -65,14 +61,6 @@ def main():
         endsMeta = metaFile.Paired_End.unique()[0]
         print(endsMeta)
-    # Manually get endness count from 'File.csv'
-    if (args.parameter == "endsManual"):
-        if (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 1):
-            endsManual = "se"
-        elif (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 2):
-            endsManual = "pe"
-        print(endsManual)
     # Get strandedness metadata from 'Experiment Settings.csv'
     if (args.parameter == "stranded"):
         if (metaFile.Has_Strand_Specific_Information.unique() == "yes"):
@@ -80,9 +68,7 @@ def main():
         elif (metaFile.Has_Strand_Specific_Information.unique() == "no"):
             stranded = "unstranded"
         else:
-            print("Stranded metadata not match expected options: " +
-                  metaFile.Has_Strand_Specific_Information.unique())
-            exit(1)
+            stranded = metaFile.Has_Strand_Specific_Information.unique()[0]
         print(stranded)
     # Get spike-in metadata from 'Experiment Settings.csv'
@@ -92,9 +78,7 @@ def main():
         elif (metaFile.Used_Spike_Ins.unique() == "no"):
             spike = "no"
         else:
-            print("Spike-ins metadata not match expected options: " +
-                  metaFile.Used_Spike_Ins.unique())
-            exit(1)
+            spike = metaFile.Used_Spike_Ins.unique()[0]
         print(spike)
     # Get species metadata from 'Experiment.csv'
@@ -104,9 +88,7 @@ def main():
         elif (metaFile.Species.unique() == "Homo sapiens"):
             species = "Homo sapiens"
         else:
-            print("Species metadata not match expected options: " +
-                  metaFile.Species.unique())
-            exit(1)
+            species = metaFile.Species.unique()[0]
         print(species)
     # Get read length metadata from 'Experiment Settings.csv'
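
A quick illustration of the relaxed parsing, using the same invocation as the CI tests above (requires singularity and the repo's test metadata; the printed value depends on what was submitted):

    # With this change, a value outside yes/no is passed through verbatim rather
    # than aborting here; validation is deferred to downstream processes.
    stranded=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 \
      ./workflow/scripts/parse_meta.py -r Replicate_RID \
      -m "./test_data/meta/metaTest.csv" -p stranded)
    echo "${stranded}"  # "stranded", "unstranded", or the submitted value as-is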