Commit 474a1d3e authored by Venkat Malladi

Merge branch '108-samtools.mem' into 'develop'

Resolve "samtools sort: couldn't allocate memory for bam_mem"

Closes #109, #108, #107, #106, and #105

See merge request !64
parents adf2b8be e1d95064
Part of 2 merge requests: !65 (Develop) and !64 (Resolve "samtools sort: couldn't allocate memory for bam_mem")
Pipeline #9072 passed with stages in 4 minutes and 31 seconds
@@ -109,7 +109,7 @@ parseMetadata:
   - study=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p studyRID)
   - endsRaw=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsMeta)
   - endsMeta="uk"
-  - endsManual=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual)
+  - endsManual="se"
   - stranded=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded)
   - spike=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p spike)
   - species=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 ./workflow/scripts/parse_meta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p species)
@@ -750,6 +750,36 @@ failMismatchR1R2:
     when:
       - always
 
+failUnexpectedMeta:
+  stage: integration
+  only: [merge_requests]
+  except:
+    variables:
+      - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/
+  script:
+    - hostname
+    - ulimit -a
+    - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID 14-3R4R --source staging --upload true -with-dag dag.png --dev false --ci true
+  retry:
+    max: 0
+    when:
+      - always
+
+failFileStructure:
+  stage: integration
+  only: [merge_requests]
+  except:
+    variables:
+      - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/
+  script:
+    - hostname
+    - ulimit -a
+    - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5HT --source staging --upload true -with-dag dag.png --dev false --ci true
+  retry:
+    max: 0
+    when:
+      - always
+
 override_inputBag:
   stage: integration
   only: [merge_requests]
# v1.0.3 (in development)
**User Facing**
**Background**
* Add memory limit (75%) per thread for samtools sort (#108) (see the sketch below)
* Remove parsing restrictions for submitted stranded/spike/species (#105, #106)
* Pass unidentified ends through instead of overwriting them as unknown
* Move fastqc process before trim to catch fastq errors (#107)
* Only use fastqs that match the *[_.]R[1-2].fastq.gz naming convention (#107)
* Add error output when no fastqs are found
* Update input bag export config to only fetch fastqs that match the *[_.]R[1-2].fastq.gz naming convention
* Remove check for multiple fastqs in parse metadata (redundant and no longer valid)
* Handle blank submitted endness better
* Don't use File.csv from the inputBag to parse manual endness; use the count from getData instead
* Detect malformed fastqs (#107)
* Restrict sampled alignment process to use >32GB nodes on BioHPC (#108)
* Use nproc**-1** for alignment processes (#108)
*Known Bugs*
* Override params (inputBag, fastq, species) aren't checked for integrity
* Authentication files and tokens must remain active (active auth client) for the duration of the pipeline run (until long-lived token utilization is included)
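
For the samtools sort change (#108), here is a minimal sketch of how a 75% per-thread memory cap can be derived on a Linux node; the variable names and file names are illustrative, not the pipeline's actual code:

    # Sketch only: cap samtools sort at ~75% of available RAM, split per thread.
    # "input.bam"/"sorted.bam" are placeholder file names.
    threads=$(( $(nproc) - 1 ))                                   # nproc-1, per #108
    mem_kb=$(grep MemAvailable /proc/meminfo | awk '{print $2}')  # available RAM in kB
    per_thread_kb=$(( mem_kb * 75 / 100 / threads ))              # 75%, split across threads
    samtools sort -@ "${threads}" -m "${per_thread_kb}K" -o sorted.bam input.bam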
<hr>
# v1.0.2
**User Facing**
docs/dag.png: binary image replaced (4.57 MiB → 5.27 MiB)
@@ -89,7 +89,7 @@
       "processor": "fetch",
       "processor_params": {
         "output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}",
-        "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/File_Type=FastQ/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none"
+        "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/File_Type=FastQ/File_Name::ciregexp::%5B_.%5DR%5B12%5D%5C.fastq%5C.gz/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none"
       }
     }
   ]
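
For reference, the File_Name::ciregexp:: predicate added above carries a URL-encoded regex; decoding it (a one-off check, not pipeline code) shows it is the same pattern used elsewhere in this merge request:

    # Decode the URL-encoded filter from the new query_path.
    python3 -c "from urllib.parse import unquote; print(unquote('%5B_.%5DR%5B12%5D%5C.fastq%5C.gz'))"
    # -> [_.]R[12]\.fastq\.gz (matched case-insensitively against File_Name)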
@@ -116,6 +116,10 @@ process {
     cpus = 1
     memory = '1 GB'
   }
+  withName:failPreExecutionRun_fastqFile {
+    cpus = 1
+    memory = '1 GB'
+  }
   withName:failPreExecutionRun_species {
     cpus = 1
@@ -32,7 +32,7 @@ process {
     executor = 'local'
   }
   withName:alignSampleData {
-    queue = 'super'
+    queue = '128GB,256GB,256GBv1,384GB'
   }
   withName:inferMetadata {
     queue = 'super'
@@ -85,6 +85,9 @@ process {
   withName:failPreExecutionRun_fastq {
     executor = 'local'
   }
+  withName:failPreExecutionRun_fastqFile {
+    executor = 'local'
+  }
   withName:failPreExecutionRun_species {
     executor = 'local'
   }
@@ -91,6 +91,9 @@ process {
   withName:failPreExecutionRun_fastq {
     container = 'gudmaprbk/deriva1.4:1.0.0'
   }
+  withName:failPreExecutionRun_fastqFile {
+    container = 'gudmaprbk/deriva1.4:1.0.0'
+  }
   withName:failPreExecutionRun_species {
     container = 'gudmaprbk/deriva1.4:1.0.0'
   }
@@ -125,6 +128,6 @@ manifest {
   homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
   description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.'
   mainScript = 'rna-seq.nf'
-  version = 'v1.0.2'
+  version = 'v1.0.3'
   nextflowVersion = '>=19.09.0'
 }
@@ -18,7 +18,7 @@ if [ "${validate}" != "is valid" ]
 then
   exit 1
 fi
-for i in $(find */ -name "*R*.fastq.gz")
+for i in $(find */ -name "*[_.]R[1-2].fastq.gz")
 do
   path=${2}.$(echo ${i##*/} | grep -o "R[1,2].fastq.gz")
   cp ${i} ./${path}
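
To see which file names the tightened glob accepts, here is a quick shell check (the file names below are made up for illustration):

    # Only names with _R1/_R2 or .R1/.R2 immediately before .fastq.gz match.
    touch sample_R1.fastq.gz sample.R2.fastq.gz sampleR1.fastq.gz extra_R3.fastq.gz
    find . -name "*[_.]R[1-2].fastq.gz"
    # -> ./sample_R1.fastq.gz and ./sample.R2.fastq.gz
    #    (sampleR1.fastq.gz lacks the separator; extra_R3.fastq.gz is not R1/R2)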
@@ -35,15 +35,11 @@ def main():
         else:
             rep = metaFile["Replicate_RID"].unique()[0]
             print(rep)
-        if (len(metaFile[metaFile["File_Type"] == "FastQ"]) > 2):
-            print("There are more then 2 fastq's in the metadata: " +
-                  " ".join(metaFile[metaFile["File_Type"] == "FastQ"].RID))
-            exit(1)
     # Check experiment RID metadata from 'Experiment.csv'
     if (args.parameter == "expRID"):
         if (len(metaFile.Experiment_RID.unique()) > 1):
-            print("There are multiple experoment RID's in the metadata: " +
+            print("There are multiple experiment RID's in the metadata: " +
                   " ".join(metaFile.Experiment_RID.unique()))
             exit(1)
         else:
@@ -65,14 +61,6 @@ def main():
         endsMeta = metaFile.Paired_End.unique()[0]
         print(endsMeta)
-    # Manually get endness count from 'File.csv'
-    if (args.parameter == "endsManual"):
-        if (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 1):
-            endsManual = "se"
-        elif (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 2):
-            endsManual = "pe"
-        print(endsManual)
     # Get strandedness metadata from 'Experiment Settings.csv'
     if (args.parameter == "stranded"):
         if (metaFile.Has_Strand_Specific_Information.unique() == "yes"):
@@ -80,9 +68,7 @@ def main():
         elif (metaFile.Has_Strand_Specific_Information.unique() == "no"):
             stranded = "unstranded"
         else:
-            print("Stranded metadata not match expected options: " +
-                  metaFile.Has_Strand_Specific_Information.unique())
-            exit(1)
+            stranded = metaFile.Has_Strand_Specific_Information.unique()[0]
         print(stranded)
     # Get spike-in metadata from 'Experiment Settings.csv'
@@ -92,9 +78,7 @@ def main():
         elif (metaFile.Used_Spike_Ins.unique() == "no"):
             spike = "no"
         else:
-            print("Spike-ins metadata not match expected options: " +
-                  metaFile.Used_Spike_Ins.unique())
-            exit(1)
+            spike = metaFile.Used_Spike_Ins.unique()[0]
         print(spike)
     # Get species metadata from 'Experiment.csv'
@@ -104,9 +88,7 @@ def main():
         elif (metaFile.Species.unique() == "Homo sapiens"):
             species = "Homo sapiens"
         else:
-            print("Species metadata not match expected options: " +
-                  metaFile.Species.unique())
-            exit(1)
+            species = metaFile.Species.unique()[0]
         print(species)
     # Get read length metadata from 'Experiment Settings.csv'
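
A quick illustration of the relaxed parsing, using the same invocation as the CI tests above (requires singularity and the repo's test metadata; the printed value depends on what was submitted):

    # With this change, a value outside yes/no is passed through verbatim rather
    # than aborting here; validation is deferred to downstream processes.
    stranded=$(singularity run 'docker://gudmaprbk/python3:1.0.0' python3 \
      ./workflow/scripts/parse_meta.py -r Replicate_RID \
      -m "./test_data/meta/metaTest.csv" -p stranded)
    echo "${stranded}"  # "stranded", "unstranded", or the submitted value as-is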