Skip to content
Snippets Groups Projects
Commit 7c677cf1 authored by Gervaise Henry's avatar Gervaise Henry :cowboy:
Browse files

Merge develop into align branch

parents fa22c087 1c3a166b
Branches
Tags
2 merge requests!37v0.0.1,!15Resolve "process_align"
Pipeline #5759 failed with stages
in 28 minutes and 42 seconds
......@@ -26,6 +26,15 @@ getData:
- singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' sh ./workflow/scripts/bdbagFetch.sh Replicate_16-1ZX4 16-1ZX4
- pytest -m getData
parseMetadata:
stage: unit
script:
- singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID
- singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p ends
- singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p endsManual
- singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p stranded
- singularity run 'docker://bicf/python3:1.3' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p specie
trimData:
stage: unit
script:
......
......@@ -13,14 +13,7 @@ process {
cpus = 1
memory = '1 GB'
withName:getBag {
container = 'bicf/gudmaprbkfilexfer:1.3'
}
withName:getData {
container = 'bicf/gudmaprbkfilexfer:1.3'
}
withName:trimData {
container = 'bicf/trimgalore:1.1'
cpus = 15
}
}
\ No newline at end of file
......@@ -13,14 +13,7 @@ process {
cpus = 1
memory = '1 GB'
withName:getBag {
container = 'bicf/gudmaprbkfilexfer:1.3'
}
withName:getData {
container = 'bicf/gudmaprbkfilexfer:1.3'
}
withName:trimData {
container = 'bicf/trimgalore:1.1'
cpus = 15
}
}
......@@ -9,6 +9,9 @@ process {
withName:getData {
executor = 'local'
}
withName:parseMetadata {
executor = 'local'
}
withName:trimData {
queue = '256GB,256GBv1,384GB'
}
......
......@@ -17,6 +17,9 @@ process {
withName:getData {
container = 'bicf/gudmaprbkfilexfer:1.3'
}
withName:parseMetadata {
container = 'bicf/python3:1.3'
}
withName:trimData {
container = 'bicf/trimgalore:1.1'
}
......
#!/usr/bin/env nextflow
// Define input variables
params.deriva = "${baseDir}/../test_data/credential.json"
params.bdbag = "${baseDir}/../test_data/cookies.txt"
params.deriva = "${baseDir}/../test_data/auth/credential.json"
params.bdbag = "${baseDir}/../test_data/auth/cookies.txt"
//params.repRID = "16-1ZX4"
params.repRID = "Q-Y5JA"
params.spikein = "false"
params.reference = "mouse"
params.outDir = "${baseDir}/../output"
......@@ -17,60 +15,23 @@ deriva = Channel
bdbag = Channel
.fromPath(params.bdbag)
.ifEmpty { exit 1, "deriva cookie file for bdbag not found: ${params.bdbag}" }
repRID = params.repRID
outDir = params.outDir
logsDir = "${outDir}/Logs"
reference = params.reference
if (params.spikein) {
if (params.reference == "human") {
reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12-S/hisat2")
} else if (params.reference == "mouse") {
reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6-S/hisat2")
}
} else if (params.reference == "mouse") {
reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2")
} else if (params.reference == "human") {
reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12/hisat2")
} else {
print ("Warning: Reference genome not specified, defaulting to GRCm38.P6 with NO spike-in")
reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2")
}
// Define fixed files
derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
/*
* Pass in programs from the scripts directory, and any default files for later
*/
/*
*Checking the species and spike-in status
*/
if (params.spikein) {
if (params.species == "human") {
reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12-S/hisat2")
} else if (params.species == "mouse") {
reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6-S/hisat2")
}
} else if (params.species == "mouse") {
reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2")
} else if (params.species == "human") {
reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12/hisat2")
} else {
print ("Warning: Reference genome not specified, defaulting to GRCm38.P6 with NO spike-in")
reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2")
}
// Define script files
script_bdbagFetch = Channel.fromPath("${baseDir}/scripts/bdbagFetch.sh")
script_parseMeta = Channel.fromPath("${baseDir}/scripts/parseMeta.py")
/*
* splitData: split bdbag files by replicate so fetch can occure in parallel, and rename files to replicate rid
*/
process getBag {
executor 'local'
tag "${repRID}"
publishDir "${logsDir}/getBag", mode: 'copy', pattern: "${repRID}.getBag.err"
publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.getBag.err"
input:
path credential, stageAs: 'credential.json' from deriva
......@@ -85,8 +46,12 @@ process getBag {
hostname >>${repRID}.getBag.err
ulimit -a >>${repRID}.getBag.err
export https_proxy=\${http_proxy}
# link credential file for authentication
ln -sf `readlink -e credential.json` ~/.deriva/credential.json 2>>${repRID}.getBag.err
echo "LOG: deriva credentials linked" >>${repRID}.getBag.err
# deriva-download replicate RID
deriva-download-cli dev.gudmap.org --catalog 2 ${derivaConfig} . rid=${repRID} 2>>${repRID}.getBag.err
"""
}
......@@ -96,10 +61,10 @@ process getBag {
*/
process getData {
tag "${repRID}"
publishDir "${logsDir}/getData", mode: 'copy', pattern: "${repRID}.getData.err"
publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.getData.err"
input:
executor 'local'
path script_bdbagFetch
path cookies, stageAs: 'deriva-cookies.txt' from bdbag
path bagit
......@@ -110,49 +75,149 @@ process getData {
file("**/Experiment.csv") into experimentMeta
file ("${repRID}.getData.err")
script:
"""
hostname >>${repRID}.getData.err
ulimit -a >>${repRID}.getData.err
export https_proxy=\${http_proxy}
# link deriva cookie for authentication
ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt >>${repRID}.getData.err
echo "LOG: deriva cookie linked" >>${repRID}.getData.err
# get bagit basename
replicate=\$(basename "${bagit}" | cut -d '.' -f1)
echo "LOG: \${replicate}" >>${repRID}.getData.err
# unzip bagit
unzip ${bagit} 2>>${repRID}.getData.err
echo "LOG: replicate bdbag unzipped" >>${repRID}.getData.err
sh ${baseDir}/scripts/bdbagFetch.sh \${replicate} ${repRID} 2>>${repRID}.getData.err
# bagit fetch fastq's only and rename by repRID
sh ${script_bdbagFetch} \${replicate} ${repRID} 2>>${repRID}.getData.err
echo "LOG: replicate bdbag fetched" >>${repRID}.getData.err
"""
}
/*
* parseMetadata: parses metadata to extract experiment parameters
*/
process parseMetadata {
tag "${repRID}"
publishDir "${logsDir}", mode: 'copy', pattern: "${repRID}.parseMetadata.err"
input:
path script_parseMeta
val repRID
path fileMeta
path experimentSettingsMeta
path experimentMeta
output:
path 'design.csv' into metadata
script:
"""
hostname >>${repRID}.parseMetadata.err
ulimit -a >>${repRID}.parseMetadata.err
# Check replicate RID metadata
rep=\$(python3 ${script_parseMeta} -r ${repRID} -m "${fileMeta}" -p repRID)
echo "LOG: replicate RID metadata parsed: \${rep}" >>${repRID}.parseMetadata.err
# Get endedness metadata
endsMeta=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p endsMeta)
echo "LOG: endedness metadata parsed: \${endsMeta}" >>${repRID}.parseMetadata.err
# Manually get endness
endsManual=\$(python3 ${script_parseMeta} -r ${repRID} -m "${fileMeta}" -p endsManual)
echo "LOG: endedness manually detected: \${endsManual}" >>${repRID}.parseMetadata.err
# Get strandedness metadata
stranded=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p stranded)
echo "LOG: strandedness metadata parsed: \${stranded}" >>${repRID}.parseMetadata.err
# Get spike-in metadata
spike=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentSettingsMeta}" -p spike)
echo "LOG: spike-in metadata parsed: \${spike}" >>${repRID}.parseMetadata.err
# Get species metadata
species=\$(python3 ${script_parseMeta} -r ${repRID} -m "${experimentMeta}" -p species)
echo "LOG: species metadata parsed: \${species}" >>${repRID}.parseMetadata.err
# Save design file
echo "\${rep},\${endsMeta},\${endsManual},\${stranded},\${spike},\${species}" > design.csv
"""
}
// Split metadata into separate channels
rep = Channel.create()
endsMeta = Channel.create()
endsManual = Channel.create()
stranded = Channel.create()
spike = Channel.create()
species = Channel.create()
metadata.splitCsv(sep: ',', header: false).separate(
rep,
endsMeta,
endsManual,
stranded,
spike,
species
)
// Exit with no species
if (species == "") {
print ("ERROR: Reference genome not specified")
exit 1
}
// Setting references
if (spike) {
if (species == "Homo sapiens") {
reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12-S/hisat2")
} else if (species == "Mus musculus") {
reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6-S/hisat2")
}
} else {
if (species == "Homo sapiens") {
reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCh38.p12/hisat2")
} else if (species == "Mus musculus") {
reference = file ("/project/BICF/BICF_Core/s181706/github/gudmap/rna-seq/References/GRCm38.P6/hisat2")
}
}
/*
* trimData: trims any adapter or non-host sequences from the data
*/
process trimData {
tag "${repRID}"
publishDir "${logsDir}/trimData", mode: 'copy', pattern: "\${repRID}.trimData.*"
publishDir "${logsDir}", mode: 'copy', pattern: "\${repRID}.trimData.*"
input:
file(fastq) from fastqs
output:
tuple file ("params.csv"), file ("*.fq.gz") into fastqs_trimmed
path ("*.fq.gz") into fastqs_trimmed
file ("${repRID}.trimData.log")
file ("${repRID}.trimData.err")
script:
"""
if [ '${fastq[1]}' == 'null' ]
hostname >>${repRID}.trimData.err
ulimit -a >>${repRID}.trimData.err
# trim fastqs
if [ '${endsManual}' == 'se' ]
then
ends='se'
trim_galore --gzip -q 25 --illumina --length 35 --basename ${repRID} -j `nproc` ${fastq[0]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err;
else
ends='pe'
trim_galore --gzip -q 25 --illumina --length 35 --paired --basename ${repRID} -j `nproc` ${fastq[0]} ${fastq[1]} 1>>${repRID}.trimData.log 2>>${repRID}.trimData.err;
fi
echo \${ends} > params.csv;
"""
}
......@@ -161,23 +226,21 @@ process trimData {
*/
process alignReads {
tag "align-${repRID}"
publishDir "${outDir}/tempOut/aligned", mode: "copy"
publishDir "${outDir}/aligned", mode: "copy"
input:
set ends, fqs from fastqs_trimmed
file reference
path fqs from fastqs_trimmed
output:
set repRID, file ("${repRID}.unal.gz"), file ("${repRID}.bam"), file ("${repRID}.bai")
script:
"""
ends=`cat ${ends}`;
if [ "\${ends}" == 'pe' ]; then
if [ "${endsManual}" == 'pe' ]; then
hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${reference}/genome -1 ${fqs[0]} -2 ${fqs[1]} 1>${repRID}.align.out 2> ${repRID}.align.err;
else hisat2 -p `nproc` --add-chrname --un-gz ${repRID}.unal.gz -S ${repRID}.sam -x ${reference}/genome -U ${fqs[0]} 1>${repRID}.align.out 2> ${repRID}.align.err;
fi;
samtools view -1 --threads `nproc` -o ${repRID}.bam ${repRID}.sam 1>${repRID}.align.out 2> ${repRID}.align.err;
samtools sort -@ `nproc` -O BAM ${repRID}.bam 1>${repRID}.align.out 2> ${repRID}.align.err;
"""
}
}
\ No newline at end of file
......@@ -6,20 +6,20 @@ import re
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--fetchFile',help="The fetch file from bdgap.zip.",required=True)
parser.add_argument('-f', '--files',help="The fetch file from bdgap.zip.",required=True)
args = parser.parse_args()
return args
def main():
args = get_args()
fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None)
fileFile = pd.read_csv(args.fetchFile+"/data/File.csv",sep=",",header=0)
fetchFile = pd.read_csv(args.files+"/fetch.txt",sep="\t",header=None)
fileFile = pd.read_csv(args.files+"/data/File.csv",sep=",",header=0)
fileFile_filtered = fileFile[fileFile["File_Type"]=="FastQ"]
fetchFile_filtered = fetchFile[fetchFile[2].str[-9:]==".fastq.gz"]
fetchFile_filtered_renamed = fetchFile_filtered
for i in fileFile_filtered["File_Name"]:
fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i,regex=False)] = fetchFile_filtered_renamed[2][fetchFile_filtered_renamed[2].str.contains(i,regex=False)].values[0].replace(re.sub("\.R.\.fastq\.gz","",i),fileFile_filtered["Replicate_RID"][fileFile_filtered["File_Name"]==i].values[0])
fetchFile_filtered_renamed.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False)
fetchFile_filtered_renamed.to_csv(args.files+"/fetch.txt",sep="\t",header=False,index=False)
if __name__ == '__main__':
main()
\ No newline at end of file
#!/usr/bin/env python3
import argparse
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('-r', '--repRID',help="The replicate RID.",required=True)
parser.add_argument('-m', '--metaFile',help="The metadata file to extract.",required=True)
parser.add_argument('-p', '--parameter',help="The parameter to extract.",required=True)
args = parser.parse_args()
return args
def main():
args = get_args()
metaFile = pd.read_csv(args.metaFile,sep=",",header=0)
# Check replicate RID metadata from 'File.csv'
if (args.parameter == "repRID"):
if (len(metaFile.Replicate_RID.unique()) > 1):
print("There are multiple replicate RID's in the metadata: " + " ".join(metaFile.Replicate_RID.unique()))
exit(1)
if not (metaFile.Replicate_RID.unique() == args.repRID):
print("Replicate RID in metadata does not match run parameters: " + metaFile.Replicate_RID.unique() + " vs " + args.repRID)
exit(1)
else:
rep=metaFile["Replicate_RID"].unique()[0]
print(rep)
if (len(metaFile[metaFile["File_Type"] == "FastQ"]) > 2):
print("There are more then 2 fastq's in the metadata: " + " ".join(metaFile[metaFile["File_Type"] == "FastQ"].RID))
exit(1)
# Get endedness metadata from 'Experiment Settings.csv'
if (args.parameter == "endsMeta"):
if (metaFile.Paired_End.unique() == "Single End"):
endsMeta = "se"
elif (metaFile.Paired_End.unique() == "Paired End"):
endsMeta = "pe"
else:
endsMeta = "uk"
print(endsMeta)
# Manually get endness count from 'File.csv'
if (args.parameter == "endsManual"):
if (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 1):
endsManual = "se"
elif (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 2):
endsManual = "pe"
print(endsManual)
# Get strandedness metadata from 'Experiment Settings.csv'
if (args.parameter == "stranded"):
if (metaFile.Has_Strand_Specific_Information.unique() == "yes"):
stranded = "stranded"
elif (metaFile.Has_Strand_Specific_Information.unique() == "no"):
stranded = "unstranded"
else:
print("Stranded metadata not match expected options: " + metaFile.Has_Strand_Specific_Information.unique())
exit(1)
print(stranded)
# Get spike-in metadata from 'Experiment Settings.csv'
if (args.parameter == "spike"):
if (metaFile.Used_Spike_Ins.unique() == "yes"):
spike = "yes"
elif (metaFile.Used_Spike_Ins.unique() == "no"):
spike = "no"
else:
print("Spike-ins metadata not match expected options: " + metaFile.Used_Spike_Ins.unique())
exit(1)
print(spike)
# Get species metadata from 'Experiment.csv'
if (args.parameter == "species"):
if (metaFile.Species.unique() == "Mus musculus"):
species = "Mus musculus"
elif (metaFile.Species.unique() == "Homo sapiens"):
species = "Homo sapiens"
else:
print("Species metadata not match expected options: " + metaFile.Species.unique())
exit(1)
print(species)
if __name__ == '__main__':
main()
\ No newline at end of file
......@@ -6,14 +6,14 @@ import os
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--fetchFile',help="The fetch file from bdgap.zip.",required=True)
parser.add_argument('-f', '--files',help="The fetch file from bdgap.zip.",required=True)
args = parser.parse_args()
return args
def main():
args = get_args()
fetchFile = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None)
fileFile = pd.read_csv(args.fetchFile+"/data/File.csv",sep=",",header=0)
fetchFile = pd.read_csv(args.files+"/fetch.txt",sep="\t",header=None)
fileFile = pd.read_csv(args.files+"/data/File.csv",sep=",",header=0)
replicateRID = fileFile.Replicate_RID.unique()
fetchArray = {i:fileFile.URI[(fileFile.Replicate_RID == i) & (fileFile.File_Type == "FastQ")] for i in replicateRID}
for i in replicateRID:
......
......@@ -9,7 +9,10 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../../'
@pytest.mark.trimData
def test_trimData():
def test_trimData_se():
assert os.path.exists(os.path.join(test_output_path, '16-1ZX4_trimmed.fq.gz'))
@pytest.mark.trimData
def test_trimData_pe():
assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R1_val_1.fq.gz'))
assert os.path.exists(os.path.join(test_output_path, 'Q-Y5JA_R2_val_2.fq.gz'))
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment