Commit 67e3eb6e authored by Gervaise Henry

Merge branch 'develop' into 'master'

v0.0.1

See merge request !37
parents 4442b00e 8a8ce097
Pipeline #7843 passed with stages in 1 hour, 19 minutes, and 39 seconds
Showing with 1613 additions and 71 deletions
process {
executor = 'slurm'
queue = '256GB,256GBv1,384GB,128GB'
clusterOptions = '--hold'
}
singularity {
enabled = true
cacheDir = '/project/BICF/BICF_Core/shared/gudmap/singularity_cache/'
}
env {
http_proxy = 'http://proxy.swmed.edu:3128'
https_proxy = 'http://proxy.swmed.edu:3128'
all_proxy = 'http://proxy.swmed.edu:3128'
}
name: bdbag
dependencies:
- pandas=0.23.3=py36_0
- pip:
- bdbag==1.5.5
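This environment spec can be materialized with conda in the usual way; a minimal sketch, assuming the YAML is saved as environment.yml (the filename is not shown in this diff):
conda env create -f environment.yml   # creates the 'bdbag' env named above
conda activate bdbag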
custom_logo: './bicf_logo.png'
custom_logo_url: 'https://utsouthwestern.edu/labs/bioinformatics/'
custom_logo_title: 'Bioinformatics Core Facility'
report_header_info:
- Contact Email: 'bicf@utsouthwestern.edu'
- Application Type: 'RNA-Seq Analytic Pipeline for GUDMAP/RBK'
- Department: 'Bioinformatics Core Facility, Department of Bioinformatics, University of Texas Southwestern Medical Center'
title: RNA-Seq Analytic Pipeline for GUDMAP/RBK
report_comment: >
This report has been generated by the <a href="https://doi.org/10.5281/zenodo.3625056">GUDMAP/RBK RNA-Seq Pipeline</a>
top_modules:
- fastqc:
name: 'Raw'
info: 'Replicate Raw fastq QC Results'
- cutadapt:
name: 'Trim'
info: 'Replicate Trim Adapter QC Results'
- hisat2:
name: 'Align'
info: 'Replicate Alignment QC Results'
path_filters:
- '*alignSummary*'
- picard:
name: 'Dedup'
info: 'Replicate Alignment Deduplication QC Results'
- rseqc:
name: 'Inner Distance'
info: 'Replicate Paired End Inner Distance Distribution Results'
path_filters:
- '*insertSize*'
- custom_content
- featureCounts:
name: 'Count'
info: 'Replicate Feature Count QC Results'
- hisat2:
name: 'Inference: Align'
info: 'Inference Alignment (1M downsampled reads) QC Results'
path_filters:
- '*alignSampleSummary*'
- rseqc:
name: 'Inference: Stranded'
info: '1M Downsampled Reads Strandedness Inference Results'
path_filters:
- '*infer_experiment*'
report_section_order:
rid:
order: 2000
meta:
order: 1000
skip_generalstats: true
custom_data:
rid:
file_format: 'tsv'
section_name: 'RID'
description: 'These are the identifying RIDs'
plot_type: 'table'
pconfig:
id: 'rid'
headers:
Replicate RID
Experiment RID
Study RID
meta:
file_format: 'tsv'
section_name: 'Metadata'
description: 'This is the comparison of inferred, submitter-provided, and calculated metadata'
plot_type: 'table'
pconfig:
id: 'meta'
format: '{:,.0f}'
headers:
Source
Species
Ends
Stranded
Spike-in
Raw Reads
Assigned Reads
Median Read Length
Median TIN
tin:
file_format: 'tsv'
section_name: 'TIN'
description: 'This is the distribution of TIN values calculated by the tool RSeQC'
plot_type: 'bargraph'
pconfig:
id: 'tin'
headers:
chrom
0 - 9
10 - 19
20 - 29
30 - 39
40 - 49
50 - 59
60 - 69
70 - 79
80 - 89
90 - 99
sp:
rid:
fn: 'rid.tsv'
meta:
fn: 'metadata.tsv'
tin:
fn: '*.tin.hist.tsv'
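A hedged usage sketch for this MultiQC configuration, assuming it is saved as multiqc_config.yaml (filename assumed; -c is MultiQC's standard flag for a custom config):
multiqc -c multiqc_config.yaml .   # aggregate the pipeline's QC outputs in the current directory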
process {
queue = 'highpriority-0ef8afb0-c7ad-11ea-b907-06c94a3c6390'
}
{
"bag": {
"bag_name": "Replicate_{rid}",
"bag_algorithms": [
"md5"
],
"bag_archiver": "zip"
},
"catalog": {
"query_processors": [
{
"processor": "csv",
"processor_params": {
"output_path": "Study",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Study_RID)=(RNASeq:Study:RID)/Study_RID:=RID,Internal_ID,Title,Summary,Overall_Design,GEO_Series_Accession_ID,GEO_Platform_Accession_ID,Funding,Pubmed_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Experiment",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment:RID)/Experiment_RID:=RID,Study_RID,Internal_ID,Name,Description,Experiment_Method,Sequencing_Type,Species,Specimen_Type,Molecule_Type,Pooled_Sample,Pool_Size,Markers,Cell_Count,Treatment_Protocol,Treatment_Protocol_Reference,Isolation_Protocol,Isolation_Protocol_Reference,Growth_Protocol,Growth_Protocol_Reference,Label_Protocol,Label_Protocol_Reference,Hybridization_Protocol,Hybridization_Protocol_Reference,Scan_Protocol,Scan_Protocol_Reference,Data_Processing,Value_Definition,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Experiment Antibodies",
"query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Antibodies:Experiment_RID)?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Experiment Custom Metadata",
"query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Custom_Metadata:Experiment_RID)?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Experiment Settings",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Settings:Experiment_RID)/RID,Experiment_RID,Alignment_Format,Aligner,Aligner_Version,Reference_Genome,Sequence_Trimming,Duplicate_Removal,Pre-alignment_Sequence_Removal,Junction_Reads,Library_Type,Protocol_Reference,Library_Selection,Quantification_Format,Quantification_Software,Expression_Metric,Transcriptome_Model,Sequencing_Platform,Paired_End,Read_Length,Has_Strand_Specific_Information,Used_Spike_Ins,Spike_Ins_Amount,Visualization_Format,Visualization_Software,Visualization_Version,Visualization_Setting,Notes,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Replicate",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/RID,Study_RID,Experiment_RID,Biological_Replicate_Number,Technical_Replicate_Number,Specimen_RID,Collection_Date,Mapped_Reads,GEO_Sample_Accession_ID,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Specimen",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/S:=(Specimen_RID)=(Gene_Expression:Specimen:RID)/T:=left(Stage_ID)=(Vocabulary:Developmental_Stage:ID)/$S/RID,Title,Species,Stage_ID,Stage_Name:=T:Name,Stage_Detail,Assay_Type,Strain,Wild_Type,Sex,Passage,Phenotype,Cell_Line,Parent_Specimen,Upload_Notes,Preparation,Fixation,Embedding,Internal_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT,GUDMAP2_Accession_ID?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Specimen_Anatomical_Source",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Tissue:Specimen_RID)/RID,Specimen_RID,Tissue,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Specimen_Cell_Types",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Cell_Type:Specimen)/RID,Specimen_RID:=Specimen,Cell_Type,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "Single Cell Metrics",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:Single_Cell_Metrics:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Reads_%28Millions%29,Reads%2FCell,Detected_Gene_Count,Genes%2FCell,UMI%2FCell,Estimated_Cell_Count,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
}
},
{
"processor": "csv",
"processor_params": {
"output_path": "File",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Caption,File_Type,File_Name,URI,File_size,MD5,GEO_Archival_URL,dbGaP_Accession_ID,Processed,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT,Legacy_File_RID,GUDMAP_NGF_OID,GUDMAP_NGS_OID?limit=none"
}
},
{
"processor": "fetch",
"processor_params": {
"output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}",
"query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none"
}
}
]
}
}
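Each query_path above has {rid} substituted before the bag is materialized, and the same ermrest paths can be previewed over plain HTTP. A minimal sketch, echoing the curl pattern used in the study-split script at the end of this diff (the RID value is a hypothetical placeholder):
RID=16-1ZX4   # hypothetical replicate RID
curl -s "https://www.gudmap.org/ermrest/catalog/2/attribute/M:=RNASeq:Replicate/RID=${RID}/RID,Study_RID,Experiment_RID?limit=none"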
process {
queue = 'default-0ef8afb0-c7ad-11ea-b907-06c94a3c6390'
}
RUN apt-get update && apt-get install -y python3.7 python3-pip
RUN wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b && \
rm Miniconda3-latest-Linux-x86_64.sh
ENV PATH=/miniconda/bin:${PATH}
RUN conda config --add channels defaults && \
conda config --add channels bioconda && \
conda config --add channels conda-forge && \
conda update -n base -c defaults -y conda
RUN pip install --upgrade pip
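A hedged build sketch for a Dockerfile like this one (the tag is borrowed from the container names referenced below; which Dockerfile builds which image is not shown in this diff):
docker build -t bicf/python3:2.0.1_indev .   # tag name is an assumption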
@@ -2,4 +2,98 @@ profiles {
standard {
includeConfig 'conf/biohpc.config'
}
biohpc {
includeConfig 'conf/biohpc.config'
}
biohpc_max {
includeConfig 'conf/biohpc_max.config'
}
aws_ondemand {
includeConfig 'conf/aws.config'
includeConfig 'conf/ondemand.config'
}
aws_spot {
includeConfig 'conf/aws.config'
includeConfig 'conf/spot.config'
}
}
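With these profiles in place, a run selects its configuration via -profile; a minimal sketch using the mainScript declared in the manifest later in this config (the --repRID value is a placeholder):
nextflow run workflow/rna-seq.nf -profile biohpc --repRID <replicate RID>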
process {
withName:getBag {
container = 'bicf/gudmaprbkfilexfer:2.0.1_indev'
}
withName:getData {
container = 'bicf/gudmaprbkfilexfer:2.0.1_indev'
}
withName: parseMetadata {
container = 'bicf/python3:2.0.1_indev'
}
withName: trimData {
container = 'bicf/trimgalore:1.1'
}
withName: getRefInfer {
container = 'bicf/awscli:1.1'
}
withName: downsampleData {
container = 'bicf/seqtk:2.0.1_indev'
}
withName: alignSampleData {
container = 'bicf/gudmaprbkaligner:2.0.1_indev'
}
withName: inferMetadata {
container = 'bicf/rseqc3.0:2.0.1_indev'
}
withName: getRef {
container = 'bicf/awscli:1.1'
}
withName: alignData {
container = 'bicf/gudmaprbkaligner:2.0.1_indev'
}
withName: dedupData {
container = 'bicf/gudmaprbkdedup:2.0.0'
}
withName: countData {
container = 'bicf/subread2:2.0.0'
}
withName: makeBigWig {
container = 'bicf/deeptools3.3:2.0.1_indev'
}
withName: fastqc {
container = 'bicf/fastqc:2.0.1_indev'
}
withName: dataQC {
container = 'bicf/rseqc3.0:2.0.1_indev'
}
withName: aggrQC {
container = 'bicf/multiqc1.8:2.0.1_indev'
}
}
trace {
enabled = true
file = 'pipeline_trace.txt'
fields = 'task_id,native_id,process,name,status,exit,submit,start,complete,duration,realtime,%cpu,%mem,rss'
}
timeline {
enabled = true
file = 'timeline.html'
}
report {
enabled = true
file = 'report.html'
}
tower {
accessToken = '3ade8f325d4855434b49aa387421a44c63e3360f'
enabled = true
}
manifest {
homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.'
mainScript = 'rna-seq.nf'
version = 'v0.0.1'
nextflowVersion = '>=19.09.0'
}
workflow/rna-seq.nf 100755 → 100644
#!/bin/bash
if [ -z "${3}" ]
then
bdbag --resolve-fetch all --fetch-filter filename\$*fastq.gz ${1}
for i in $(find */ -name "*R*.fastq.gz")
do
path=${2}.$(echo ${i##*/} | grep -o "R[12]\.fastq\.gz")
cp ${i} ./${path}
done
elif [ "${3}" == "TEST" ]
then
bdbag --resolve-fetch all --fetch-filter filename\$*.txt ${1}
fi
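A hedged invocation sketch for this fetch script (the script name is assumed; per the code above, ${1} is the bag, ${2} is the output fastq prefix, and an optional third argument of TEST switches to fetching *.txt files instead):
bash bdbagFetch.sh Replicate_16-1ZX4 16-1ZX4        # hypothetical script and bag names
bash bdbagFetch.sh Replicate_16-1ZX4 16-1ZX4 TEST   # TEST mode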
gc()
library(optparse)
option_list=list(
make_option("--count",action="store",type='character',help="Count File")
)
opt=parse_args(OptionParser(option_list=option_list))
rm(option_list)
if (!("count" %in% names(opt))){
stop("No count file passed, exiting.")
} else if (!file.exists(opt$count)) {
stop("Count file doesn't exist, exiting.")
}
repRID <- basename(gsub(".featureCounts","",opt$count))
count <- read.delim(opt$count, comment.char="#") # if featureCounts file changes structure, be sure to update count and Length columns below
colnames(count)[7] <- "count"
rpk <- count$count/count$Length/1000 # per-base read density; any constant scaling factor cancels in the TPM normalization below
scale <- sum(rpk)/1000000
tpm <- rpk/scale
output <- cbind(count,tpm)
colnames(output)[7] <- "count"
write.table(output,file=paste0(repRID,".countTable.csv"),sep=",",row.names=FALSE,quote=FALSE)
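A hedged usage sketch for this TPM calculation (script filename assumed; --count expects a featureCounts table named <repRID>.featureCounts, per the basename/gsub above):
Rscript calculateTPM.R --count ./16-1ZX4.featureCounts   # hypothetical names; writes 16-1ZX4.countTable.csv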
gc()
library(optparse)
option_list=list(
make_option("--repRID",action="store",type='character',help="Replicate RID")
)
opt=parse_args(OptionParser(option_list=option_list))
rm(option_list)
countTable <- read.csv(paste0(opt$repRID,".countData.countTable.csv"), stringsAsFactors=FALSE)
geneID <- read.delim("geneID.tsv", header=FALSE, stringsAsFactors=FALSE)
Entrez <- read.delim("Entrez.tsv", header=FALSE, stringsAsFactors=FALSE)
convert <- data.frame(geneID=countTable$Geneid)
convert <- merge(x=convert,y=geneID[,1:2],by.x="geneID",by.y="V2",all.x=TRUE)
convert <- merge(x=convert,y=Entrez,by.x="V1",by.y="V1",all.x=TRUE)
convert[is.na(convert$V2),3] <- ""
convert <- convert[,-1]
colnames(convert) <- c("GeneID","EntrezID")
convert <- unique(convert)
output <- merge(x=convert,y=countTable,by.x="GeneID",by.y="Geneid",all.x=TRUE)
write.table(output,file=paste0(opt$repRID,".tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE)
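A hedged usage sketch for the gene-ID conversion step (script filename assumed; it expects <repRID>.countData.countTable.csv, geneID.tsv, and Entrez.tsv in the working directory, per the read calls above):
Rscript convertGeneSymbols.R --repRID 16-1ZX4   # hypothetical names; writes 16-1ZX4.tpmTable.csv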
#!/bin/bash
if [ "${1}" == "endness" ]
then
awk '/Data/ {print}' "${2}" | sed -e 's/^This is //' -e 's/ Data$//'
elif [ "${1}" == "fail" ]
then
awk '/Fraction of reads failed/ {print}' "${2}" | sed -e 's/^Fraction of reads failed to determine: //'
elif [ "${1}" == "sef" ]
then
awk '/\+\+,--/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "++,--": //'
elif [ "${1}" == "ser" ]
then
awk '/\+-,-\+/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "+-,-+": //'
elif [ "${1}" == "pef" ]
then
awk '/1\+\+,1--,2\+-,2-\+/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "1++,1--,2+-,2-+": //'
elif [ "${1}" == "per" ]
then
awk '/1\+-,1-\+,2\+\+,2--/ {print}' "${2}" | sed -e 's/^Fraction of reads explained by "1+-,1-+,2++,2--": //'
fi
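A hedged usage sketch for this parser (script filename assumed; the first argument selects which value to pull out of an RSeQC infer_experiment.py log):
bash parseInfer.sh endness 16-1ZX4.infer_experiment.txt   # hypothetical names; prints e.g. 'PairEnd'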
#!/usr/bin/env python3
import argparse
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('-r', '--repRID',help="The replicate RID.",required=True)
parser.add_argument('-m', '--metaFile',help="The metadata file to extract.",required=True)
parser.add_argument('-p', '--parameter',help="The parameter to extract.",required=True)
args = parser.parse_args()
return args
def main():
args = get_args()
metaFile = pd.read_csv(args.metaFile,sep=",",header=0)
# Check replicate RID metadata from 'File.csv'
if (args.parameter == "repRID"):
if (len(metaFile.Replicate_RID.unique()) > 1):
print("There are multiple replicate RID's in the metadata: " + " ".join(metaFile.Replicate_RID.unique()))
exit(1)
if not (metaFile.Replicate_RID.unique() == args.repRID):
print("Replicate RID in metadata does not match run parameters: " + metaFile.Replicate_RID.unique() + " vs " + args.repRID)
exit(1)
else:
rep=metaFile["Replicate_RID"].unique()[0]
print(rep)
if (len(metaFile[metaFile["File_Type"] == "FastQ"]) > 2):
print("There are more then 2 fastq's in the metadata: " + " ".join(metaFile[metaFile["File_Type"] == "FastQ"].RID))
exit(1)
# Check experiment RID metadata from 'Experiment.csv'
if (args.parameter == "expRID"):
if (len(metaFile.Experiment_RID.unique()) > 1):
print("There are multiple experoment RID's in the metadata: " + " ".join(metaFile.Experiment_RID.unique()))
exit(1)
else:
exp=metaFile["Experiment_RID"].unique()[0]
print(exp)
# Check study RID metadata from 'Experiment.csv'
if (args.parameter == "studyRID"):
if (len(metaFile.Study_RID.unique()) > 1):
print("There are multiple study RID's in the metadata: " + " ".join(metaFile.Study_RID.unique()))
exit(1)
else:
study=metaFile["Study_RID"].unique()[0]
print(study)
# Get endedness metadata from 'Experiment Settings.csv'
if (args.parameter == "endsMeta"):
if (metaFile.Paired_End.unique() == "Single End"):
endsMeta = "se"
elif (metaFile.Paired_End.unique() == "Paired End"):
endsMeta = "pe"
else:
endsMeta = "uk"
print(endsMeta)
# Manually determine endedness from the FastQ count in 'File.csv'
if (args.parameter == "endsManual"):
if (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 1):
endsManual = "se"
elif (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 2):
endsManual = "pe"
else:
endsManual = "uk" # fall back to unknown so the print below never hits an undefined name
print(endsManual)
# Get strandedness metadata from 'Experiment Settings.csv'
if (args.parameter == "stranded"):
if (metaFile.Has_Strand_Specific_Information.unique() == "yes"):
stranded = "stranded"
elif (metaFile.Has_Strand_Specific_Information.unique() == "no"):
stranded = "unstranded"
else:
print("Stranded metadata not match expected options: " + metaFile.Has_Strand_Specific_Information.unique())
exit(1)
print(stranded)
# Get spike-in metadata from 'Experiment Settings.csv'
if (args.parameter == "spike"):
if (metaFile.Used_Spike_Ins.unique() == "yes"):
spike = "yes"
elif (metaFile.Used_Spike_Ins.unique() == "no"):
spike = "no"
else:
print("Spike-ins metadata not match expected options: " + metaFile.Used_Spike_Ins.unique())
exit(1)
print(spike)
# Get species metadata from 'Experiment.csv'
if (args.parameter == "species"):
if (metaFile.Species.unique() == "Mus musculus"):
species = "Mus musculus"
elif (metaFile.Species.unique() == "Homo sapiens"):
species = "Homo sapiens"
else:
print("Species metadata not match expected options: " + metaFile.Species.unique())
exit(1)
print(species)
# Get read length metadata from 'Experiment Settings.csv'
if (args.parameter == "readLength"):
readLength = metaFile.Read_Length.unique()
print(str(readLength).strip('[]'))
if __name__ == '__main__':
main()
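A hedged usage sketch (script filename assumed; the CSVs named in the comments above come out of the bag export spec earlier in this diff):
python3 parseMetadata.py -r 16-1ZX4 -m File.csv -p endsManual   # hypothetical names; prints 'se' or 'pe'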
#!/bin/bash
while read loc checksum fileLocation
do
file=$(echo ${fileLocation##*/})
fileName=$(echo ${file%.R*.fastq.gz})
fileExt=$(echo ${file##${fileName}.})
while IFS="," read RID Study_RID Experiment_RID Replicate_RID Caption File_Type File_Name URI File_size MD5 GEO_Archival_URL dbGaP_Accession_ID Processed Notes Principal_Investigator Consortium Release_Date RCT RMT Legacy_File_RID GUDMAP_NGF_OID GUDMAP_NGS_OID
do
if [ ${file} == ${File_Name} ]
then
find . -type f -name ${file} -execdir mv {} ${Replicate_RID}.${fileExt} ';'
fi
done < $1/data/File.csv
done < $1/fetch.txt
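A hedged invocation sketch (script name assumed; the single positional argument is the bag directory holding fetch.txt and data/File.csv, per the redirects above):
bash renameFastqs.sh Replicate_16-1ZX4   # hypothetical names; renames each fastq to <Replicate_RID>.<ext>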
#!/usr/bin/env python3
import argparse
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--fetchFile',help="The fetch file from the bdbag zip.",required=True)
parser.add_argument('-s', '--studyRID',help="The study RID.",required=True)
args = parser.parse_args()
return args
def main():
args = get_args()
fetch = pd.read_csv(args.fetchFile+"/fetch.txt",sep="\t",header=None)
fetch_filtered = fetch[fetch[2].str[-9:]==".fastq.gz"]
fetch_filtered.to_csv(args.fetchFile+"/fetch.txt",sep="\t",header=False,index=False)
studyRID=pd.read_json(args.studyRID+"_studyRID.json")
if studyRID["RID"].count() > 0:
studyRID["RID"].to_csv(args.studyRID+"_studyRID.csv",header=False,index=False)
else:
raise Exception("No associated replicates found: %s" %
studyRID)
if __name__ == '__main__':
main()
#!/bin/bash
# query GUDMAP/RBK for study RID
echo "curl --location --request GET 'https://www.gudmap.org/ermrest/catalog/2/entity/RNASeq:Replicate/Study_RID="${1}"'" | bash > $1_studyRID.json
# extract replicate RIDs
module load python/3.6.4-anaconda
python3 ./workflow/scripts/splitStudy.py -s $1
# run pipeline on replicate RIDs in parallel
module load nextflow/20.01.0
module load singularity/3.5.3
while read repRID; do echo ${repRID}; done < "$1_studyRID.csv" | xargs -P 5 -I {} nextflow run workflow/rna-seq.nf --repRID {}
# cleanup study RID files
rm $1_studyRID.json
rm $1_studyRID.csv
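A hedged usage sketch for this study-level wrapper (the shell script's path is assumed to sit beside workflow/scripts/splitStudy.py referenced above):
bash workflow/scripts/splitStudy.sh <Study_RID>   # runs up to 5 replicates in parallel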