diff --git a/conf/.gitkeep b/conf/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/conf/Execution_Run_For_Output_Bag.json b/conf/Execution_Run_For_Output_Bag.json new file mode 100755 index 0000000000000000000000000000000000000000..5945b1eb8c4c5e3ec862840f232ed7a8e386d770 --- /dev/null +++ b/conf/Execution_Run_For_Output_Bag.json @@ -0,0 +1,64 @@ +{ + "bag": { + "bag_name": "Execution_Run_{rid}", + "bag_algorithms": [ + "md5" + ], + "bag_archiver": "zip", + "bag_metadata": {} + }, + "catalog": { + "catalog_id": "2", + "query_processors": [ + { + "processor": "csv", + "processor_params": { + "output_path": "Execution_Run", + "query_path": "/attribute/M:=RNASeq:Execution_Run/RID=17-BPAG/RID,Replicate_RID:=Replicate,Workflow_RID:=Workflow,Reference_Genone_RID:=Reference_Genome,Input_Bag_RID:=Input_Bag,Notes,Execution_Status,Execution_Status_Detail,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Workflow", + "query_path": "/entity/M:=RNASeq:Execution_Run/RID=17-BPAG/RNASeq:Workflow?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Reference_Genome", + "query_path": "/entity/M:=RNASeq:Execution_Run/RID=17-BPAG/RNASeq:Reference_Genome?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Input_Bag", + "query_path": "/entity/M:=RNASeq:Execution_Run/RID=17-BPAG/RNASeq:Input_Bag?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "mRNA_QC", + "query_path": "/attribute/M:=RNASeq:Execution_Run/RID=17-BPAG/(RID)=(RNASeq:mRNA_QC:Execution_Run)/RID,Execution_Run_RID:=Execution_Run,Replicate_RID:=Replicate,Paired_End,Strandedness,Median_Read_Length,Raw_Count,Final_Count,Notes,RCT,RMT?limit=none" + } + }, + { + "processor": "fetch", + "processor_params": { + "output_path": 
"assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}/Execution_Run/{Execution_Run_RID}/Output_Files", + "query_path": "/attribute/M:=RNASeq:Execution_Run/RID=17-BPAG/R:=RNASeq:Replicate/$M/(RID)=(RNASeq:Processed_File:Execution_Run)/url:=File_URL,length:=File_Bytes,filename:=File_Name,md5:=File_MD5,Execution_Run_RID:=M:RID,Study_RID:=R:Study_RID,Experiment_RID:=R:Experiment_RID,Replicate_RID:=R:RID?limit=none" + } + }, + { + "processor": "fetch", + "processor_params": { + "output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}/Execution_Run/{Execution_Run_RID}/Input_Bag", + "query_path": "/attribute/M:=RNASeq:Execution_Run/RID=17-BPAG/R:=RNASeq:Replicate/$M/RNASeq:Input_Bag/url:=File_URL,length:=File_Bytes,filename:=File_Name,md5:=File_MD5,Execution_Run_RID:=M:RID,Study_RID:=R:Study_RID,Experiment_RID:=R:Experiment_RID,Replicate_RID:=R:RID?limit=none" + } + } + ] + } +} \ No newline at end of file diff --git a/conf/Replicate_For_Input_Bag.json b/conf/Replicate_For_Input_Bag.json new file mode 100644 index 0000000000000000000000000000000000000000..508a0245051534fae39020792719b04d78947613 --- /dev/null +++ b/conf/Replicate_For_Input_Bag.json @@ -0,0 +1,97 @@ +{ + "bag": { + "bag_name": "{rid}_inputBag", + "bag_algorithms": [ + "md5" + ], + "bag_archiver": "zip" + }, + "catalog": { + "query_processors": [ + { + "processor": "csv", + "processor_params": { + "output_path": "Study", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Study_RID)=(RNASeq:Study:RID)/Study_RID:=RID,Internal_ID,Title,Summary,Overall_Design,GEO_Series_Accession_ID,GEO_Platform_Accession_ID,Funding,Pubmed_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Experiment", + "query_path": 
"/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment:RID)/Experiment_RID:=RID,Study_RID,Internal_ID,Name,Description,Experiment_Method,Experiment_Type,Species,Specimen_Type,Molecule_Type,Pooled_Sample,Pool_Size,Markers,Cell_Count,Treatment_Protocol,Treatment_Protocol_Reference,Isolation_Protocol,Isolation_Protocol_Reference,Growth_Protocol,Growth_Protocol_Reference,Label_Protocol,Label_Protocol_Reference,Hybridization_Protocol,Hybridization_Protocol_Reference,Scan_Protocol,Scan_Protocol_Reference,Data_Processing,Value_Definition,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Experiment Antibodies", + "query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Antibodies:Experiment_RID)?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Experiment Custom Metadata", + "query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Custom_Metadata:Experiment_RID)?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Experiment Settings", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Settings:Experiment_RID)/RID,Experiment_RID,Alignment_Format,Aligner,Aligner_Version,Reference_Genome,Sequence_Trimming,Duplicate_Removal,Pre-alignment_Sequence_Removal,Junction_Reads,Library_Type,Protocol_Reference,Library_Selection,Quantification_Format,Quantification_Software,Expression_Metric,Transcriptome_Model,Sequencing_Platform,Paired_End,Read_Length,Strandedness,Used_Spike_Ins,Spike_Ins_Amount,Visualization_Format,Visualization_Software,Visualization_Version,Visualization_Setting,Notes,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Replicate", + "query_path": 
"/attribute/M:=RNASeq:Replicate/RID={rid}/RID,Study_RID,Experiment_RID,Biological_Replicate_Number,Technical_Replicate_Number,Specimen_RID,Collection_Date,Mapped_Reads,GEO_Sample_Accession_ID,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Specimen", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/S:=(Specimen_RID)=(Gene_Expression:Specimen:RID)/T:=left(Stage_ID)=(Vocabulary:Developmental_Stage:ID)/$S/RID,Title,Species,Stage_ID,Stage_Name:=T:Name,Stage_Detail,Assay_Type,Strain,Wild_Type,Sex,Passage,Phenotype,Cell_Line,Parent_Specimen,Upload_Notes,Preparation,Fixation,Embedding,Internal_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT,GUDMAP2_Accession_ID?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Specimen_Anatomical_Source", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Tissue:Specimen_RID)/RID,Specimen_RID,Tissue,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Specimen_Cell_Types", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Cell_Type:Specimen)/RID,Specimen_RID:=Specimen,Cell_Type,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "Single Cell Metrics", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:Single_Cell_Metrics:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Reads_%28Millions%29,Reads%2FCell,Detected_Gene_Count,Genes%2FCell,UMI%2FCell,Estimated_Cell_Count,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none" + } + }, + { + "processor": "csv", + "processor_params": { + "output_path": "File", + "query_path": 
"/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Caption,File_Type,File_Name,URI,File_size,MD5,GEO_Archival_URL,dbGaP_Accession_ID,Processed,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT,Legacy_File_RID,GUDMAP_NGF_OID,GUDMAP_NGS_OID?limit=none" + } + }, + { + "processor": "fetch", + "processor_params": { + "output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}", + "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/File_Type=FastQ/File_Name::ciregexp::%5B_.%5DR%5B12%5D%5C.fastq%5C.gz/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none" + } + } + ] + } +} diff --git a/conf/aws.config b/conf/aws.config new file mode 100644 index 0000000000000000000000000000000000000000..bf5b59c7cf9db00606a5db9f97c706d53f21137f --- /dev/null +++ b/conf/aws.config @@ -0,0 +1,127 @@ +params { + refSource = "aws" +} + +workDir = 's3://gudmap-rbk.output/work' +aws.client.storageEncryption = 'AES256' +aws { + region = 'us-east-2' + batch { + cliPath = '/home/ec2-user/miniconda/bin/aws' + } +} + +process { + executor = 'awsbatch' + cpus = 1 + memory = '1 GB' + + withName:trackStart { + cpus = 1 + memory = '1 GB' + } + withName:getBag { + cpus = 1 + memory = '1 GB' + } + withName:getData { + cpus = 1 + memory = '1 GB' + } + withName:parseMetadata { + cpus = 15 + memory = '1 GB' + } + withName:trimData { + cpus = 20 + memory = '2 GB' + } + withName:getRefInfer { + cpus = 1 + memory = '1 GB' + } + withName:downsampleData { + cpus = 1 + memory = '1 GB' + } + withName:alignSampleData { + cpus = 50 + memory = '5 GB' + } + withName:inferMetadata { + cpus = 5 + memory = '1 GB' + } + withName:checkMetadata { + cpus = 1 + memory = '1 GB' + } + withName:getRef { + cpus = 1 + memory = '1 GB' + } + withName:alignData { + cpus = 50 + memory = '10 GB' + } + withName:dedupData { + cpus = 5 + 
memory = '20 GB' + } + withName:countData { + cpus = 2 + memory = '5 GB' + } + withName:makeBigWig { + cpus = 15 + memory = '5 GB' + } + withName:fastqc { + cpus = 1 + memory = '1 GB' + } + withName:dataQC { + cpus = 15 + memory = '2 GB' + } + withName:aggrQC { + cpus = 2 + memory = '1 GB' + } + withName:uploadInputBag { + cpus = 1 + memory = '1 GB' + } + withName:uploadExecutionRun { + cpus = 1 + memory = '1 GB' + } + withName:uploadQC { + cpus = 1 + memory = '1 GB' + } + withName:uploadProcessedFile { + cpus = 1 + memory = '1 GB' + } + withName:uploadOutputBag { + cpus = 1 + memory = '1 GB' + } + withName:finalizeExecutionRun { + cpus = 1 + memory = '1 GB' + } + withName:failPreExecutionRun { + cpus = 1 + memory = '1 GB' + } + withName:failExecutionRun { + cpus = 1 + memory = '1 GB' + } + withName:uploadQC_fail { + cpus = 1 + memory = '1 GB' + } +} diff --git a/conf/biohpc.config b/conf/biohpc.config new file mode 100755 index 0000000000000000000000000000000000000000..a12f2a704b3c63df9031789c2bb05d11e04d6b3a --- /dev/null +++ b/conf/biohpc.config @@ -0,0 +1,105 @@ +params { + refSource = "biohpc" +} + +process { + executor = 'slurm' + queue = 'super' + clusterOptions = '--hold' + time = '4h' + errorStrategy = 'retry' + maxRetries = 1 + + withName:trackStart { + executor = 'local' + } + withName:getBag { + executor = 'local' + } + withName:getData { + queue = 'super' + } + withName:parseMetadata { + executor = 'local' + } + withName:trimData { + queue = 'super' + } + withName:getRefInfer { + queue = 'super' + } + withName:downsampleData { + executor = 'local' + } + withName:alignSampleData { + queue = '128GB,256GB,256GBv1,384GB' + } + withName:inferMetadata { + queue = 'super' + } + withName:checkMetadata { + executor = 'local' + } + withName:getRef { + queue = 'super' + } + withName:alignData { + queue = '256GB,256GBv1' + } + withName:dedupData { + queue = 'super' + } + withName:countData { + queue = 'super' + } + withName:makeBigWig { + queue = 'super' + } + 
withName:fastqc { + queue = 'super' + } + withName:dataQC { + queue = 'super' + } + withName:aggrQC { + executor = 'local' + } + withName:uploadInputBag { + executor = 'local' + } + withName:uploadExecutionRun { + executor = 'local' + } + withName:uploadQC { + executor = 'local' + } + withName:uploadProcessedFile { + executor = 'local' + } + withName:uploadOutputBag { + executor = 'local' + } + withName:finalizeExecutionRun { + executor = 'local' + } + withName:failPreExecutionRun { + executor = 'local' + } + withName:failExecutionRun { + executor = 'local' + } + withName:uploadQC_fail { + executor = 'local' + } +} + +singularity { + enabled = true + cacheDir = '/project/BICF/BICF_Core/shared/gudmap/singularity_cache/' +} + +env { + http_proxy = 'http://proxy.swmed.edu:3128' + https_proxy = 'http://proxy.swmed.edu:3128' + all_proxy = 'http://proxy.swmed.edu:3128' +} diff --git a/conf/biohpc_max.config b/conf/biohpc_max.config new file mode 100755 index 0000000000000000000000000000000000000000..0e93ccf6a0be4c15c076ab6eb955a4bb39d96120 --- /dev/null +++ b/conf/biohpc_max.config @@ -0,0 +1,16 @@ +process { + executor = 'slurm' + queue = '256GB,256GBv1,384GB,128GB' + clusterOptions = '--hold' +} + +singularity { + enabled = true + cacheDir = '/project/BICF/BICF_Core/shared/gudmap/singularity_cache/' +} + +env { + http_proxy = 'http://proxy.swmed.edu:3128' + https_proxy = 'http://proxy.swmed.edu:3128' + all_proxy = 'http://proxy.swmed.edu:3128' +} diff --git a/conf/multiqc_config.yaml b/conf/multiqc_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed1375aed47a454394029e5057695b0c15babd8c --- /dev/null +++ b/conf/multiqc_config.yaml @@ -0,0 +1,180 @@ +custom_logo: './bicf_logo.png' +custom_logo_url: 'https://utsouthwestern.edu/labs/bioinformatics/' +custom_logo_title: 'Bioinformatics Core Facility' + +report_header_info: + - Contact Email: 'bicf@utsouthwestern.edu' + - Application Type: 'RNA-Seq Analytic Pipeline for GUDMAP/RBK' + - 
Department: 'Bioinformatics Core Facility, Department of Bioinformatics, University of Texas Southwestern Medical Center' + +title: RNA-Seq Analytic Pipeline for GUDMAP/RBK + +report_comment: > + This report has been generated by the <a href="https://doi.org/10.5281/zenodo.3625056">GUDMAP/RBK RNA-Seq Pipeline</a> + +top_modules: + - fastqc: + name: 'Raw' + info: 'Replicate Raw fastq QC Results' + - cutadapt: + name: 'Trim' + info: 'Replicate Trim Adapter QC Results' + - hisat2: + name: 'Align' + info: 'Replicate Alignment QC Results' + path_filters: + - '*alignSummary*' + - picard: + name: 'Dedup' + info: 'Replicate Alignment Deduplication QC Results' + - rseqc: + name: 'Inner Distance' + info: 'Replicate Paired End Inner Distance Distribution Results' + path_filters: + - '*insertSize*' + - custom_content + - featureCounts: + name: 'Count' + info: 'Replicate Feature Count QC Results' + - hisat2: + name: 'Inference: Align' + info: 'Inference Alignment (1M downsampled reads) QC Results' + path_filters: + - '*alignSampleSummary*' + - rseqc: + name: 'Inference: Stranded' + info: '1M Downsampled Reads Strandedness Inference Results' + path_filters: + - '*infer_experiment*' + +report_section_order: + run: + order: 4000 + rid: + order: 3000 + meta: + order: 2000 + ref: + order: 1000 + software_versions: + order: -1000 + software_references: + order: -2000 + +skip_generalstats: true + +custom_data: + run: + file_format: 'tsv' + section_name: 'Run' + description: 'This is the run information' + plot_type: 'table' + pconfig: + id: 'run' + scale: false + format: '{}' + headers: + Session: + description: '' + Session ID: + description: 'Nextflow session ID' + Pipeline Version: + description: 'BICF pipeline version' + Input: + description: 'Input overrides' + rid: + file_format: 'tsv' + section_name: 'RID' + description: 'These are the identifying RIDs' + plot_type: 'table' + pconfig: + id: 'rid' + scale: false + format: '{}' + headers: + Replicate: + description: '' + Replicate 
RID: + description: 'Replicate RID' + Experiment RID: + description: 'Experiment RID' + Study RID: + description: 'Study RID' + meta: + file_format: 'tsv' + section_name: 'Metadata' + description: 'This is the comparison of inferred metadata, submitter provided, and calculated' + plot_type: 'table' + pconfig: + id: 'meta' + scale: false + format: '{:,.0f}' + headers: + Source: + description: 'Metadata source' + Species: + description: 'Species' + Ends: + description: 'Single or paired end sequencing' + Stranded: + description: 'Stranded (forward/reverse) or unstranded library prep' + Spike-in: + description: 'ERCC spike in' + Raw Reads: + description: 'Number of reads from the sequencer' + Assigned Reads: + description: 'Final reads after filtering' + Median Read Length: + description: 'Median read length' + Median TIN: + description: 'Median transcript integrity number' + + ref: + file_format: 'tsv' + section_name: 'Reference' + description: 'This is the reference version information' + plot_type: 'table' + pconfig: + id: 'ref' + scale: false + format: '{}' + headers: + Species: + description: 'Reference species' + Genome Reference Consortium Build: + description: 'Reference source build' + Genome Reference Consortium Patch: + description: 'Reference source patch version' + GENCODE Annotation Release: + description: 'Annotation release version' + tin: + file_format: 'tsv' + section_name: 'TIN' + description: 'This is the distribution of TIN values calculated by the tool RSeQC' + plot_type: 'bargraph' + pconfig: + id: 'tin' + headers: + chrom + 1 - 10 + 11 - 20 + 21 - 30 + 31 - 40 + 41 - 50 + 51 - 60 + 61 - 70 + 71 - 80 + 81 - 90 + 91 - 100 + +sp: + run: + fn: "run.tsv" + rid: + fn: 'rid.tsv' + meta: + fn: 'metadata.tsv' + ref: + fn: 'reference.tsv' + tin: + fn: '*_tin.hist.tsv' diff --git a/conf/ondemand.config b/conf/ondemand.config new file mode 100755 index 0000000000000000000000000000000000000000..131fdbb19e1fedf1bc9e206a03d801f13791b810 --- /dev/null +++ 
b/conf/ondemand.config @@ -0,0 +1,3 @@ +process { + queue = 'highpriority-0ef8afb0-c7ad-11ea-b907-06c94a3c6390' +} diff --git a/conf/spot.config b/conf/spot.config new file mode 100755 index 0000000000000000000000000000000000000000..d9c7a4c8fa34aadd597da0170f8e3e223923011a --- /dev/null +++ b/conf/spot.config @@ -0,0 +1,3 @@ +process { + queue = 'default-0ef8afb0-c7ad-11ea-b907-06c94a3c6390' +}