Merge branch 'develop' into 'dnanexus'

# Conflicts: # nextflow.config # rna-seq.nf

Merge branch 'develop' into 'dnanexus'
# Conflicts: # nextflow.config # rna-seq.nf
9c320c12 · Gervaise Henry · 53c18086 · a566d140 · 9c320c12 · 9c320c12
Commit 9c320c12 authored 4 years ago by Gervaise Henry
--- a/conf/.gitkeep
+++ b/conf/.gitkeep
--- a/conf/Execution_Run_For_Output_Bag.json
+++ b/conf/Execution_Run_For_Output_Bag.json
+{
+  "bag": {
+    "bag_name": "Execution_Run_{rid}",
+    "bag_algorithms": [
+      "md5"
+    ],
+    "bag_archiver": "zip",
+    "bag_metadata": {}
+  },
+  "catalog": {
+    "catalog_id": "2",
+    "query_processors": [
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Execution_Run",
+          "query_path": "/attribute/M:=RNASeq:Execution_Run/RID={rid}/RID,Replicate_RID:=Replicate,Workflow_RID:=Workflow,Reference_Genone_RID:=Reference_Genome,Input_Bag_RID:=Input_Bag,Notes,Execution_Status,Execution_Status_Detail,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Workflow",
+          "query_path": "/entity/M:=RNASeq:Execution_Run/RID={rid}/RNASeq:Workflow?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Reference_Genome",
+          "query_path": "/entity/M:=RNASeq:Execution_Run/RID={rid}/RNASeq:Reference_Genome?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Input_Bag",
+          "query_path": "/entity/M:=RNASeq:Execution_Run/RID={rid}/RNASeq:Input_Bag?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "mRNA_QC",
+          "query_path": "/attribute/M:=RNASeq:Execution_Run/RID={rid}/(RID)=(RNASeq:mRNA_QC:Execution_Run)/RID,Execution_Run_RID:=Execution_Run,Replicate_RID:=Replicate,Paired_End,Strandedness,Median_Read_Length,Raw_Count,Final_Count,Notes,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "fetch",
+        "processor_params": {
+          "output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}/Execution_Run/{Execution_Run_RID}/Output_Files",
+          "query_path": "/attribute/M:=RNASeq:Execution_Run/RID={rid}/R:=RNASeq:Replicate/$M/(RID)=(RNASeq:Processed_File:Execution_Run)/url:=File_URL,length:=File_Bytes,filename:=File_Name,md5:=File_MD5,Execution_Run_RID:=M:RID,Study_RID:=R:Study_RID,Experiment_RID:=R:Experiment_RID,Replicate_RID:=R:RID?limit=none"
+        }
+      },
+      {
+        "processor": "fetch",
+        "processor_params": {
+          "output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}/Execution_Run/{Execution_Run_RID}/Input_Bag",
+          "query_path": "/attribute/M:=RNASeq:Execution_Run/RID={rid}/R:=RNASeq:Replicate/$M/RNASeq:Input_Bag/url:=File_URL,length:=File_Bytes,filename:=File_Name,md5:=File_MD5,Execution_Run_RID:=M:RID,Study_RID:=R:Study_RID,Experiment_RID:=R:Experiment_RID,Replicate_RID:=R:RID?limit=none"
+        }
+      }
+    ]
+  }
+}
\ No newline at end of file
--- a/conf/Replicate_For_Input_Bag.json
+++ b/conf/Replicate_For_Input_Bag.json
+{
+  "bag": {
+    "bag_name": "{rid}_inputBag",
+    "bag_algorithms": [
+      "md5"
+    ],
+    "bag_archiver": "zip"
+  },
+  "catalog": {
+    "query_processors": [
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Study",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Study_RID)=(RNASeq:Study:RID)/Study_RID:=RID,Internal_ID,Title,Summary,Overall_Design,GEO_Series_Accession_ID,GEO_Platform_Accession_ID,Funding,Pubmed_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment:RID)/Experiment_RID:=RID,Study_RID,Internal_ID,Name,Description,Experiment_Method,Experiment_Type,Species,Specimen_Type,Molecule_Type,Pooled_Sample,Pool_Size,Markers,Cell_Count,Treatment_Protocol,Treatment_Protocol_Reference,Isolation_Protocol,Isolation_Protocol_Reference,Growth_Protocol,Growth_Protocol_Reference,Label_Protocol,Label_Protocol_Reference,Hybridization_Protocol,Hybridization_Protocol_Reference,Scan_Protocol,Scan_Protocol_Reference,Data_Processing,Value_Definition,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment Antibodies",
+          "query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Antibodies:Experiment_RID)?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment Custom Metadata",
+          "query_path": "/entity/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Custom_Metadata:Experiment_RID)?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Experiment Settings",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Experiment_RID)=(RNASeq:Experiment_Settings:Experiment_RID)/RID,Experiment_RID,Alignment_Format,Aligner,Aligner_Version,Reference_Genome,Sequence_Trimming,Duplicate_Removal,Pre-alignment_Sequence_Removal,Junction_Reads,Library_Type,Protocol_Reference,Library_Selection,Quantification_Format,Quantification_Software,Expression_Metric,Transcriptome_Model,Sequencing_Platform,Paired_End,Read_Length,Strandedness,Used_Spike_Ins,Spike_Ins_Amount,Visualization_Format,Visualization_Software,Visualization_Version,Visualization_Setting,Notes,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Replicate",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/RID,Study_RID,Experiment_RID,Biological_Replicate_Number,Technical_Replicate_Number,Specimen_RID,Collection_Date,Mapped_Reads,GEO_Sample_Accession_ID,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Specimen",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/S:=(Specimen_RID)=(Gene_Expression:Specimen:RID)/T:=left(Stage_ID)=(Vocabulary:Developmental_Stage:ID)/$S/RID,Title,Species,Stage_ID,Stage_Name:=T:Name,Stage_Detail,Assay_Type,Strain,Wild_Type,Sex,Passage,Phenotype,Cell_Line,Parent_Specimen,Upload_Notes,Preparation,Fixation,Embedding,Internal_ID,Principal_Investigator,Consortium,Release_Date,RCT,RMT,GUDMAP2_Accession_ID?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Specimen_Anatomical_Source",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Tissue:Specimen_RID)/RID,Specimen_RID,Tissue,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Specimen_Cell_Types",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(Specimen_RID)=(Gene_Expression:Specimen:RID)/(RID)=(Gene_Expression:Specimen_Cell_Type:Specimen)/RID,Specimen_RID:=Specimen,Cell_Type,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "Single Cell Metrics",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:Single_Cell_Metrics:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Reads_%28Millions%29,Reads%2FCell,Detected_Gene_Count,Genes%2FCell,UMI%2FCell,Estimated_Cell_Count,Principal_Investigator,Consortium,Release_Date,RCT,RMT?limit=none"
+        }
+      },
+      {
+        "processor": "csv",
+        "processor_params": {
+          "output_path": "File",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/RID,Study_RID,Experiment_RID,Replicate_RID,Caption,File_Type,File_Name,URI,File_size,MD5,GEO_Archival_URL,dbGaP_Accession_ID,Processed,Notes,Principal_Investigator,Consortium,Release_Date,RCT,RMT,Legacy_File_RID,GUDMAP_NGF_OID,GUDMAP_NGS_OID?limit=none"
+        }
+      },
+      {
+        "processor": "fetch",
+        "processor_params": {
+          "output_path": "assets/Study/{Study_RID}/Experiment/{Experiment_RID}/Replicate/{Replicate_RID}",
+          "query_path": "/attribute/M:=RNASeq:Replicate/RID={rid}/(RID)=(RNASeq:File:Replicate_RID)/File_Type=FastQ/File_Name::ciregexp::%5B_.%5DR%5B12%5D%5C.fastq%5C.gz/url:=URI,length:=File_size,filename:=File_Name,md5:=MD5,Study_RID,Experiment_RID,Replicate_RID?limit=none"
+        }
+      }
+    ]
+  }
+}
--- a/conf/aws.config
+++ b/conf/aws.config
+// Pipeline parameter: refSource is set to "aws" when this config is loaded.
+params.refSource = "aws"
+
+// Nextflow work directory is an S3 location.
+workDir = 's3://gudmap-rbk.output/work'
+
+// AWS scope: region, S3 client storage encryption, and the AWS CLI path
+// configured for the Batch executor.
+aws {
+  region = 'us-east-2'
+  client {
+    storageEncryption = 'AES256'
+  }
+  batch {
+    cliPath = '/home/ec2-user/miniconda/bin/aws'
+  }
+}
+
+// Process scope for the AWS Batch executor.
+// The first three settings are the defaults applied to every process; each
+// `withName:` selector below sets the cpus/memory request for one named process.
+process {
+  executor = 'awsbatch'
+  cpus = 1
+  memory = '1 GB'
+
+  // Per-process resource requests. Several selectors repeat the default
+  // values (1 CPU / 1 GB) explicitly.
+  withName:trackStart {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName:getBag {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName:getData {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName:parseMetadata {
+    cpus = 15
+    memory = '1 GB'
+  }
+  withName:trimData {
+    cpus = 20
+    memory = '2 GB'
+  }
+  withName:getRefInfer {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName:downsampleData {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName:alignSampleData {
+    cpus = 50
+    memory = '5 GB'
+  }
+  withName:inferMetadata {
+    cpus = 5
+    memory = '1 GB'
+  }
+  withName:checkMetadata {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName:getRef {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName:alignData {
+    cpus = 50
+    memory = '10 GB'
+  }
+  withName:dedupData {
+    cpus = 5
+    memory = '20 GB'
+  }
+  withName:countData {
+    cpus = 2
+    memory = '5 GB'
+  }
+  withName:makeBigWig {
+    cpus = 15
+    memory = '5 GB'
+  }
+  withName:fastqc {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName:dataQC {
+    cpus = 15
+    memory = '2 GB'
+  }
+  withName:aggrQC {
+    cpus = 2
+    memory = '1 GB'
+  }
+  withName:uploadInputBag {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName:uploadExecutionRun {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName:uploadQC {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName:uploadProcessedFile {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName:uploadOutputBag {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName:finalizeExecutionRun {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName:failPreExecutionRun {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName:failExecutionRun {
+    cpus = 1
+    memory = '1 GB'
+  }
+  withName:uploadQC_fail {
+    cpus = 1
+    memory = '1 GB'
+  }
+}
--- a/conf/biohpc.config
+++ b/conf/biohpc.config
+// Pipeline parameter: refSource is set to "biohpc" when this config is loaded.
+params.refSource = "biohpc"
+
+// Process scope for the BioHPC cluster.
+// Defaults: Slurm executor, 'super' queue, 4 hour time limit, and one retry
+// on failure. Each `withName:` selector below overrides either the executor
+// (run locally) or the queue for one named process.
+process {
+  executor = 'slurm'
+  queue = 'super'
+  // NOTE(review): '--hold' is passed through to the cluster submit command;
+  // for Slurm's sbatch this option submits jobs in a held state. Confirm this
+  // is intended.
+  clusterOptions = '--hold'
+  time = '4h'
+  errorStrategy = 'retry'
+  maxRetries = 1
+
+  // Selectors that set `queue = 'super'` repeat the default queue explicitly.
+  withName:trackStart {
+    executor = 'local'
+  }
+  withName:getBag {
+    executor = 'local'
+  }
+  withName:getData {
+    queue = 'super'
+  }
+  withName:parseMetadata {
+    executor = 'local'
+  }
+  withName:trimData {
+    queue = 'super'
+  }
+  withName:getRefInfer {
+    queue = 'super'
+  }
+  withName:downsampleData {
+    executor = 'local'
+  }
+  withName:alignSampleData {
+    queue = '128GB,256GB,256GBv1,384GB'
+  }
+  withName:inferMetadata {
+    queue = 'super'
+  }
+  withName:checkMetadata {
+    executor = 'local'
+  }
+  withName:getRef {
+    queue = 'super'
+  }
+  withName:alignData {
+    queue = '256GB,256GBv1'
+  }
+  withName:dedupData {
+    queue = 'super'
+  }
+  withName:countData {
+    queue = 'super'
+  }
+  withName:makeBigWig {
+    queue = 'super'
+  }
+  withName:fastqc {
+    queue = 'super'
+  }
+  withName:dataQC {
+    queue = 'super'
+  }
+  withName:aggrQC {
+    executor = 'local'
+  }
+  withName:uploadInputBag {
+    executor = 'local'
+  }
+  withName:uploadExecutionRun {
+    executor = 'local'
+  }
+  withName:uploadQC {
+    executor = 'local'
+  }
+  withName:uploadProcessedFile {
+    executor = 'local'
+  }
+  withName:uploadOutputBag {
+    executor = 'local'
+  }
+  withName:finalizeExecutionRun {
+    executor = 'local'
+  }
+  withName:failPreExecutionRun {
+    executor = 'local'
+  }
+  withName:failExecutionRun {
+    executor = 'local'
+  }
+  withName:uploadQC_fail {
+    executor = 'local'
+  }
+}
+
+// Enable Singularity and point it at the configured image cache directory.
+singularity.enabled = true
+singularity.cacheDir = '/project/BICF/BICF_Core/shared/gudmap/singularity_cache/'
+
+// Proxy variables defined in the env scope; all three use the same proxy URL.
+env.http_proxy = 'http://proxy.swmed.edu:3128'
+env.https_proxy = 'http://proxy.swmed.edu:3128'
+env.all_proxy = 'http://proxy.swmed.edu:3128'
--- a/conf/biohpc_max.config
+++ b/conf/biohpc_max.config
+// Process defaults: submit through Slurm to the listed queues.
+process.executor = 'slurm'
+process.queue = '256GB,256GBv1,384GB,128GB'
+process.clusterOptions = '--hold'
+
+// Enable Singularity and point it at the configured image cache directory.
+singularity.enabled = true
+singularity.cacheDir = '/project/BICF/BICF_Core/shared/gudmap/singularity_cache/'
+
+// Proxy variables defined in the env scope; all three use the same proxy URL.
+env.http_proxy = 'http://proxy.swmed.edu:3128'
+env.https_proxy = 'http://proxy.swmed.edu:3128'
+env.all_proxy = 'http://proxy.swmed.edu:3128'
--- a/conf/multiqc_config.yaml
+++ b/conf/multiqc_config.yaml
+# Report branding: logo image, the link attached to it, and its title text.
+custom_logo: './bicf_logo.png'
+# Must be a full URL including the scheme separator ("https://").
+custom_logo_url: 'https://utsouthwestern.edu/labs/bioinformatics/'
+custom_logo_title: 'Bioinformatics Core Facility'
+
+# Key/value entries listed in the report header.
+report_header_info:
+  - Contact Email: 'bicf@utsouthwestern.edu'
+  - Application Type: 'RNA-Seq Analytic Pipeline for GUDMAP/RBK'
+  - Department: 'Bioinformatics Core Facility, Department of Bioinformatics, University of Texas Southwestern Medical Center'
+
+title: RNA-Seq Analytic Pipeline for GUDMAP/RBK
+
+# Free-text comment for the report; contains an HTML link to the pipeline DOI.
+report_comment: >
+  This report has been generated by the <a href="https://doi.org/10.5281/zenodo.3625056">GUDMAP/RBK RNA-Seq Pipeline</a>
+
+# Modules listed explicitly, each with a display name and info text.
+# hisat2 and rseqc appear twice; each instance is restricted to different
+# input files through its own path_filters glob.
+top_modules:
+  - fastqc:
+      name: 'Raw'
+      info: 'Replicate Raw fastq QC Results'
+  - cutadapt:
+      name: 'Trim'
+      info: 'Replicate Trim Adapter QC Results'
+  - hisat2:
+      name: 'Align'
+      info: 'Replicate Alignment QC Results'
+      path_filters:
+        - '*alignSummary*'
+  - picard:
+      name: 'Dedup'
+      info: 'Replicate Alignment Deduplication QC Results'
+  - rseqc:
+      name: 'Inner Distance'
+      info: 'Replicate Paired End Inner Distance Distribution Results'
+      path_filters:
+        - '*insertSize*'
+  - custom_content
+  - featureCounts:
+      name: 'Count'
+      info: 'Replicate Feature Count QC Results'
+  - hisat2:
+      name: 'Inference: Align'
+      info: 'Inference Alignment (1M downsampled reads) QC Results'
+      path_filters:
+        - '*alignSampleSummary*'
+  - rseqc:
+      name: 'Inference: Stranded'
+      info: '1M Downsampled Reads Strandedness Inference Results'
+      path_filters:
+        - '*infer_experiment*'
+
+# Explicit order values for the custom-content sections and the two
+# software sections.
+report_section_order:
+  run: { order: 4000 }
+  rid: { order: 3000 }
+  meta: { order: 2000 }
+  ref: { order: 1000 }
+  software_versions: { order: -1000 }
+  software_references: { order: -2000 }
+
+# The general statistics table is turned off.
+skip_generalstats: true
+
+# Custom-content sections. Each entry declares a TSV-backed section with a
+# display name, a description, a plot type, plot config, and its headers.
+custom_data:
+    run:
+        file_format: 'tsv'
+        section_name: 'Run'
+        description: 'This is the run information'
+        plot_type: 'table'
+        pconfig:
+            id: 'run'
+            scale: false
+            format: '{}'
+        headers:
+            Session:
+                description: ''
+            Session ID:
+                description: 'Nextflow session ID'
+            Pipeline Version:
+                description: 'BICF pipeline version'
+            Input:
+                description: 'Input overrides'
+    rid:
+        file_format: 'tsv'
+        section_name: 'RID'
+        description: 'These are the identifying RIDs'
+        plot_type: 'table'
+        pconfig:
+            id: 'rid'
+            scale: false
+            format: '{}'
+        headers:
+            Replicate:
+                description: ''
+            Replicate RID:
+                description: 'Replicate RID'
+            Experiment RID:
+                description: 'Experiment RID'
+            Study RID:
+                description: 'Study RID'
+    meta:
+        file_format: 'tsv'
+        section_name: 'Metadata'
+        description: 'This is the comparison of inferred metadata, submitter provided, and calculated'
+        plot_type: 'table'
+        pconfig:
+            id: 'meta'
+            scale: false
+            format: '{:,.0f}'
+        headers:
+            Source:
+                description: 'Metadata source'
+            Species:
+                description: 'Species'
+            Ends:
+                description: 'Single or paired end sequencing'
+            Stranded:
+                description: 'Stranded (forward/reverse) or unstranded library prep'
+            Spike-in:
+                description: 'ERCC spike in'
+            Raw Reads:
+                description: 'Number of reads from the sequencer'
+            Assigned Reads:
+                description: 'Final reads after filtering'
+            Median Read Length:
+                description: 'Median read length'
+            Median TIN:
+                description: 'Median transcript integrity number'
+
+    ref:
+        file_format: 'tsv'
+        section_name: 'Reference'
+        description: 'This is the reference version information'
+        plot_type: 'table'
+        pconfig:
+            id: 'ref'
+            scale: false
+            format: '{}'
+        headers:
+            Species:
+                description: 'Reference species'
+            Genome Reference Consortium Build:
+                description: 'Reference source build'
+            Genome Reference Consortium Patch:
+                description: 'Reference source patch version'
+            GENCODE Annotation Release:
+                description: 'Annotation release version'
+    tin:
+        file_format: 'tsv'
+        section_name: 'TIN'
+        description: 'This is the distribution of TIN values calculated by the tool RSeQC'
+        plot_type: 'bargraph'
+        pconfig:
+            id: 'tin'
+        # NOTE(review): the value below has no '- ' list markers or 'key:'
+        # entries, so YAML reads it as one multi-line plain scalar (a single
+        # string), unlike the mapping-style headers of the other sections.
+        # Confirm this is what is intended.
+        headers:
+            chrom
+            1 - 10
+            11 - 20
+            21 - 30
+            31 - 40
+            41 - 50
+            51 - 60
+            61 - 70
+            71 - 80
+            81 - 90
+            91 - 100
+
+# File-name patterns, keyed by the same ids used for the custom_data sections.
+sp:
+  run: { fn: 'run.tsv' }
+  rid: { fn: 'rid.tsv' }
+  meta: { fn: 'metadata.tsv' }
+  ref: { fn: 'reference.tsv' }
+  tin: { fn: '*_tin.hist.tsv' }
--- a/conf/ondemand.config
+++ b/conf/ondemand.config
+// Queue used for all processes when this config is loaded.
+process.queue = 'highpriority-0ef8afb0-c7ad-11ea-b907-06c94a3c6390'
--- a/conf/spot.config
+++ b/conf/spot.config
+// Queue used for all processes when this config is loaded.
+process.queue = 'default-0ef8afb0-c7ad-11ea-b907-06c94a3c6390'