diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9a846f6e432738ec3ec1a11d9dcf4bca2d473ed1..1cf1103821fac0e31e30bafbb9901591bdf8c919 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -14,6 +14,11 @@ stages: getBag: stage: unit + only: + - push + - tags + except: + - merge_requests script: - ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6 @@ -21,14 +26,24 @@ getBag: getData: stage: unit + only: + - push + - tags + except: + - merge_requests script: - ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt - - unzip ./test_data/bagit/Replicate_Q-Y5F6.zip + - unzip ./test_data/bag/Replicate_Q-Y5F6.zip - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6 TEST - pytest -m getData parseMetadata: stage: unit + only: + - push + - tags + except: + - merge_requests script: - rep=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID) - exp=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID) @@ -44,6 +59,11 @@ parseMetadata: inferMetadata: stage: unit + only: + - push + - tags + except: + - merge_requests script: - > align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5F6_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) && @@ -56,6 +76,11 @@ inferMetadata: getRef: stage: unit + only: + - push + - tags + except: + - merge_requests script: - mkdir -p hu - mkdir -p mo @@ -64,6 +89,11 @@ getRef: trimData: stage: unit + only: + - push + - tags + except: + - merge_requests script: - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz - singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz @@ -73,12 +103,22 @@ trimData: downsampleData: stage: unit + only: + - push + - tags + except: + - merge_requests script: - singularity run 'docker://bicf/seqtk:2.0.1_indev' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq - pytest -m downsampleData alignData: stage: unit + only: + - push + - tags + except: + - merge_requests script: - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary - singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam @@ -92,6 +132,11 @@ alignData: dedupData: stage: unit + only: + - push + - tags + except: + - merge_requests script: - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true - singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam @@ -104,26 +149,49 @@ dedupData: countData: stage: unit + only: + - push + - tags + except: + - merge_requests script: - - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' -o Q-Y5F6_1M.se.featureCounts -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam - - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.featureCounts + - ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/geneID.tsv + - ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/Entrez.tsv + - singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam + - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.countData + - singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se - assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)') - pytest -m makeFeatureCounts makeBigWig: stage: unit + only: + - push + - tags + except: + - merge_requests script: - singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw - pytest -m makeBigWig fastqc: stage: unit + only: + - push + - tags + except: + - merge_requests script: - singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o . - pytest -m fastqc dataQC: stage: unit + only: + - push + - tags + except: + - merge_requests script: - echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > Q-Y5F6_1M.se.sorted.deduped.tin.xls - for i in {"chr8","chr4","chrY"}; do @@ -132,6 +200,11 @@ dataQC: outputBag: stage: unit + only: + - push + - tags + except: + - merge_requests script: - mkdir test - singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bdbag test --archiver zip @@ -140,6 +213,10 @@ outputBag: integration_se: stage: integration + only: [merge_requests] + except: + variables: + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a @@ -150,11 +227,20 @@ integration_se: when: always paths: - output/qc/ + - output/report/ - SE_multiqc_data.json expire_in: 7 days + retry: + max: 1 + when: + - always integration_pe: stage: integration + only: [merge_requests] + except: + variables: + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - hostname - ulimit -a @@ -166,11 +252,87 @@ integration_pe: paths: - dag.png - output/qc/ + - output/report/ - PE_multiqc_data.json expire_in: 7 days + retry: + max: 1 + when: + - always + +override_inputBag: + stage: integration + only: [merge_requests] + except: + variables: + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + script: + - hostname + - ulimit -a + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --inputBagForce ./test_data/bag/Replicate_Q-Y5F6.zip --ci true + - find . -type f -name "multiqc_data.json" -exec cp {} ./inputBagOverride_PE_multiqc_data.json \; + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - inputBagOverride_PE_multiqc_data.json + expire_in: 7 days + retry: + max: 1 + when: + - always + +override_fastq: + stage: integration + only: [merge_requests] + except: + variables: + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + script: + - hostname + - ulimit -a + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --ci true + - find . -type f -name "multiqc_data.json" -exec cp {} ./fastqOverride_PE_multiqc_data.json \; + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - fastqOverride_PE_multiqc_data.json + expire_in: 7 days + retry: + max: 1 + when: + - always + +override_species: + stage: integration + only: [merge_requests] + except: + variables: + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ + script: + - hostname + - ulimit -a + - nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --speciesForce 'Homo sapiens' --ci true + - find . -type f -name "multiqc_data.json" -exec cp {} ./speciesOverride_PE_multiqc_data.json \; + artifacts: + name: "$CI_JOB_NAME" + when: always + paths: + - speciesOverride_PE_multiqc_data.json + expire_in: 7 days + retry: + max: 1 + when: + - always + consistency: stage: consistency + only: [merge_requests] + except: + variables: + - $CI_MERGE_REQUEST_TARGET_BRANCH_NAME =~ /master/ script: - grep -m 1 \"Assigned\":.[0-9] SE_multiqc_data.json | grep -oe '\([0-9.]*\)' > assignedSE.txt - grep -m 1 \"Assigned\":.[0-9] PE_multiqc_data.json | grep -oe '\([0-9.]*\)' > assignedPE.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index ee3ac2203267513b79b841638ae0f717e601b2aa..d955884646b77d5c5f43f1517a0aa3be8e505f08 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,32 @@ -# v0.0.2 (in development) +# v0.0.3 (in development) +**User Facing** +* TPM table: + * Add Ensembl Gene ID + * Rename columns: *GENCODE_Gene_Symbol*, *Ensembl_GeneID*, *NCBI_GeneID* +* MultiQC output custom tables (html+JSON): + * Run table: *Session ID* and *Pipeline Version* + * Reference Table: *Species*, *Genome Reference Consortium Build*, *Genome Reference Consortium Patch*, *GENCODE Annotation Release* (outputs both human and mouse versions) +* Add inputBag override param (`inputBagForce`) [`*.zip`] + * Uses provided inputBag instead of downloading from data-hub + * Still requires matching repRID input param +* Add fastq override param (`fastqsForce`) [`R1`,`R2`] + * Uses provided fastq instead of downloading from data-hub + * Still requires matching repRID input param and will pull inputBag from data-hub to access submitted metadata for reporting +* Add species override param (`speciesForce`) [`Mus musculus` or `Homo sapiens`] + * forces the use of the provided species + * ignores inferred ambiguous species + +**Background** +* Add GeneSymbol/EnsemblID/EntrezID translation files to references + +*Known Bugs* +* outputBag does not contain fetch for processed data +* Does not include automatic data upload +* Override params (inputBag, fastq, species) aren't checked for integrity + +<hr> + +# v0.0.2 **User Facing** * Output: * inputBag @@ -19,4 +47,4 @@ **INITIAL BETA VERSION**\ Does not include automatic data upload\ This version is for initial upload of test data to GUDMAP/RBK data-hub for internal integration -<hr> +<hr> \ No newline at end of file diff --git a/README.md b/README.md index 2216c2775a2145f8e021cb221b18414f38ed9304..49069c53ada7b1f8351684cea8ba161830369208 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,13 @@ To Run: * reference version consists of Genome Reference Consortium version, patch release and GENCODE annotation release # (leaving the params blank will use the default version tied to the pipeline version) * *current mouse* **38.p6.vM22** = GRCm38.p6 with GENCODE annotation release M22 * *current human* **38.p6.v31** = GRCh38.p12 with GENCODE annotation release 31 +* ***Optional*** input overrides + * `--inputBagForce` utilizes a local replicate inputBag instead of downloading from the data-hub (still requires accurate repRID input) + * eg: `--inputBagForce test_data/bag/Replicate_Q-Y5F6.zip` (must be the expected bag structure) + * `--fastqsForce` utilizes local fastq's instead of downloading from the data-hub (still requires accurate repRID input) + * eg: `--fastqsForce 'test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz'` (note the quotes around fastq's which must me named in the correct standard [*\*.R1.fastq.gz and/or \*.R2.fastq.gz*] and in the correct order) + * `--speciesForce` forces the species to be "Mus musculus" or "Homo sapiens", it bypasses ambiguous species error + * eg: `--speciesForce 'Mus musculus'` * Tracking parameters ([Tracking Site](http://bicf.pipeline.tracker.s3-website-us-east-1.amazonaws.com/)): * `--ci` boolean (default = false) * `--dev` boolean (default = false) diff --git a/docs/dag.png b/docs/dag.png index 04aafa139a1715cdd8213ae7a81a25dfaf6b4dac..2a2ec56ccc0615d8b1c9c1f6ce73201c0073874d 100755 Binary files a/docs/dag.png and b/docs/dag.png differ diff --git a/test_data/createTestData.sh b/test_data/createTestData.sh index 47c73d46fba5510f64fe5f9bb5c540eb4ad618cc..aec4e91bb71d5bb79d87cd76994f6253cfe63ea3 100644 --- a/test_data/createTestData.sh +++ b/test_data/createTestData.sh @@ -9,12 +9,12 @@ mkdir -p NEW_test_data ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json -mkdir -p ./NEW_test_data/bagit +mkdir -p ./NEW_test_data/bag singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6 -cp Replicate_Q-Y5F6.zip ./NEW_test_data/bagit/Replicate_Q-Y5F6.zip +cp Replicate_Q-Y5F6.zip ./NEW_test_data/bag/Replicate_Q-Y5F6.zip mkdir -p ./NEW_test_data/fastq -unzip ./test_data/bagit/Replicate_Q-Y5F6.zip +unzip ./test_data/bag/Replicate_Q-Y5F6.zip singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6 cp Replicate_Q-Y5F6.R1.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R1.fastq.gz cp Replicate_Q-Y5F6.R2.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R2.fastq.gz @@ -81,10 +81,14 @@ cp Q-Y5F6_1M.se.sorted.deduped.chrY.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M. mkdir -p ./NEW_test_data/counts mkdir -p ./NEW_test_data/counts/small -singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' -o Q-Y5F6_1M.se.featureCounts -s 1 -R SAM --primary --ignoreDup ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count Q-Y5F6_1M.se.featureCounts -cp Q-Y5F6_1M.se.featureCounts ./NEW_test_data/counts/small/Q-Y5F6_1M.se.featureCounts +ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/geneID.tsv +ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/Entrez.tsv +singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam +singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count Q-Y5F6_1M.se.countData +singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se +cp Q-Y5F6_1M.se.featureCounts ./NEW_test_data/counts/small/Q-Y5F6_1M.se.countData cp Q-Y5F6_1M.se.countTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se.countTable.csv +cp Q-Y5F6_1M.se.countTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se.tpmTable.csv mkdir -p ./NEW_test_data/bw mkdir -p ./NEW_test_data/bw/small diff --git a/workflow/conf/multiqc_config.yaml b/workflow/conf/multiqc_config.yaml index db6a335a8e5e6f8677dd8c433d391c6fd5224751..0c780d967f2859c1c76e02a63f3348c26049c694 100644 --- a/workflow/conf/multiqc_config.yaml +++ b/workflow/conf/multiqc_config.yaml @@ -48,14 +48,32 @@ top_modules: - '*infer_experiment*' report_section_order: + run: + order: 4000 rid: - order: 2000 + order: 3000 meta: + order: 2000 + ref: order: 1000 skip_generalstats: true custom_data: + run: + file_format: 'tsv' + section_name: 'Run' + description: 'This is the run information' + plot_type: 'table' + pconfig: + id: 'run' + scale: false + format: '{}' + headers: + Session + Session ID + Pipeline Version + Input rid: file_format: 'tsv' section_name: 'RID' @@ -63,7 +81,10 @@ custom_data: plot_type: 'table' pconfig: id: 'rid' + scale: false + format: '{}' headers: + Replicate Replicate RID Experiment RID Study RID @@ -74,6 +95,7 @@ custom_data: plot_type: 'table' pconfig: id: 'meta' + scale: false format: '{:,.0f}' headers: Source @@ -85,6 +107,21 @@ custom_data: Assigned Reads Median Read Length Median TIN + Pipeline Version + ref: + file_format: 'tsv' + section_name: 'Reference' + description: 'This is the referenec version information' + plot_type: 'table' + pconfig: + id: 'ref' + scale: false + format: '{}' + headers: + Species + Genome Reference Consortium Build + Genome Reference Consortium Patch + GENCODE Annotation Release" tin: file_format: 'tsv' section_name: 'TIN' @@ -106,9 +143,13 @@ custom_data: 90 - 99 sp: + run: + fn: "run.tsv" rid: fn: 'rid.tsv' meta: fn: 'metadata.tsv' + ref: + fn: 'reference.tsv' tin: - fn: '*.tin.hist.tsv' + fn: '*.tin.hist.tsv' \ No newline at end of file diff --git a/workflow/nextflow.config b/workflow/nextflow.config index 660ec331080a9235d0e7eddc630a48500ead61f3..fd6f6f02be0aa82e19fa3e704cc3d609eca8bd99 100644 --- a/workflow/nextflow.config +++ b/workflow/nextflow.config @@ -97,6 +97,6 @@ manifest { homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq' description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.' mainScript = 'rna-seq.nf' - version = 'v0.0.2_indev' + version = 'v0.0.3_indev' nextflowVersion = '>=19.09.0' } diff --git a/workflow/rna-seq.nf b/workflow/rna-seq.nf index a311b9e876249c0d4cfbbb1cd13e7d94b1902807..b6687df5d29f1d1b60d017c3d42ebaebf4edbcd0 100644 --- a/workflow/rna-seq.nf +++ b/workflow/rna-seq.nf @@ -19,6 +19,11 @@ params.refHuVersion = "38.p12.v31" params.refERCCVersion = "92" params.outDir = "${baseDir}/../output" +// Define override input variable +params.inputBagForce = "" +params.fastqsForce = "" +params.speciesForce = "" + // Parse input variables deriva = Channel .fromPath(params.deriva) @@ -32,6 +37,9 @@ refHuVersion = params.refHuVersion refERCCVersion = params.refERCCVersion outDir = params.outDir logsDir = "${outDir}/Logs" +inputBagForce = params.inputBagForce +fastqsForce = params.fastqsForce +speciesForce = params.speciesForce // Define fixed files derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json") @@ -117,7 +125,10 @@ process getBag { path derivaConfig output: - path ("Replicate_*.zip") into bagit + path ("Replicate_*.zip") into bag + + when: + inputBagForce == "" script: """ @@ -131,12 +142,21 @@ process getBag { echo -e "LOG: linked" >> ${repRID}.getBag.log # deriva-download replicate RID - echo -e "LOG: fetching bagit for ${repRID} in GUDMAP" >> ${repRID}.getBag.log + echo -e "LOG: fetching bag for ${repRID} in GUDMAP" >> ${repRID}.getBag.log deriva-download-cli ${source} --catalog 2 ${derivaConfig} . rid=${repRID} echo -e "LOG: fetched" >> ${repRID}.getBag.log """ } +// Set inputBag to downloaded or forced input +if (inputBagForce != "") { + inputBag = Channel + .fromPath(inputBagForce) + .ifEmpty { exit 1, "override inputBag file not found: ${inputBagForce}" } +} else { + inputBag = bag +} + /* * getData: fetch study files from consortium with downloaded bdbag.zip */ @@ -146,7 +166,7 @@ process getData { input: path script_bdbagFetch path cookies, stageAs: "deriva-cookies.txt" from bdbag - path bagit + path inputBag output: path ("*.R{1,2}.fastq.gz") into fastqs @@ -158,33 +178,43 @@ process getData { """ hostname > ${repRID}.getData.log ulimit -a >> ${repRID}.getData.log - + # link deriva cookie for authentication echo -e "LOG: linking deriva cookie" >> ${repRID}.getData.log mkdir -p ~/.bdbag ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt echo -e "LOG: linked" >> ${repRID}.getData.log - # get bagit basename - replicate=\$(basename "${bagit}" | cut -d "." -f1) - echo -e "LOG: bagit replicate name \${replicate}" >> ${repRID}.getData.log + # get bag basename + replicate=\$(basename "${inputBag}" | cut -d "." -f1) + echo -e "LOG: bag replicate name \${replicate}" >> ${repRID}.getData.log - # unzip bagit - echo -e "LOG: unzipping replicate bagit" >> ${repRID}.getData.log - unzip ${bagit} + # unzip bag + echo -e "LOG: unzipping replicate bag" >> ${repRID}.getData.log + unzip ${inputBag} echo -e "LOG: unzipped" >> ${repRID}.getData.log - # bagit fetch fastq's only and rename by repRID + # bag fetch fastq's only and rename by repRID echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log sh ${script_bdbagFetch} \${replicate} ${repRID} echo -e "LOG: fetched" >> ${repRID}.getData.log """ } -// Replicate raw fastq's for multiple process inputs -fastqs.into { - fastqs_trimData - fastqs_fastqc +// Set raw fastq to downloaded or forced input and replicate them for multiple process inputs +if (fastqsForce != "") { + Channel + .fromPath(fastqsForce) + .ifEmpty { exit 1, "override inputBag file not found: ${fastqsForce}" } + .collect().into { + fastqs_trimData + fastqs_fastqc + } +} else { + fastqs.into { + fastqs_trimData + fastqs_fastqc + } } /* @@ -533,7 +563,24 @@ process inferMetadata { bed="./GRCm/bed/genome.bed" else echo -e "LOG: ERROR - inference of species returns an ambiguous result: hu=\${align_hu} mo=\${align_mo}" >> ${repRID}.inferMetadata.log - exit 1 + if [ "${speciesForce}" == "" ] + then + exit 1 + fi + fi + if [ "${speciesForce}" != "" ] + then + echo -e "LOG: species overridden to: ${speciesForce}" + species="${speciesForce}" + if [ "${speciesForce}" == "Homo sapiens" ] + then + bam="GRCh.sampled.sorted.bam" + bed="./GRCh/bed/genome.bed" + elif [ "${speciesForce}" == "Mus musculus" ] + then + bam="GRCm.sampled.sorted.bam" + bed="./GRCm/bed/genome.bed" + fi fi echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log @@ -875,10 +922,10 @@ process countData { echo -e "LOG: counting ${ends} features" >> ${repRID}.countData.log if [ "${ends}" == "se" ] then - featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam + featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam elif [ "${ends}" == "pe" ] then - featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam + featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam fi echo -e "LOG: counted" >> ${repRID}.countData.log @@ -1034,18 +1081,53 @@ process aggrQC { hostname > ${repRID}.aggrQC.log ulimit -a >> ${repRID}.aggrQC.log + # make run table + if [ "${params.inputBagForce}" == "" ] && [ "${params.fastqsForce}" == "" ] && [ "${params.speciesForce}" == "" ] + then + input="default" + else + input="override:" + if [ "${params.inputBagForce}" != "" ] + then + input=\$(echo \${input} inputBag) + fi + if [ "${params.fastqsForce}" != "" ] + then + input=\$(echo \${input} fastq) + fi + if [ "${params.speciesForce}" != "" ] + then + input=\$(echo \${input} species) + fi + fi + echo -e "LOG: creating run table" >> ${repRID}.aggrQC.log + echo -e "Session\tSession ID\tPipeline Version\tInput" > run.tsv + echo -e "Session\t${workflow.sessionId}\t${workflow.manifest.version}\t\${input}" >> run.tsv + + # make RID table echo -e "LOG: creating RID table" >> ${repRID}.aggrQC.log - echo -e "Replicate RID\tExperiment RID\tStudy RID" > rid.tsv - echo -e "${repRID}\t${expRID}\t${studyRID}" >> rid.tsv + echo -e "Replicate\tReplicate RID\tExperiment RID\tStudy RID" > rid.tsv + echo -e "Replicate\t${repRID}\t${expRID}\t${studyRID}" >> rid.tsv # make metadata table echo -e "LOG: creating metadata table" >> ${repRID}.aggrQC.log echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tRaw Reads\tAssigned Reads\tMedian Read Length\tMedian TIN" > metadata.tsv echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t-\t-\t'${readLengthM}'\t-" >> metadata.tsv - echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv + if [ "${params.speciesForce}" == "" ] + then + echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv + else + echo -e "Infered\t${speciesI} (FORCED)\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv + fi echo -e "Measured\t-\t${endsManual}\t-\t-\t'${rawReadsI}'\t'${assignedReadsI}'\t'${readLengthI}'\t'${tinMedI}'" >> metadata.tsv + # make reference table + echo -e "LOG: creating referencerun table" >> ${repRID}.aggrQC.log + echo -e "Species\tGenome Reference Consortium Build\tGenome Reference Consortium Patch\tGENCODE Annotation Release" > reference.tsv + echo -e "Human\tGRCh\$(echo `echo ${params.refHuVersion} | cut -d "." -f 1`)\t\$(echo `echo ${params.refHuVersion} | cut -d "." -f 2`)\t'\$(echo `echo ${params.refHuVersion} | cut -d "." -f 3 | sed "s/^v//"`)'" >> reference.tsv + echo -e "Mouse\tGRCm\$(echo `echo ${params.refMoVersion} | cut -d "." -f 1`)\t\$(echo `echo ${params.refMoVersion} | cut -d "." -f 2`)\t'\$(echo `echo ${params.refMoVersion} | cut -d "." -f 3 | sed "s/^v//"`)'" >> reference.tsv + # remove inner distance report if it is empty (SE repRID) echo -e "LOG: removing dummy inner distance file" >> ${repRID}.aggrQC.log if [ "${endsM}" == "se" ] @@ -1081,5 +1163,4 @@ process outputBag { cp ${multiqcJSON} Replicate_${repRID}.outputBag bdbag Replicate_${repRID}.outputBag --archiver zip """ -} - +} \ No newline at end of file diff --git a/workflow/scripts/calculateTPM.R b/workflow/scripts/calculateTPM.R index d4851d4806c5e12e06416a103f2e6972a8b4befe..a26bf943dce7d082ed03b77939390abb022bba82 100644 --- a/workflow/scripts/calculateTPM.R +++ b/workflow/scripts/calculateTPM.R @@ -13,10 +13,13 @@ if (!("count" %in% names(opt))){ stop("Count file doesn't exist, exiting.") } -repRID <- basename(gsub(".featureCounts","",opt$count)) +repRID <- basename(gsub(".countData","",opt$count)) count <- read.delim(opt$count, comment.char="#") # if featureCounts file changes structure, be sure to update count and Length columns below -colnames(count)[7] <- "count" +colnames(count)[1] <- "gene_name" +colnames(count)[7] <- "gene_id" +colnames(count)[8] <- "count" +count <- count[,c(1,7,2:6,8)] rpk <- count$count/count$Length/1000 diff --git a/workflow/scripts/convertGeneSymbols.R b/workflow/scripts/convertGeneSymbols.R index 92d3c3e2e8ad97ebacbca3c5f9d0fc185ceafb7e..49752f1bba5a4dd8d91ed8609c6f2f82b8fafacc 100644 --- a/workflow/scripts/convertGeneSymbols.R +++ b/workflow/scripts/convertGeneSymbols.R @@ -7,18 +7,20 @@ option_list=list( opt=parse_args(OptionParser(option_list=option_list)) rm(option_list) -countTable <- read.csv(paste0(opt$repRID,".countData.countTable.csv"), stringsAsFactors=FALSE) +countTable <- read.csv(paste0(opt$repRID,".countTable.csv"), stringsAsFactors=FALSE) geneID <- read.delim("geneID.tsv", header=FALSE, stringsAsFactors=FALSE) Entrez <- read.delim("Entrez.tsv", header=FALSE, stringsAsFactors=FALSE) -convert <- data.frame(geneID=countTable$Geneid) -convert <- merge(x=convert,y=geneID[,1:2],by.x="geneID",by.y="V2",all.x=TRUE) +convert <- data.frame(gene_name=countTable$gene_name) +convert <- merge(x=convert,y=geneID[,1:2],by.x="gene_name",by.y="V2",all.x=TRUE) convert <- merge(x=convert,y=Entrez,by.x="V1",by.y="V1",all.x=TRUE) convert[is.na(convert$V2),3] <- "" convert <- convert[,-1] colnames(convert) <- c("GeneID","EntrezID") convert <- unique(convert) -output <- merge(x=convert,y=countTable[,c("Geneid","count","tpm")],by.x="GeneID",by.y="Geneid",all.x=TRUE) +output <- merge(x=convert,y=countTable[,c("gene_name","gene_id","count","tpm")],by.x="GeneID",by.y="gene_name",all.x=TRUE) +colnames(output) <- c("GENCODE_Gene_Symbol","NCBI_GeneID","Ensembl_GeneID","count","tpm") +output <- output[,c(1,3,2,4:5)] write.table(output,file=paste0(opt$repRID,".tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE) diff --git a/workflow/tests/test_makeFeatureCounts.py b/workflow/tests/test_makeFeatureCounts.py index d741a2f4ac0f33fe15af8f13300c95002a00a887..d33527a9c7bffcaa9919639b72842cc8cf63b14f 100644 --- a/workflow/tests/test_makeFeatureCounts.py +++ b/workflow/tests/test_makeFeatureCounts.py @@ -11,5 +11,6 @@ data_output_path = os.path.dirname(os.path.abspath(__file__)) + \ @pytest.mark.makeFeatureCounts def test_makeFeatureCounts(): - assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.featureCounts')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.countData')) assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.countTable.csv')) + assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.tpmTable.csv'))