Skip to content
Snippets Groups Projects
Commit d14cdf77 authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Merge branch '0.0.3' into 'develop'

0.0.3

Closes #74, #73, #72, #71, and #61

See merge request !42
parents 64c587de 1bfecd57
Branches
Tags
2 merge requests!43Develop,!420.0.3
Pipeline #8026 canceled with stages
in 27 minutes and 46 seconds
This commit is part of merge request !43. Comments created here will be created in the context of that merge request.
......@@ -14,6 +14,11 @@ stages:
getBag:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json
- singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6
......@@ -21,14 +26,24 @@ getBag:
getData:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
- unzip ./test_data/bagit/Replicate_Q-Y5F6.zip
- unzip ./test_data/bag/Replicate_Q-Y5F6.zip
- singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6 TEST
- pytest -m getData
parseMetadata:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- rep=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID)
- exp=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID)
......@@ -44,6 +59,11 @@ parseMetadata:
inferMetadata:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- >
align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5F6_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) &&
......@@ -56,6 +76,11 @@ inferMetadata:
getRef:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- mkdir -p hu
- mkdir -p mo
......@@ -64,6 +89,11 @@ getRef:
trimData:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
- singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
......@@ -73,12 +103,22 @@ trimData:
downsampleData:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- singularity run 'docker://bicf/seqtk:2.0.1_indev' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq
- pytest -m downsampleData
alignData:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam
......@@ -92,6 +132,11 @@ alignData:
dedupData:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam
......@@ -104,26 +149,49 @@ dedupData:
countData:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' -o Q-Y5F6_1M.se.featureCounts -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
- singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.featureCounts
- ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/geneID.tsv
- ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/Entrez.tsv
- singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
- singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.countData
- singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se
- assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)')
- pytest -m makeFeatureCounts
makeBigWig:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw
- pytest -m makeBigWig
fastqc:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o .
- pytest -m fastqc
dataQC:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > Q-Y5F6_1M.se.sorted.deduped.tin.xls
- for i in {"chr8","chr4","chrY"}; do
......@@ -132,6 +200,11 @@ dataQC:
outputBag:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- mkdir test
- singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bdbag test --archiver zip
......@@ -140,6 +213,11 @@ outputBag:
integration_se:
stage: integration
only:
- merge_requests
except:
refs:
- master
script:
- hostname
- ulimit -a
......@@ -150,11 +228,21 @@ integration_se:
when: always
paths:
- output/qc/
- output/report/
- SE_multiqc_data.json
expire_in: 7 days
retry:
max: 1
when:
- always
integration_pe:
stage: integration
only:
- merge_requests
except:
refs:
- master
script:
- hostname
- ulimit -a
......@@ -166,11 +254,91 @@ integration_pe:
paths:
- dag.png
- output/qc/
- output/report/
- PE_multiqc_data.json
expire_in: 7 days
retry:
max: 1
when:
- always
override_inputBag:
stage: integration
only:
- merge_requests
except:
refs:
- master
script:
- hostname
- ulimit -a
- nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --inputBagForce ./test_data/bag/Replicate_Q-Y5F6.zip --ci true
- find . -type f -name "multiqc_data.json" -exec cp {} ./inputBagOverride_PE_multiqc_data.json \;
artifacts:
name: "$CI_JOB_NAME"
when: always
paths:
- inputBagOverride_PE_multiqc_data.json
expire_in: 7 days
retry:
max: 1
when:
- always
override_fastq:
stage: integration
only:
- merge_requests
except:
refs:
- master
script:
- hostname
- ulimit -a
- nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --ci true
- find . -type f -name "multiqc_data.json" -exec cp {} ./fastqOverride_PE_multiqc_data.json \;
artifacts:
name: "$CI_JOB_NAME"
when: always
paths:
- fastqOverride_PE_multiqc_data.json
expire_in: 7 days
retry:
max: 1
when:
- always
override_species:
stage: integration
only:
- merge_requests
except:
refs:
- master
script:
- hostname
- ulimit -a
- nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --speciesForce 'Homo sapiens' --ci true
- find . -type f -name "multiqc_data.json" -exec cp {} ./speciesOverride_PE_multiqc_data.json \;
artifacts:
name: "$CI_JOB_NAME"
when: always
paths:
- speciesOverride_PE_multiqc_data.json
expire_in: 7 days
retry:
max: 1
when:
- always
consistency:
stage: consistency
only:
- merge_requests
except:
refs:
- master
script:
- grep -m 1 \"Assigned\":.[0-9] SE_multiqc_data.json | grep -oe '\([0-9.]*\)' > assignedSE.txt
- grep -m 1 \"Assigned\":.[0-9] PE_multiqc_data.json | grep -oe '\([0-9.]*\)' > assignedPE.txt
......
# v0.0.2 (in development)
# v0.0.3 (in development)
**User Facing**
* TPM table:
* Add Ensembl Gene ID
* Rename columns: *GENCODE_Gene_Symbol*, *Ensembl_GeneID*, *NCBI_GeneID*
* MultiQC output custom tables (html+JSON):
* Run table: *Session ID* and *Pipeline Version*
* Reference Table: *Species*, *Genome Reference Consortium Build*, *Genome Reference Consortium Patch*, *GENCODE Annotation Release* (outputs both human and mouse versions)
* Add inputBag override param (`inputBagForce`) [`*.zip`]
* Uses provided inputBag instead of downloading from data-hub
* Still requires matching repRID input param
* Add fastq override param (`fastqsForce`) [`R1`,`R2`]
* Uses provided fastq instead of downloading from data-hub
* Still requires matching repRID input param and will pull inputBag from data-hub to access submitted metadata for reporting
* Add species override param (`speciesForce`) [`Mus musculus` or `Homo sapiens`]
* forces the use of the provided species
* ignores inferred ambiguous species
**Background**
* Add GeneSymbol/EnsemblID/EntrezID translation files to references
*Known Bugs*
* outputBag does not contain fetch for processed data
* Does not include automatic data upload
* Override params (inputBag, fastq, species) aren't checked for integrity
<hr>
# v0.0.2
**User Facing**
* Output:
* inputBag
......@@ -19,4 +47,4 @@
**INITIAL BETA VERSION**\
Does not include automatic data upload\
This version is for initial upload of test data to GUDMAP/RBK data-hub for internal integration
<hr>
<hr>
\ No newline at end of file
......@@ -48,6 +48,13 @@ To Run:
* reference version consists of Genome Reference Consortium version, patch release and GENCODE annotation release # (leaving the params blank will use the default version tied to the pipeline version)
* *current mouse* **38.p6.vM22** = GRCm38.p6 with GENCODE annotation release M22
* *current human* **38.p6.v31** = GRCh38.p12 with GENCODE annotation release 31
* ***Optional*** input overrides
* `--inputBagForce` utilizes a local replicate inputBag instead of downloading from the data-hub (still requires accurate repRID input)
* eg: `--inputBagForce test_data/bag/Replicate_Q-Y5F6.zip` (must be the expected bag structure)
* `--fastqsForce` utilizes local fastq's instead of downloading from the data-hub (still requires accurate repRID input)
* eg: `--fastqsForce 'test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz'` (note the quotes around fastq's which must me named in the correct standard [*\*.R1.fastq.gz and/or \*.R2.fastq.gz*] and in the correct order)
* `--speciesForce` forces the species to be "Mus musculus" or "Homo sapiens", it bypasses ambiguous species error
* eg: `--speciesForce 'Mus musculus'`
* Tracking parameters ([Tracking Site](http://bicf.pipeline.tracker.s3-website-us-east-1.amazonaws.com/)):
* `--ci` boolean (default = false)
* `--dev` boolean (default = false)
......
docs/dag.png

674 KiB | W: | H:

docs/dag.png

693 KiB | W: | H:

docs/dag.png
docs/dag.png
docs/dag.png
docs/dag.png
  • 2-up
  • Swipe
  • Onion skin
......@@ -9,12 +9,12 @@ mkdir -p NEW_test_data
ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json
mkdir -p ./NEW_test_data/bagit
mkdir -p ./NEW_test_data/bag
singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6
cp Replicate_Q-Y5F6.zip ./NEW_test_data/bagit/Replicate_Q-Y5F6.zip
cp Replicate_Q-Y5F6.zip ./NEW_test_data/bag/Replicate_Q-Y5F6.zip
mkdir -p ./NEW_test_data/fastq
unzip ./test_data/bagit/Replicate_Q-Y5F6.zip
unzip ./test_data/bag/Replicate_Q-Y5F6.zip
singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6
cp Replicate_Q-Y5F6.R1.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R1.fastq.gz
cp Replicate_Q-Y5F6.R2.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R2.fastq.gz
......@@ -81,10 +81,14 @@ cp Q-Y5F6_1M.se.sorted.deduped.chrY.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.
mkdir -p ./NEW_test_data/counts
mkdir -p ./NEW_test_data/counts/small
singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' -o Q-Y5F6_1M.se.featureCounts -s 1 -R SAM --primary --ignoreDup ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count Q-Y5F6_1M.se.featureCounts
cp Q-Y5F6_1M.se.featureCounts ./NEW_test_data/counts/small/Q-Y5F6_1M.se.featureCounts
ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/geneID.tsv
ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/Entrez.tsv
singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count Q-Y5F6_1M.se.countData
singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se
cp Q-Y5F6_1M.se.featureCounts ./NEW_test_data/counts/small/Q-Y5F6_1M.se.countData
cp Q-Y5F6_1M.se.countTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se.countTable.csv
cp Q-Y5F6_1M.se.countTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se.tpmTable.csv
mkdir -p ./NEW_test_data/bw
mkdir -p ./NEW_test_data/bw/small
......
......@@ -48,14 +48,32 @@ top_modules:
- '*infer_experiment*'
report_section_order:
run:
order: 4000
rid:
order: 2000
order: 3000
meta:
order: 2000
ref:
order: 1000
skip_generalstats: true
custom_data:
run:
file_format: 'tsv'
section_name: 'Run'
description: 'This is the run information'
plot_type: 'table'
pconfig:
id: 'run'
scale: false
format: '{}'
headers:
Session
Session ID
Pipeline Version
Input
rid:
file_format: 'tsv'
section_name: 'RID'
......@@ -63,7 +81,10 @@ custom_data:
plot_type: 'table'
pconfig:
id: 'rid'
scale: false
format: '{}'
headers:
Replicate
Replicate RID
Experiment RID
Study RID
......@@ -74,6 +95,7 @@ custom_data:
plot_type: 'table'
pconfig:
id: 'meta'
scale: false
format: '{:,.0f}'
headers:
Source
......@@ -85,6 +107,21 @@ custom_data:
Assigned Reads
Median Read Length
Median TIN
Pipeline Version
ref:
file_format: 'tsv'
section_name: 'Reference'
description: 'This is the referenec version information'
plot_type: 'table'
pconfig:
id: 'ref'
scale: false
format: '{}'
headers:
Species
Genome Reference Consortium Build
Genome Reference Consortium Patch
GENCODE Annotation Release"
tin:
file_format: 'tsv'
section_name: 'TIN'
......@@ -106,9 +143,13 @@ custom_data:
90 - 99
sp:
run:
fn: "run.tsv"
rid:
fn: 'rid.tsv'
meta:
fn: 'metadata.tsv'
ref:
fn: 'reference.tsv'
tin:
fn: '*.tin.hist.tsv'
fn: '*.tin.hist.tsv'
\ No newline at end of file
......@@ -97,6 +97,6 @@ manifest {
homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.'
mainScript = 'rna-seq.nf'
version = 'v0.0.2_indev'
version = 'v0.0.3_indev'
nextflowVersion = '>=19.09.0'
}
......@@ -19,6 +19,11 @@ params.refHuVersion = "38.p12.v31"
params.refERCCVersion = "92"
params.outDir = "${baseDir}/../output"
// Define override input variable
params.inputBagForce = ""
params.fastqsForce = ""
params.speciesForce = ""
// Parse input variables
deriva = Channel
.fromPath(params.deriva)
......@@ -32,6 +37,9 @@ refHuVersion = params.refHuVersion
refERCCVersion = params.refERCCVersion
outDir = params.outDir
logsDir = "${outDir}/Logs"
inputBagForce = params.inputBagForce
fastqsForce = params.fastqsForce
speciesForce = params.speciesForce
// Define fixed files
derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
......@@ -117,7 +125,10 @@ process getBag {
path derivaConfig
output:
path ("Replicate_*.zip") into bagit
path ("Replicate_*.zip") into bag
when:
inputBagForce == ""
script:
"""
......@@ -131,12 +142,21 @@ process getBag {
echo -e "LOG: linked" >> ${repRID}.getBag.log
# deriva-download replicate RID
echo -e "LOG: fetching bagit for ${repRID} in GUDMAP" >> ${repRID}.getBag.log
echo -e "LOG: fetching bag for ${repRID} in GUDMAP" >> ${repRID}.getBag.log
deriva-download-cli ${source} --catalog 2 ${derivaConfig} . rid=${repRID}
echo -e "LOG: fetched" >> ${repRID}.getBag.log
"""
}
// Set inputBag to downloaded or forced input
if (inputBagForce != "") {
inputBag = Channel
.fromPath(inputBagForce)
.ifEmpty { exit 1, "override inputBag file not found: ${inputBagForce}" }
} else {
inputBag = bag
}
/*
* getData: fetch study files from consortium with downloaded bdbag.zip
*/
......@@ -146,7 +166,7 @@ process getData {
input:
path script_bdbagFetch
path cookies, stageAs: "deriva-cookies.txt" from bdbag
path bagit
path inputBag
output:
path ("*.R{1,2}.fastq.gz") into fastqs
......@@ -158,33 +178,43 @@ process getData {
"""
hostname > ${repRID}.getData.log
ulimit -a >> ${repRID}.getData.log
# link deriva cookie for authentication
echo -e "LOG: linking deriva cookie" >> ${repRID}.getData.log
mkdir -p ~/.bdbag
ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt
echo -e "LOG: linked" >> ${repRID}.getData.log
# get bagit basename
replicate=\$(basename "${bagit}" | cut -d "." -f1)
echo -e "LOG: bagit replicate name \${replicate}" >> ${repRID}.getData.log
# get bag basename
replicate=\$(basename "${inputBag}" | cut -d "." -f1)
echo -e "LOG: bag replicate name \${replicate}" >> ${repRID}.getData.log
# unzip bagit
echo -e "LOG: unzipping replicate bagit" >> ${repRID}.getData.log
unzip ${bagit}
# unzip bag
echo -e "LOG: unzipping replicate bag" >> ${repRID}.getData.log
unzip ${inputBag}
echo -e "LOG: unzipped" >> ${repRID}.getData.log
# bagit fetch fastq's only and rename by repRID
# bag fetch fastq's only and rename by repRID
echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log
sh ${script_bdbagFetch} \${replicate} ${repRID}
echo -e "LOG: fetched" >> ${repRID}.getData.log
"""
}
// Replicate raw fastq's for multiple process inputs
fastqs.into {
fastqs_trimData
fastqs_fastqc
// Set raw fastq to downloaded or forced input and replicate them for multiple process inputs
if (fastqsForce != "") {
Channel
.fromPath(fastqsForce)
.ifEmpty { exit 1, "override inputBag file not found: ${fastqsForce}" }
.collect().into {
fastqs_trimData
fastqs_fastqc
}
} else {
fastqs.into {
fastqs_trimData
fastqs_fastqc
}
}
/*
......@@ -533,7 +563,24 @@ process inferMetadata {
bed="./GRCm/bed/genome.bed"
else
echo -e "LOG: ERROR - inference of species returns an ambiguous result: hu=\${align_hu} mo=\${align_mo}" >> ${repRID}.inferMetadata.log
exit 1
if [ "${speciesForce}" == "" ]
then
exit 1
fi
fi
if [ "${speciesForce}" != "" ]
then
echo -e "LOG: species overridden to: ${speciesForce}"
species="${speciesForce}"
if [ "${speciesForce}" == "Homo sapiens" ]
then
bam="GRCh.sampled.sorted.bam"
bed="./GRCh/bed/genome.bed"
elif [ "${speciesForce}" == "Mus musculus" ]
then
bam="GRCm.sampled.sorted.bam"
bed="./GRCm/bed/genome.bed"
fi
fi
echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log
......@@ -875,10 +922,10 @@ process countData {
echo -e "LOG: counting ${ends} features" >> ${repRID}.countData.log
if [ "${ends}" == "se" ]
then
featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
elif [ "${ends}" == "pe" ]
then
featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
fi
echo -e "LOG: counted" >> ${repRID}.countData.log
......@@ -1034,18 +1081,53 @@ process aggrQC {
hostname > ${repRID}.aggrQC.log
ulimit -a >> ${repRID}.aggrQC.log
# make run table
if [ "${params.inputBagForce}" == "" ] && [ "${params.fastqsForce}" == "" ] && [ "${params.speciesForce}" == "" ]
then
input="default"
else
input="override:"
if [ "${params.inputBagForce}" != "" ]
then
input=\$(echo \${input} inputBag)
fi
if [ "${params.fastqsForce}" != "" ]
then
input=\$(echo \${input} fastq)
fi
if [ "${params.speciesForce}" != "" ]
then
input=\$(echo \${input} species)
fi
fi
echo -e "LOG: creating run table" >> ${repRID}.aggrQC.log
echo -e "Session\tSession ID\tPipeline Version\tInput" > run.tsv
echo -e "Session\t${workflow.sessionId}\t${workflow.manifest.version}\t\${input}" >> run.tsv
# make RID table
echo -e "LOG: creating RID table" >> ${repRID}.aggrQC.log
echo -e "Replicate RID\tExperiment RID\tStudy RID" > rid.tsv
echo -e "${repRID}\t${expRID}\t${studyRID}" >> rid.tsv
echo -e "Replicate\tReplicate RID\tExperiment RID\tStudy RID" > rid.tsv
echo -e "Replicate\t${repRID}\t${expRID}\t${studyRID}" >> rid.tsv
# make metadata table
echo -e "LOG: creating metadata table" >> ${repRID}.aggrQC.log
echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tRaw Reads\tAssigned Reads\tMedian Read Length\tMedian TIN" > metadata.tsv
echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t-\t-\t'${readLengthM}'\t-" >> metadata.tsv
echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv
if [ "${params.speciesForce}" == "" ]
then
echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv
else
echo -e "Infered\t${speciesI} (FORCED)\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv
fi
echo -e "Measured\t-\t${endsManual}\t-\t-\t'${rawReadsI}'\t'${assignedReadsI}'\t'${readLengthI}'\t'${tinMedI}'" >> metadata.tsv
# make reference table
echo -e "LOG: creating referencerun table" >> ${repRID}.aggrQC.log
echo -e "Species\tGenome Reference Consortium Build\tGenome Reference Consortium Patch\tGENCODE Annotation Release" > reference.tsv
echo -e "Human\tGRCh\$(echo `echo ${params.refHuVersion} | cut -d "." -f 1`)\t\$(echo `echo ${params.refHuVersion} | cut -d "." -f 2`)\t'\$(echo `echo ${params.refHuVersion} | cut -d "." -f 3 | sed "s/^v//"`)'" >> reference.tsv
echo -e "Mouse\tGRCm\$(echo `echo ${params.refMoVersion} | cut -d "." -f 1`)\t\$(echo `echo ${params.refMoVersion} | cut -d "." -f 2`)\t'\$(echo `echo ${params.refMoVersion} | cut -d "." -f 3 | sed "s/^v//"`)'" >> reference.tsv
# remove inner distance report if it is empty (SE repRID)
echo -e "LOG: removing dummy inner distance file" >> ${repRID}.aggrQC.log
if [ "${endsM}" == "se" ]
......@@ -1081,5 +1163,4 @@ process outputBag {
cp ${multiqcJSON} Replicate_${repRID}.outputBag
bdbag Replicate_${repRID}.outputBag --archiver zip
"""
}
}
\ No newline at end of file
......@@ -13,10 +13,13 @@ if (!("count" %in% names(opt))){
stop("Count file doesn't exist, exiting.")
}
repRID <- basename(gsub(".featureCounts","",opt$count))
repRID <- basename(gsub(".countData","",opt$count))
count <- read.delim(opt$count, comment.char="#") # if featureCounts file changes structure, be sure to update count and Length columns below
colnames(count)[7] <- "count"
colnames(count)[1] <- "gene_name"
colnames(count)[7] <- "gene_id"
colnames(count)[8] <- "count"
count <- count[,c(1,7,2:6,8)]
rpk <- count$count/count$Length/1000
......
......@@ -7,18 +7,20 @@ option_list=list(
opt=parse_args(OptionParser(option_list=option_list))
rm(option_list)
countTable <- read.csv(paste0(opt$repRID,".countData.countTable.csv"), stringsAsFactors=FALSE)
countTable <- read.csv(paste0(opt$repRID,".countTable.csv"), stringsAsFactors=FALSE)
geneID <- read.delim("geneID.tsv", header=FALSE, stringsAsFactors=FALSE)
Entrez <- read.delim("Entrez.tsv", header=FALSE, stringsAsFactors=FALSE)
convert <- data.frame(geneID=countTable$Geneid)
convert <- merge(x=convert,y=geneID[,1:2],by.x="geneID",by.y="V2",all.x=TRUE)
convert <- data.frame(gene_name=countTable$gene_name)
convert <- merge(x=convert,y=geneID[,1:2],by.x="gene_name",by.y="V2",all.x=TRUE)
convert <- merge(x=convert,y=Entrez,by.x="V1",by.y="V1",all.x=TRUE)
convert[is.na(convert$V2),3] <- ""
convert <- convert[,-1]
colnames(convert) <- c("GeneID","EntrezID")
convert <- unique(convert)
output <- merge(x=convert,y=countTable[,c("Geneid","count","tpm")],by.x="GeneID",by.y="Geneid",all.x=TRUE)
output <- merge(x=convert,y=countTable[,c("gene_name","gene_id","count","tpm")],by.x="GeneID",by.y="gene_name",all.x=TRUE)
colnames(output) <- c("GENCODE_Gene_Symbol","NCBI_GeneID","Ensembl_GeneID","count","tpm")
output <- output[,c(1,3,2,4:5)]
write.table(output,file=paste0(opt$repRID,".tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE)
......@@ -11,5 +11,6 @@ data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
@pytest.mark.makeFeatureCounts
def test_makeFeatureCounts():
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.featureCounts'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.countData'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.countTable.csv'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.tpmTable.csv'))
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment