Skip to content
Snippets Groups Projects
Commit d14cdf77 authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Merge branch '0.0.3' into 'develop'

0.0.3

Closes #74, #73, #72, #71, and #61

See merge request !42
parents 64c587de 1bfecd57
Branches
Tags
2 merge requests!43Develop,!420.0.3
Pipeline #8026 canceled with stages
in 27 minutes and 46 seconds
......@@ -14,6 +14,11 @@ stages:
getBag:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json
- singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6
......@@ -21,14 +26,24 @@ getBag:
getData:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
- unzip ./test_data/bagit/Replicate_Q-Y5F6.zip
- unzip ./test_data/bag/Replicate_Q-Y5F6.zip
- singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6 TEST
- pytest -m getData
parseMetadata:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- rep=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p repRID)
- exp=$(singularity run 'docker://bicf/python3:2.0.1_indev' python3 ./workflow/scripts/parseMeta.py -r Replicate_RID -m "./test_data/meta/metaTest.csv" -p expRID)
......@@ -44,6 +59,11 @@ parseMetadata:
inferMetadata:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- >
align=$(echo $(grep "Overall alignment rate" ./test_data/meta/Q-Y5F6_1M.se.alignSummary.txt | cut -f2 -d ':' | cut -f2 -d ' ' | tr -d '%')) &&
......@@ -56,6 +76,11 @@ inferMetadata:
getRef:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- mkdir -p hu
- mkdir -p mo
......@@ -64,6 +89,11 @@ getRef:
trimData:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --basename Q-Y5F6_1M.se ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz
- singularity run 'docker://bicf/trimgalore:1.1' trim_galore --gzip -q 25 --length 35 --paired --basename Q-Y5F6_1M.pe ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz ./test_data/fastq/small/Q-Y5F6_1M.R2.fastq.gz
......@@ -73,12 +103,22 @@ trimData:
downsampleData:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- singularity run 'docker://bicf/seqtk:2.0.1_indev' seqtk sample -s100 ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz 1000 1> sampled.1.fq
- pytest -m downsampleData
alignData:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' hisat2 -p 20 --add-chrname --un-gz Q-Y5F6_1M.se.unal.gz -S Q-Y5F6_1M.se.sam -x /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2/genome --rna-strandness F -U ./test_data/fastq/small/Q-Y5F6_1M.se_trimmed.fq.gz --summary-file Q-Y5F6_1M.se.alignSummary.txt --new-summary
- singularity run 'docker://bicf/gudmaprbkaligner:2.0.1_indev' samtools view -1 -@ 20 -F 4 -F 8 -F 256 -o Q-Y5F6_1M.se.bam Q-Y5F6_1M.se.sam
......@@ -92,6 +132,11 @@ alignData:
dedupData:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' java -jar /picard/build/libs/picard.jar MarkDuplicates I=./test_data/bam/small/Q-Y5F6_1M.se.sorted.bam O=Q-Y5F6_1M.se.deduped.bam M=Q-Y5F6_1M.se.deduped.Metrics.txt REMOVE_DUPLICATES=true
- singularity run 'docker://bicf/gudmaprbkdedup:2.0.0' samtools sort -@ 20 -O BAM -o Q-Y5F6_1M.se.sorted.deduped.bam ./test_data/bam/small/Q-Y5F6_1M.se.deduped.bam
......@@ -104,26 +149,49 @@ dedupData:
countData:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' -o Q-Y5F6_1M.se.featureCounts -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
- singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.featureCounts
- ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/geneID.tsv
- ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/Entrez.tsv
- singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
- singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count ./test_data/counts/small/Q-Y5F6_1M.se.countData
- singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se
- assignedReads=$(grep -m 1 'Assigned' *.summary | grep -oe '\([0-9.]*\)')
- pytest -m makeFeatureCounts
makeBigWig:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- singularity run 'docker://bicf/deeptools3.3:2.0.1_indev' bamCoverage -p 20 -b ./test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam -o Q-Y5F6_1M.se.bw
- pytest -m makeBigWig
fastqc:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- singularity run 'docker://bicf/fastqc:2.0.1_indev' fastqc ./test_data/fastq/small/Q-Y5F6_1M.R1.fastq.gz -o .
- pytest -m fastqc
dataQC:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- echo -e "geneID\tchrom\ttx_start\ttx_end\tTIN" > Q-Y5F6_1M.se.sorted.deduped.tin.xls
- for i in {"chr8","chr4","chrY"}; do
......@@ -132,6 +200,11 @@ dataQC:
outputBag:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- mkdir test
- singularity run 'docker://bicf/gudmaprbkfilexfer:2.0.1_indev' bdbag test --archiver zip
......@@ -140,6 +213,11 @@ outputBag:
integration_se:
stage: integration
only:
- merge_requests
except:
refs:
- master
script:
- hostname
- ulimit -a
......@@ -150,11 +228,21 @@ integration_se:
when: always
paths:
- output/qc/
- output/report/
- SE_multiqc_data.json
expire_in: 7 days
retry:
max: 1
when:
- always
integration_pe:
stage: integration
only:
- merge_requests
except:
refs:
- master
script:
- hostname
- ulimit -a
......@@ -166,11 +254,91 @@ integration_pe:
paths:
- dag.png
- output/qc/
- output/report/
- PE_multiqc_data.json
expire_in: 7 days
retry:
max: 1
when:
- always
override_inputBag:
stage: integration
only:
- merge_requests
except:
refs:
- master
script:
- hostname
- ulimit -a
- nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --inputBagForce ./test_data/bag/Replicate_Q-Y5F6.zip --ci true
- find . -type f -name "multiqc_data.json" -exec cp {} ./inputBagOverride_PE_multiqc_data.json \;
artifacts:
name: "$CI_JOB_NAME"
when: always
paths:
- inputBagOverride_PE_multiqc_data.json
expire_in: 7 days
retry:
max: 1
when:
- always
override_fastq:
stage: integration
only:
- merge_requests
except:
refs:
- master
script:
- hostname
- ulimit -a
- nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5F6 --fastqsForce './test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz' --ci true
- find . -type f -name "multiqc_data.json" -exec cp {} ./fastqOverride_PE_multiqc_data.json \;
artifacts:
name: "$CI_JOB_NAME"
when: always
paths:
- fastqOverride_PE_multiqc_data.json
expire_in: 7 days
retry:
max: 1
when:
- always
override_species:
stage: integration
only:
- merge_requests
except:
refs:
- master
script:
- hostname
- ulimit -a
- nextflow -q run ./workflow/rna-seq.nf --deriva ./test_data/auth/credential.json --bdbag ./test_data/auth/cookies.txt --repRID Q-Y5ER --speciesForce 'Homo sapiens' --ci true
- find . -type f -name "multiqc_data.json" -exec cp {} ./speciesOverride_PE_multiqc_data.json \;
artifacts:
name: "$CI_JOB_NAME"
when: always
paths:
- speciesOverride_PE_multiqc_data.json
expire_in: 7 days
retry:
max: 1
when:
- always
consistency:
stage: consistency
only:
- merge_requests
except:
refs:
- master
script:
- grep -m 1 \"Assigned\":.[0-9] SE_multiqc_data.json | grep -oe '\([0-9.]*\)' > assignedSE.txt
- grep -m 1 \"Assigned\":.[0-9] PE_multiqc_data.json | grep -oe '\([0-9.]*\)' > assignedPE.txt
......
# v0.0.2 (in development)
# v0.0.3 (in development)
**User Facing**
* TPM table:
* Add Ensembl Gene ID
* Rename columns: *GENCODE_Gene_Symbol*, *Ensembl_GeneID*, *NCBI_GeneID*
* MultiQC output custom tables (html+JSON):
* Run table: *Session ID* and *Pipeline Version*
* Reference Table: *Species*, *Genome Reference Consortium Build*, *Genome Reference Consortium Patch*, *GENCODE Annotation Release* (outputs both human and mouse versions)
* Add inputBag override param (`inputBagForce`) [`*.zip`]
* Uses provided inputBag instead of downloading from data-hub
* Still requires matching repRID input param
* Add fastq override param (`fastqsForce`) [`R1`,`R2`]
* Uses provided fastq instead of downloading from data-hub
* Still requires matching repRID input param and will pull inputBag from data-hub to access submitted metadata for reporting
* Add species override param (`speciesForce`) [`Mus musculus` or `Homo sapiens`]
* forces the use of the provided species
* ignores inferred ambiguous species
**Background**
* Add GeneSymbol/EnsemblID/EntrezID translation files to references
*Known Bugs*
* outputBag does not contain fetch for processed data
* Does not include automatic data upload
* Override params (inputBag, fastq, species) aren't checked for integrity
<hr>
# v0.0.2
**User Facing**
* Output:
* inputBag
......@@ -19,4 +47,4 @@
**INITIAL BETA VERSION**\
Does not include automatic data upload\
This version is for initial upload of test data to GUDMAP/RBK data-hub for internal integration
<hr>
<hr>
\ No newline at end of file
......@@ -48,6 +48,13 @@ To Run:
* reference version consists of Genome Reference Consortium version, patch release and GENCODE annotation release # (leaving the params blank will use the default version tied to the pipeline version)
* *current mouse* **38.p6.vM22** = GRCm38.p6 with GENCODE annotation release M22
* *current human* **38.p12.v31** = GRCh38.p12 with GENCODE annotation release 31
* ***Optional*** input overrides
* `--inputBagForce` utilizes a local replicate inputBag instead of downloading from the data-hub (still requires accurate repRID input)
* eg: `--inputBagForce test_data/bag/Replicate_Q-Y5F6.zip` (must be the expected bag structure)
* `--fastqsForce` utilizes local fastq's instead of downloading from the data-hub (still requires accurate repRID input)
* eg: `--fastqsForce 'test_data/fastq/small/Q-Y5F6_1M.R{1,2}.fastq.gz'` (note the quotes around fastq's which must be named in the correct standard [*\*.R1.fastq.gz and/or \*.R2.fastq.gz*] and in the correct order)
* `--speciesForce` forces the species to be "Mus musculus" or "Homo sapiens", it bypasses ambiguous species error
* eg: `--speciesForce 'Mus musculus'`
* Tracking parameters ([Tracking Site](http://bicf.pipeline.tracker.s3-website-us-east-1.amazonaws.com/)):
* `--ci` boolean (default = false)
* `--dev` boolean (default = false)
......
docs/dag.png

674 KiB | W: | H:

docs/dag.png

693 KiB | W: | H:

docs/dag.png
docs/dag.png
docs/dag.png
docs/dag.png
  • 2-up
  • Swipe
  • Onion skin
......@@ -9,12 +9,12 @@ mkdir -p NEW_test_data
ln -sfn `readlink -e ./test_data/auth/credential.json` ~/.deriva/credential.json
mkdir -p ./NEW_test_data/bagit
mkdir -p ./NEW_test_data/bag
singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' deriva-download-cli dev.gudmap.org --catalog 2 ./workflow/conf/replicate_export_config.json . rid=Q-Y5F6
cp Replicate_Q-Y5F6.zip ./NEW_test_data/bagit/Replicate_Q-Y5F6.zip
cp Replicate_Q-Y5F6.zip ./NEW_test_data/bag/Replicate_Q-Y5F6.zip
mkdir -p ./NEW_test_data/fastq
unzip ./test_data/bagit/Replicate_Q-Y5F6.zip
unzip ./test_data/bag/Replicate_Q-Y5F6.zip
singularity run 'docker://bicf/gudmaprbkfilexfer:1.3' bash ./workflow/scripts/bdbagFetch.sh Replicate_Q-Y5F6 Replicate_Q-Y5F6
cp Replicate_Q-Y5F6.R1.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R1.fastq.gz
cp Replicate_Q-Y5F6.R2.fastq.gz ./NEW_test_data/fastq/Replicate_Q-Y5F6.R2.fastq.gz
......@@ -81,10 +81,14 @@ cp Q-Y5F6_1M.se.sorted.deduped.chrY.bam.bai ./NEW_test_data/bam/small/Q-Y5F6_1M.
mkdir -p ./NEW_test_data/counts
mkdir -p ./NEW_test_data/counts/small
singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' -o Q-Y5F6_1M.se.featureCounts -s 1 -R SAM --primary --ignoreDup ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count Q-Y5F6_1M.se.featureCounts
cp Q-Y5F6_1M.se.featureCounts ./NEW_test_data/counts/small/Q-Y5F6_1M.se.featureCounts
# link the gene symbol / Ensembl / Entrez translation tables read by convertGeneSymbols.R
ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/geneID.tsv
ln -s /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/Entrez.tsv
# count features (gene_id carried along as an extra attribute), then build the count and TPM tables
singularity run 'docker://bicf/subread2:2.0.0' featureCounts -T 20 -a /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.gtf -G /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o Q-Y5F6_1M.se.countData -s 1 -R SAM --primary --ignoreDup ./NEW_test_data/bam/small/Q-Y5F6_1M.se.sorted.deduped.bam
singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/calculateTPM.R --count Q-Y5F6_1M.se.countData
singularity run 'docker://bicf/subread2:2.0.0' Rscript ./workflow/scripts/convertGeneSymbols.R --repRID Q-Y5F6_1M.se
# store each output under its own name (the source file must match the destination file)
cp Q-Y5F6_1M.se.countData ./NEW_test_data/counts/small/Q-Y5F6_1M.se.countData
cp Q-Y5F6_1M.se.countTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se.countTable.csv
cp Q-Y5F6_1M.se.tpmTable.csv ./NEW_test_data/counts/small/Q-Y5F6_1M.se.tpmTable.csv
mkdir -p ./NEW_test_data/bw
mkdir -p ./NEW_test_data/bw/small
......
......@@ -48,14 +48,32 @@ top_modules:
- '*infer_experiment*'
report_section_order:
run:
order: 4000
rid:
order: 2000
order: 3000
meta:
order: 2000
ref:
order: 1000
skip_generalstats: true
custom_data:
run:
file_format: 'tsv'
section_name: 'Run'
description: 'This is the run information'
plot_type: 'table'
pconfig:
id: 'run'
scale: false
format: '{}'
headers:
Session
Session ID
Pipeline Version
Input
rid:
file_format: 'tsv'
section_name: 'RID'
......@@ -63,7 +81,10 @@ custom_data:
plot_type: 'table'
pconfig:
id: 'rid'
scale: false
format: '{}'
headers:
Replicate
Replicate RID
Experiment RID
Study RID
......@@ -74,6 +95,7 @@ custom_data:
plot_type: 'table'
pconfig:
id: 'meta'
scale: false
format: '{:,.0f}'
headers:
Source
......@@ -85,6 +107,21 @@ custom_data:
Assigned Reads
Median Read Length
Median TIN
Pipeline Version
ref:
  # Custom-content table with the reference versions; its data file is matched by the 'ref' search pattern (reference.tsv)
  file_format: 'tsv'
  section_name: 'Reference'
  description: 'This is the reference version information'
  plot_type: 'table'
  pconfig:
    id: 'ref'
    scale: false
    format: '{}'
  # Column names, matching the header row written to reference.tsv
  headers:
    Species
    Genome Reference Consortium Build
    Genome Reference Consortium Patch
    GENCODE Annotation Release
tin:
file_format: 'tsv'
section_name: 'TIN'
......@@ -106,9 +143,13 @@ custom_data:
90 - 99
sp:
run:
fn: "run.tsv"
rid:
fn: 'rid.tsv'
meta:
fn: 'metadata.tsv'
ref:
fn: 'reference.tsv'
tin:
fn: '*.tin.hist.tsv'
fn: '*.tin.hist.tsv'
\ No newline at end of file
......@@ -97,6 +97,6 @@ manifest {
homePage = 'https://git.biohpc.swmed.edu/gudmap_rbk/rna-seq'
description = 'This pipeline was created to be a standard mRNA-sequencing analysis pipeline which integrates with the GUDMAP and RBK consortium data-hub.'
mainScript = 'rna-seq.nf'
version = 'v0.0.2_indev'
version = 'v0.0.3_indev'
nextflowVersion = '>=19.09.0'
}
......@@ -19,6 +19,11 @@ params.refHuVersion = "38.p12.v31"
params.refERCCVersion = "92"
params.outDir = "${baseDir}/../output"
// Define override input variable
params.inputBagForce = ""
params.fastqsForce = ""
params.speciesForce = ""
// Parse input variables
deriva = Channel
.fromPath(params.deriva)
......@@ -32,6 +37,9 @@ refHuVersion = params.refHuVersion
refERCCVersion = params.refERCCVersion
outDir = params.outDir
logsDir = "${outDir}/Logs"
inputBagForce = params.inputBagForce
fastqsForce = params.fastqsForce
speciesForce = params.speciesForce
// Define fixed files
derivaConfig = Channel.fromPath("${baseDir}/conf/replicate_export_config.json")
......@@ -117,7 +125,10 @@ process getBag {
path derivaConfig
output:
path ("Replicate_*.zip") into bagit
path ("Replicate_*.zip") into bag
when:
inputBagForce == ""
script:
"""
......@@ -131,12 +142,21 @@ process getBag {
echo -e "LOG: linked" >> ${repRID}.getBag.log
# deriva-download replicate RID
echo -e "LOG: fetching bagit for ${repRID} in GUDMAP" >> ${repRID}.getBag.log
echo -e "LOG: fetching bag for ${repRID} in GUDMAP" >> ${repRID}.getBag.log
deriva-download-cli ${source} --catalog 2 ${derivaConfig} . rid=${repRID}
echo -e "LOG: fetched" >> ${repRID}.getBag.log
"""
}
// Set inputBag to downloaded or forced input
if (inputBagForce != "") {
inputBag = Channel
.fromPath(inputBagForce)
.ifEmpty { exit 1, "override inputBag file not found: ${inputBagForce}" }
} else {
inputBag = bag
}
/*
* getData: fetch study files from consortium with downloaded bdbag.zip
*/
......@@ -146,7 +166,7 @@ process getData {
input:
path script_bdbagFetch
path cookies, stageAs: "deriva-cookies.txt" from bdbag
path bagit
path inputBag
output:
path ("*.R{1,2}.fastq.gz") into fastqs
......@@ -158,33 +178,43 @@ process getData {
"""
hostname > ${repRID}.getData.log
ulimit -a >> ${repRID}.getData.log
# link deriva cookie for authentication
echo -e "LOG: linking deriva cookie" >> ${repRID}.getData.log
mkdir -p ~/.bdbag
ln -sf `readlink -e deriva-cookies.txt` ~/.bdbag/deriva-cookies.txt
echo -e "LOG: linked" >> ${repRID}.getData.log
# get bagit basename
replicate=\$(basename "${bagit}" | cut -d "." -f1)
echo -e "LOG: bagit replicate name \${replicate}" >> ${repRID}.getData.log
# get bag basename
replicate=\$(basename "${inputBag}" | cut -d "." -f1)
echo -e "LOG: bag replicate name \${replicate}" >> ${repRID}.getData.log
# unzip bagit
echo -e "LOG: unzipping replicate bagit" >> ${repRID}.getData.log
unzip ${bagit}
# unzip bag
echo -e "LOG: unzipping replicate bag" >> ${repRID}.getData.log
unzip ${inputBag}
echo -e "LOG: unzipped" >> ${repRID}.getData.log
# bagit fetch fastq's only and rename by repRID
# bag fetch fastq's only and rename by repRID
echo -e "LOG: fetching replicate bdbag" >> ${repRID}.getData.log
sh ${script_bdbagFetch} \${replicate} ${repRID}
echo -e "LOG: fetched" >> ${repRID}.getData.log
"""
}
// Replicate raw fastq's for multiple process inputs
fastqs.into {
fastqs_trimData
fastqs_fastqc
// Set raw fastq to downloaded or forced input and replicate them for multiple process inputs
if (fastqsForce != "") {
  // Forced input: read the fastq's from the user-provided path/glob instead of the getData output
  Channel
    .fromPath(fastqsForce)
    // abort with a fastq-specific message when the override path/glob matches nothing
    .ifEmpty { exit 1, "override fastq file(s) not found: ${fastqsForce}" }
    .collect().into {
      fastqs_trimData
      fastqs_fastqc
    }
} else {
  // Default: use the fastq's emitted by the getData process
  fastqs.into {
    fastqs_trimData
    fastqs_fastqc
  }
}
/*
......@@ -533,7 +563,24 @@ process inferMetadata {
bed="./GRCm/bed/genome.bed"
else
echo -e "LOG: ERROR - inference of species returns an ambiguous result: hu=\${align_hu} mo=\${align_mo}" >> ${repRID}.inferMetadata.log
exit 1
if [ "${speciesForce}" == "" ]
then
exit 1
fi
fi
# apply the species override (if provided); it replaces the inferred species and its bam/bed selection
if [ "${speciesForce}" != "" ]
then
  # write to the process log like every other LOG line (was printed to stdout only)
  echo -e "LOG: species overridden to: ${speciesForce}" >> ${repRID}.inferMetadata.log
  species="${speciesForce}"
  if [ "${speciesForce}" == "Homo sapiens" ]
  then
    bam="GRCh.sampled.sorted.bam"
    bed="./GRCh/bed/genome.bed"
  elif [ "${speciesForce}" == "Mus musculus" ]
  then
    bam="GRCm.sampled.sorted.bam"
    bed="./GRCm/bed/genome.bed"
  else
    # an unsupported value would leave bam/bed unset or inconsistent with the forced species
    echo -e "LOG: ERROR - unsupported species override: ${speciesForce} (expected 'Homo sapiens' or 'Mus musculus')" >> ${repRID}.inferMetadata.log
    exit 1
  fi
fi
echo -e "LOG: inference of species results in: \${species}" >> ${repRID}.inferMetadata.log
......@@ -875,10 +922,10 @@ process countData {
echo -e "LOG: counting ${ends} features" >> ${repRID}.countData.log
if [ "${ends}" == "se" ]
then
featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
elif [ "${ends}" == "pe" ]
then
featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
featureCounts -T `nproc` -a ./genome.gtf -G ./genome.fna -g 'gene_name' --extraAttributes 'gene_id' -o ${repRID}.countData -s \${stranding} -p -B -R SAM --primary --ignoreDup ${repRID}.sorted.deduped.bam
fi
echo -e "LOG: counted" >> ${repRID}.countData.log
......@@ -1034,18 +1081,53 @@ process aggrQC {
hostname > ${repRID}.aggrQC.log
ulimit -a >> ${repRID}.aggrQC.log
# make run table
if [ "${params.inputBagForce}" == "" ] && [ "${params.fastqsForce}" == "" ] && [ "${params.speciesForce}" == "" ]
then
input="default"
else
input="override:"
if [ "${params.inputBagForce}" != "" ]
then
input=\$(echo \${input} inputBag)
fi
if [ "${params.fastqsForce}" != "" ]
then
input=\$(echo \${input} fastq)
fi
if [ "${params.speciesForce}" != "" ]
then
input=\$(echo \${input} species)
fi
fi
echo -e "LOG: creating run table" >> ${repRID}.aggrQC.log
echo -e "Session\tSession ID\tPipeline Version\tInput" > run.tsv
echo -e "Session\t${workflow.sessionId}\t${workflow.manifest.version}\t\${input}" >> run.tsv
# make RID table
echo -e "LOG: creating RID table" >> ${repRID}.aggrQC.log
echo -e "Replicate RID\tExperiment RID\tStudy RID" > rid.tsv
echo -e "${repRID}\t${expRID}\t${studyRID}" >> rid.tsv
echo -e "Replicate\tReplicate RID\tExperiment RID\tStudy RID" > rid.tsv
echo -e "Replicate\t${repRID}\t${expRID}\t${studyRID}" >> rid.tsv
# make metadata table
echo -e "LOG: creating metadata table" >> ${repRID}.aggrQC.log
echo -e "Source\tSpecies\tEnds\tStranded\tSpike-in\tRaw Reads\tAssigned Reads\tMedian Read Length\tMedian TIN" > metadata.tsv
echo -e "Submitter\t${speciesM}\t${endsM}\t${strandedM}\t${spikeM}\t-\t-\t'${readLengthM}'\t-" >> metadata.tsv
echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv
if [ "${params.speciesForce}" == "" ]
then
echo -e "Infered\t${speciesI}\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv
else
echo -e "Infered\t${speciesI} (FORCED)\t${endsI}\t${strandedI}\t${spikeI}\t-\t-\t-\t-" >> metadata.tsv
fi
echo -e "Measured\t-\t${endsManual}\t-\t-\t'${rawReadsI}'\t'${assignedReadsI}'\t'${readLengthI}'\t'${tinMedI}'" >> metadata.tsv
# make reference table
# one row per species: build and patch are fields 1 and 2 of the "." separated version param, annotation release is field 3 without its leading "v"
echo -e "LOG: creating reference table" >> ${repRID}.aggrQC.log
echo -e "Species\tGenome Reference Consortium Build\tGenome Reference Consortium Patch\tGENCODE Annotation Release" > reference.tsv
echo -e "Human\tGRCh\$(echo `echo ${params.refHuVersion} | cut -d "." -f 1`)\t\$(echo `echo ${params.refHuVersion} | cut -d "." -f 2`)\t'\$(echo `echo ${params.refHuVersion} | cut -d "." -f 3 | sed "s/^v//"`)'" >> reference.tsv
echo -e "Mouse\tGRCm\$(echo `echo ${params.refMoVersion} | cut -d "." -f 1`)\t\$(echo `echo ${params.refMoVersion} | cut -d "." -f 2`)\t'\$(echo `echo ${params.refMoVersion} | cut -d "." -f 3 | sed "s/^v//"`)'" >> reference.tsv
# remove inner distance report if it is empty (SE repRID)
echo -e "LOG: removing dummy inner distance file" >> ${repRID}.aggrQC.log
if [ "${endsM}" == "se" ]
......@@ -1081,5 +1163,4 @@ process outputBag {
cp ${multiqcJSON} Replicate_${repRID}.outputBag
bdbag Replicate_${repRID}.outputBag --archiver zip
"""
}
}
\ No newline at end of file
......@@ -13,10 +13,13 @@ if (!("count" %in% names(opt))){
stop("Count file doesn't exist, exiting.")
}
repRID <- basename(gsub(".featureCounts","",opt$count))
repRID <- basename(gsub(".countData","",opt$count))
count <- read.delim(opt$count, comment.char="#") # if featureCounts file changes structure, be sure to update count and Length columns below
colnames(count)[7] <- "count"
colnames(count)[1] <- "gene_name"
colnames(count)[7] <- "gene_id"
colnames(count)[8] <- "count"
count <- count[,c(1,7,2:6,8)]
rpk <- count$count/count$Length/1000
......
......@@ -7,18 +7,20 @@ option_list=list(
opt=parse_args(OptionParser(option_list=option_list))
rm(option_list)
countTable <- read.csv(paste0(opt$repRID,".countData.countTable.csv"), stringsAsFactors=FALSE)
countTable <- read.csv(paste0(opt$repRID,".countTable.csv"), stringsAsFactors=FALSE)
geneID <- read.delim("geneID.tsv", header=FALSE, stringsAsFactors=FALSE)
Entrez <- read.delim("Entrez.tsv", header=FALSE, stringsAsFactors=FALSE)
convert <- data.frame(geneID=countTable$Geneid)
convert <- merge(x=convert,y=geneID[,1:2],by.x="geneID",by.y="V2",all.x=TRUE)
convert <- data.frame(gene_name=countTable$gene_name)
convert <- merge(x=convert,y=geneID[,1:2],by.x="gene_name",by.y="V2",all.x=TRUE)
convert <- merge(x=convert,y=Entrez,by.x="V1",by.y="V1",all.x=TRUE)
convert[is.na(convert$V2),3] <- ""
convert <- convert[,-1]
colnames(convert) <- c("GeneID","EntrezID")
convert <- unique(convert)
output <- merge(x=convert,y=countTable[,c("Geneid","count","tpm")],by.x="GeneID",by.y="Geneid",all.x=TRUE)
output <- merge(x=convert,y=countTable[,c("gene_name","gene_id","count","tpm")],by.x="GeneID",by.y="gene_name",all.x=TRUE)
colnames(output) <- c("GENCODE_Gene_Symbol","NCBI_GeneID","Ensembl_GeneID","count","tpm")
output <- output[,c(1,3,2,4:5)]
write.table(output,file=paste0(opt$repRID,".tpmTable.csv"),sep=",",row.names=FALSE,quote=FALSE)
......@@ -11,5 +11,6 @@ data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
@pytest.mark.makeFeatureCounts
def test_makeFeatureCounts():
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.featureCounts'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.countData'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.countTable.csv'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.tpmTable.csv'))
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment