diff --git a/workflow/main.nf b/workflow/main.nf index 6843ebbe6e7b1c890970a6ecaf97f9706148f84a..1103db0f58bc9ccff692063b3e29c49a847fa4bb 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -14,18 +14,10 @@ params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ? params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false params.cutoffRatio = 1.2 +params.outDir= "$baseDir/output" +params.extendReadsLen = 100 params.topPeakCount = 600 -// Define regular variables -pairedEnd = params.pairedEnd -designFile = params.designFile -genomeSize = params.genomeSize -genome = params.genome -chromSizes = params.chromSizes -fasta = params.fasta -cutoffRatio = params.cutoffRatio -topPeakCount = params.topPeakCount - // Check inputs if( params.bwaIndex ){ bwaIndex = Channel @@ -42,11 +34,21 @@ readsList = Channel .map { file -> [ file.getFileName().toString(), file.toString() ].join("\t")} .collectFile( name: 'fileList.tsv', newLine: true ) +// Define regular variables +pairedEnd = params.pairedEnd +designFile = params.designFile +genomeSize = params.genomeSize +chromSizes = params.chromSizes +fasta = params.fasta +cutoffRatio = params.cutoffRatio +outDir = params.outDir +extendReadsLen = params.extendReadsLen +topPeakCount = params.topPeakCount // Check design file for errors process checkDesignFile { - publishDir "$baseDir/output/design", mode: 'copy' + publishDir "$outDir/design", mode: 'copy' input: @@ -87,7 +89,7 @@ rawReads = designFilePaths process trimReads { tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' + publishDir "$outDir/${task.process}", mode: 'copy' input: @@ -117,7 +119,7 @@ process trimReads { process alignReads { tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' + publishDir "$outDir/${task.process}", mode: 'copy' input: @@ -148,7 +150,7 @@ process alignReads { process filterReads { tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' + publishDir "$outDir/${task.process}", mode: 'copy' input: @@ -181,13 +183,13 @@ process filterReads { dedupReads .map{ sampleId, bam, bai, experimentId, biosample, factor, treatment, replicate, controlId -> "$sampleId\t$bam\t$bai\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} -.collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") +.collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$outDir/design") .into { dedupDesign; preDiffDesign } // Quality Metrics using deeptools process experimentQC { - publishDir "$baseDir/output/${task.process}", mode: 'copy' + publishDir "$outDir/${task.process}", mode: 'copy' input: @@ -200,7 +202,7 @@ process experimentQC { script: """ - python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign + python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign -e $extendReadsLen """ } @@ -209,7 +211,7 @@ process experimentQC { process convertReads { tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' + publishDir "$outDir/${task.process}", mode: 'copy' input: @@ -238,7 +240,7 @@ process convertReads { process crossReads { tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' + publishDir "$outDir/${task.process}", mode: 'copy' input: @@ -268,12 +270,12 @@ process crossReads { xcorDesign = xcorReads .map{ sampleId, seTagAlign, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId -> "$sampleId\t$seTagAlign\t$tagAlign\t$xcor\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} - .collectFile(name:'design_xcor.tsv', seed:"sample_id\tse_tag_align\ttag_align\txcor\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") + .collectFile(name:'design_xcor.tsv', seed:"sample_id\tse_tag_align\ttag_align\txcor\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$outDir/design") // Make Experiment design files to be read in for downstream analysis process defineExpDesignFiles { - publishDir "$baseDir/output/design", mode: 'copy' + publishDir "$outDir/design", mode: 'copy' input: @@ -297,7 +299,7 @@ process poolAndPsuedoReads { tag "${experimentObjs.baseName}" - publishDir "$baseDir/output/design", mode: 'copy' + publishDir "$outDir/design", mode: 'copy' input: @@ -331,7 +333,7 @@ experimentRows = experimentPoolObjs process callPeaksMACS { tag "$sampleId-$replicate" - publishDir "$baseDir/output/${task.process}", mode: 'copy' + publishDir "$outDir/${task.process}", mode: 'copy' input: set sampleId, tagAlign, xcor, experimentId, biosample, factor, treatment, replicate, controlId, controlTagAlign from experimentRows @@ -359,12 +361,12 @@ process callPeaksMACS { peaksDesign = experimentPeaks .map{ sampleId, peak, fcSignal, pvalueSignal, experimentId, biosample, factor, treatment, replicate, controlId -> "$sampleId\t$peak\t$fcSignal\t$pvalueSignal\t$experimentId\t$biosample\t$factor\t$treatment\t$replicate\t$controlId\n"} - .collectFile(name:'design_peak.tsv', seed:"sample_id\tpeaks\tfc_signal\tpvalue_signal\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$baseDir/output/design") + .collectFile(name:'design_peak.tsv', seed:"sample_id\tpeaks\tfc_signal\tpvalue_signal\texperiment_id\tbiosample\tfactor\ttreatment\treplicate\tcontrol_id\n", storeDir:"$outDir/design") // Calculate Consensus Peaks process consensusPeaks { - publishDir "$baseDir/output/${task.process}", mode: 'copy' + publishDir "$outDir/${task.process}", mode: 'copy' input: diff --git a/workflow/scripts/experiment_qc.py b/workflow/scripts/experiment_qc.py index 6f63b52993b54de042f57968d1cdcc1c21bc595c..7386fcffd8e982d4d432e99becfc2aacb23ad2f2 100644 --- a/workflow/scripts/experiment_qc.py +++ b/workflow/scripts/experiment_qc.py @@ -34,6 +34,11 @@ def get_args(): help="The design file to run QC (tsv format).", required=True) + parser.add_argument('-e', '--extension', + help="Number of base pairs to extend the reads", + type=int, + required=True) + args = parser.parse_args() return args @@ -51,7 +56,7 @@ def check_tools(): raise Exception('Missing deeptools') -def generate_read_summary(design): +def generate_read_summary(design, extension): '''Generate summary of data based on read counts.''' bam_files = ' '.join(design['bam_reads']) @@ -59,8 +64,8 @@ def generate_read_summary(design): mbs_filename = 'sample_mbs.npz' mbs_command = ( - "multiBamSummary bins -p %d --bamfiles %s --labels %s -out %s" - % (cpu_count(), bam_files, labels, mbs_filename) + "multiBamSummary bins -p %d --bamfiles %s --extendReads %d --labels %s -out %s" + % (cpu_count(), bam_files, extension, labels, mbs_filename) ) logger.info("Running deeptools with %s", mbs_command) @@ -90,7 +95,7 @@ def check_correlation(mbs): out, err = spearman_correlation.communicate() -def check_coverage(design): +def check_coverage(design, extension): '''Asses the sequencing depth of samples.''' bam_files = ' '.join(design['bam_reads']) @@ -100,8 +105,8 @@ def check_coverage(design): " --ignoreDuplicates --minMappingQuality 10" coverage_command = ( - "plotCoverage -b %s --labels %s %s --plotFile %s" - % (bam_files, labels, coverage_params, coverage_filename) + "plotCoverage -b %s --extendReads %d --labels %s %s --plotFile %s" + % (bam_files, extension, labels, coverage_params, coverage_filename) ) logger.info("Running deeptools with %s", coverage_command) @@ -128,14 +133,14 @@ def update_controls(design): return design -def check_enrichment(sample_id, control_id, sample_reads, control_reads): +def check_enrichment(sample_id, control_id, sample_reads, control_reads, extension): '''Asses the enrichment per sample.''' fingerprint_filename = sample_id + '_fingerprint.png' fingerprint_command = ( - "plotFingerprint -b %s %s --labels %s %s --plotFile %s" - % (sample_reads, control_reads, sample_id, control_id, fingerprint_filename) + "plotFingerprint -b %s %s --extendReads %d --labels %s %s --plotFile %s" + % (sample_reads, control_reads, extension, sample_id, control_id, fingerprint_filename) ) logger.info("Running deeptools with %s", fingerprint_command) @@ -147,6 +152,7 @@ def check_enrichment(sample_id, control_id, sample_reads, control_reads): def main(): args = get_args() design = args.design + extension = args.extension # Create a file handler handler = logging.FileHandler('experiment_qc.log') @@ -159,11 +165,11 @@ def main(): design_df = pd.read_csv(design, sep='\t') # Run correlation - mbs_filename = generate_read_summary(design_df) + mbs_filename = generate_read_summary(design_df, extension) check_correlation(mbs_filename) # Run coverage - check_coverage(design_df) + check_coverage(design_df, extension) # Run enrichment new_design_df = update_controls(design_df) @@ -172,7 +178,8 @@ def main(): row['sample_id'], row['control_id'], row['bam_reads'], - row['control_reads']) + row['control_reads'], + extension) if __name__ == '__main__':