diff --git a/workflow/main.nf b/workflow/main.nf
index 4b1fa5bdbe6ef4c48f92a2f4d121d9a238be45f8..0332ec64dc402372cf9d8629a08dcb770e8bd925 100644
--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -14,6 +14,7 @@ params.genomeSize = params.genome ? params.genomes[ params.genome ].genomesize ?
 params.chromSizes = params.genome ? params.genomes[ params.genome ].chromsizes ?: false : false
 params.cutoffRatio = 1.2
 params.outDir= "$outDir"
+params.extendReadsLen = 100

 // Check inputs
 if( params.bwaIndex ){
@@ -38,6 +39,7 @@ genomeSize = params.genomeSize
 chromSizes = params.chromSizes
 cutoffRatio = params.cutoffRatio
 outDir = params.outDir
+extendReadsLen = params.extendReadsLen

 process checkDesignFile {

@@ -195,7 +197,7 @@ process experimentQC {

   script:
   """
-  python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign
+  python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign -e $extendReadsLen
   """

 }
diff --git a/workflow/scripts/experiment_qc.py b/workflow/scripts/experiment_qc.py
index 6f63b52993b54de042f57968d1cdcc1c21bc595c..7386fcffd8e982d4d432e99becfc2aacb23ad2f2 100644
--- a/workflow/scripts/experiment_qc.py
+++ b/workflow/scripts/experiment_qc.py
@@ -34,6 +34,11 @@ def get_args():
                         help="The design file to run QC (tsv format).",
                         required=True)

+    parser.add_argument('-e', '--extension',
+                        help="Number of base pairs to extend the reads",
+                        type=int,
+                        required=True)
+
     args = parser.parse_args()
     return args

@@ -51,7 +56,7 @@ def check_tools():
         raise Exception('Missing deeptools')


-def generate_read_summary(design):
+def generate_read_summary(design, extension):
     '''Generate summary of data based on read counts.'''

     bam_files = ' '.join(design['bam_reads'])
@@ -59,8 +64,8 @@ def generate_read_summary(design):
     mbs_filename = 'sample_mbs.npz'

     mbs_command = (
-        "multiBamSummary bins -p %d --bamfiles %s --labels %s -out %s"
-        % (cpu_count(), bam_files, labels, mbs_filename)
+        "multiBamSummary bins -p %d --bamfiles %s --extendReads %d --labels %s -out %s"
+        % (cpu_count(), bam_files, extension, labels, mbs_filename)
         )

     logger.info("Running deeptools with %s", mbs_command)
@@ -90,7 +95,7 @@ def check_correlation(mbs):
     out, err = spearman_correlation.communicate()


-def check_coverage(design):
+def check_coverage(design, extension):
     '''Asses the sequencing depth of samples.'''

     bam_files = ' '.join(design['bam_reads'])
@@ -100,8 +105,8 @@ def check_coverage(design):
         " --ignoreDuplicates --minMappingQuality 10"

     coverage_command = (
-        "plotCoverage -b %s --labels %s %s --plotFile %s"
-        % (bam_files, labels, coverage_params, coverage_filename)
+        "plotCoverage -b %s --extendReads %d --labels %s %s --plotFile %s"
+        % (bam_files, extension, labels, coverage_params, coverage_filename)
         )

     logger.info("Running deeptools with %s", coverage_command)
@@ -128,14 +133,14 @@ def update_controls(design):
     return design


-def check_enrichment(sample_id, control_id, sample_reads, control_reads):
+def check_enrichment(sample_id, control_id, sample_reads, control_reads, extension):
     '''Asses the enrichment per sample.'''

     fingerprint_filename = sample_id + '_fingerprint.png'

     fingerprint_command = (
-        "plotFingerprint -b %s %s --labels %s %s --plotFile %s"
-        % (sample_reads, control_reads, sample_id, control_id, fingerprint_filename)
+        "plotFingerprint -b %s %s --extendReads %d --labels %s %s --plotFile %s"
+        % (sample_reads, control_reads, extension, sample_id, control_id, fingerprint_filename)
         )

     logger.info("Running deeptools with %s", fingerprint_command)
@@ -147,6 +152,7 @@ def check_enrichment(sample_id, control_id, sample_reads, control_reads):
 def main():
     args = get_args()
     design = args.design
+    extension = args.extension

     # Create a file handler
     handler = logging.FileHandler('experiment_qc.log')
@@ -159,11 +165,11 @@ def main():
     design_df = pd.read_csv(design, sep='\t')

     # Run correlation
-    mbs_filename = generate_read_summary(design_df)
+    mbs_filename = generate_read_summary(design_df, extension)
     check_correlation(mbs_filename)

     # Run coverage
-    check_coverage(design_df)
+    check_coverage(design_df, extension)

     # Run enrichment
     new_design_df = update_controls(design_df)
@@ -172,7 +178,8 @@ def main():
             row['sample_id'],
             row['control_id'],
             row['bam_reads'],
-            row['control_reads'])
+            row['control_reads'],
+            extension)


 if __name__ == '__main__':
diff --git a/workflow/scripts/xcor.py b/workflow/scripts/xcor.py
index 52838594571b351de72fe03afb05f17840bbdc2a..f799137aa41caddf5f27729be9abb0eebeeb7482 100644
--- a/workflow/scripts/xcor.py
+++ b/workflow/scripts/xcor.py
@@ -61,20 +61,20 @@ def xcor(tag, paired):
     '''Use spp to calculate cross-correlation stats.'''

     tag_basename = os.path.basename(utils.strip_extensions(tag, ['.gz']))
-    uncompressed_tag_filename = tag_basename
-
-    out, err = utils.run_pipe([
-        'gzip -dc %s > ' % (tag)], outfile=uncompressed_tag_filename)
+
     # Subsample tagAlign file
     NREADS = 15000000
-
     subsampled_tag_filename = \
         tag_basename + ".%d.tagAlign.gz" % (NREADS/1000000)
+
+
     steps = [
-        'grep -v "chrM" %s' % (uncompressed_tag_filename),
-        'shuf -n %d --random-source=%s' % (NREADS, uncompressed_tag_filename)
-        ]
+        'zcat %s' % (tag),
+        'grep -v "chrM"',
+        'shuf -n %d --random-source=%s' % (NREADS, tag)]
+
     if paired:
         steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
diff --git a/workflow/tests/test_experiment_qc.py b/workflow/tests/test_experiment_qc.py
index 875aea6e4d955de53347b848705ffd5581752c23..308ebe2acc432740305e17c3cefef65d8ea105b6 100644
--- a/workflow/tests/test_experiment_qc.py
+++ b/workflow/tests/test_experiment_qc.py
@@ -40,5 +40,8 @@ def test_experiment_qc_singleend():

 @pytest.mark.integration
 def test_experiment_qc_pairedend():
-    # Do the same thing for paired end data
-    pass
+    assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz'))
+    assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.png'))
+    assert os.path.exists(os.path.join(test_output_path, 'coverage.png'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX_fingerprint.png'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB637LZP_fingerprint.png'))