From 89c92593bec9d52c7b3848c8efc8eecd2481c92f Mon Sep 17 00:00:00 2001 From: Holly Ruess <s185797@biohpcwsc012.biohpc.swmed.edu> Date: Wed, 18 Dec 2019 08:54:55 -0600 Subject: [PATCH] fix experiment qc --- .gitlab-ci.yml | 1 + CHANGELOG.md | 1 + README.md | 7 ++- workflow/main.nf | 29 +++++---- workflow/scripts/experiment_qc.py | 88 +++++++++++----------------- workflow/tests/test_experiment_qc.py | 51 +++++++++++++--- 6 files changed, 102 insertions(+), 75 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0f8a5b7..23a892a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -12,6 +12,7 @@ user_configuration: stage: unit script: - pytest -m unit + - pytest -m unit --cov=./workflow/scripts single_end_human: stage: integration diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f3f686..84794a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file. ### Fixed - Removed biosample, factor, treatment from design file - Updated documentation + - Output graphics are in pdf format ### Added - Changelog diff --git a/README.md b/README.md index 1e206ec..6a965f5 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git 2. Trim adaptors with TrimGalore! 3. Map reads with BWA, filter with SamTools, and sort with Sambamba 4. Mark duplicates with Sambamba, Filter reads with SamTools, and calculate library complexity with SamTools and bedtools + 5. QC metrics with deep tools ## Output Files @@ -63,7 +64,11 @@ filterReads | *.dedup.bam | filtered bam file with duplicate reads removed filterReads | *.dedup.bam.bai | indexed filtered bam file filterReads | *.dedup.flagstat.qc | QC metrics of filtered bam file (mapping stats, samtools) filterReads | *.dedup.pbc.qc | QC metrics of library complexity - +experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample +experimentQC | *_fingerprint.pdf | plot to determine if the antibody-treatment enriched sufficiently +experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples +experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between samples +experimentQC | sample_mbs.npz | array of multiple BAM summaries ## Common Errors diff --git a/workflow/main.nf b/workflow/main.nf index 379169f..0b4352c 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -251,28 +251,35 @@ process filterReads { // Define channel collecting dedup reads into new design file dedupReads .map{ sampleId, bam, bai, experimentId, replicate -> -"$sampleId\t$bam\t$bai\t$experimentId\t$replicate\n"} -.collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\treplicate\n", storeDir:"$baseDir/output/design") +"${sampleId}\t${bam}\t${bai}\t${experimentId}\t${replicate}\n"} +.collectFile(name: 'design_dedup.tsv', seed: "sample_id\tbam_reads\tbam_index\texperiment_id\treplicate\n", storeDir: "${outDir}/design") .into { dedupDesign; preDiffDesign } // Quality Metrics using deeptools process experimentQC { - publishDir "$baseDir/output/${task.process}", mode: 'copy' + publishDir "${outDir}/${task.process}", mode: 'copy' input: - - file dedupDesign + file dedupDesign output: - - file '*.{png,npz}' into deepToolsStats + file '*.{png,npz}' into deepToolsStats + file('version_*.txt') into experimentQCVersions script: - - """ - python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign -a - """ + if (params.astrocyte == true) { + """ + module load python/3.6.1-2-anaconda + module load deeptools/2.5.0.1 + python3 ${baseDir}/scripts/experiment_qc.py -d ${dedupDesign} + """ + } + else { + """ + python3 ${baseDir}/scripts/experiment_qc.py -d ${dedupDesign} + """ + } } diff --git a/workflow/scripts/experiment_qc.py b/workflow/scripts/experiment_qc.py index afc5f3a..27157a4 100644 --- a/workflow/scripts/experiment_qc.py +++ b/workflow/scripts/experiment_qc.py @@ -35,11 +35,6 @@ def get_args(): help="The design file to run QC (tsv format).", required=True) - parser.add_argument('-a', '--atac', - help="True/False if ATAC-seq or ChIP-seq.", - default=False, - action='store_true') - args = parser.parse_args() return args @@ -52,6 +47,15 @@ def check_tools(): deeptools_path = shutil.which("deeptools") if deeptools_path: logger.info('Found deeptools: %s', deeptools_path) + + # Get Version + deeptools_version_command = "deeptools --version" + deeptools_version = subprocess.check_output(deeptools_version_command, shell=True, stderr=subprocess.STDOUT) + + # Write to file + deeptools_file = open("version_deeptools.txt", "wb") + deeptools_file.write(deeptools_version) + deeptools_file.close() else: logger.error('Missing deeptools') raise Exception('Missing deeptools') @@ -77,10 +81,10 @@ def generate_read_summary(design): return mbs_filename -def check_correlation(mbs): +def check_spearman_correlation(mbs): '''Check Spearman pairwise correlation of samples based on read counts.''' - spearman_filename = 'heatmap_SpearmanCorr.png' + spearman_filename = 'heatmap_SpearmanCorr.pdf' spearman_params = "--corMethod spearman --skipZero" + \ " --plotTitle \"Spearman Correlation of Read Counts\"" + \ " --whatToPlot heatmap --colorMap RdYlBu --plotNumbers" @@ -96,12 +100,31 @@ def check_correlation(mbs): out, err = spearman_correlation.communicate() +def check_pearson_correlation(mbs): + '''Check Pearson pairwise correlation of samples based on read counts.''' + + pearson_filename = 'heatmap_PearsonCorr.pdf' + pearson_params = "--corMethod pearson --skipZero" + \ + " --plotTitle \"Pearson Correlation of Read Counts\"" + \ + " --whatToPlot heatmap --colorMap RdYlBu --plotNumbers" + + pearson_command = ( + "plotCorrelation -in %s %s -o %s" + % (mbs, pearson_params, pearson_filename) + ) + + logger.info("Running deeptools with %s", pearson_command) + + pearson_correlation = subprocess.Popen(pearson_command, shell=True) + out, err = pearson_correlation.communicate() + + def check_coverage(design): '''Asses the sequencing depth of samples.''' bam_files = ' '.join(design['bam_reads']) labels = ' '.join(design['sample_id']) - coverage_filename = 'coverage.png' + coverage_filename = 'coverage.pdf' coverage_params = "-n 1000000 --plotTitle \"Sample Coverage\"" + \ " --ignoreDuplicates --minMappingQuality 10" @@ -116,44 +139,9 @@ def check_coverage(design): out, err = coverage_summary.communicate() -def update_controls(design): - '''Update design file to append controls list.''' - - logger.info("Running control file update.") - - file_dict = design[['sample_id', 'bam_reads']] \ - .set_index('sample_id').T.to_dict() - - design['control_reads'] = design['control_id'] \ - .apply(lambda x: file_dict[x]['bam_reads']) - - logger.info("Removing rows that are there own control.") - - design = design[design['control_id'] != design['sample_id']] - - return design - - -def check_enrichment(sample_id, control_id, sample_reads, control_reads): - '''Asses the enrichment per sample.''' - - fingerprint_filename = sample_id + '_fingerprint.png' - - fingerprint_command = ( - "plotFingerprint -b %s %s --labels %s %s --plotFile %s" - % (sample_reads, control_reads, sample_id, control_id, fingerprint_filename) - ) - - logger.info("Running deeptools with %s", fingerprint_command) - - fingerprint_summary = subprocess.Popen(fingerprint_command, shell=True) - out, err = fingerprint_summary.communicate() - - def main(): args = get_args() design = args.design - atac = args.atac # Create a file handler handler = logging.FileHandler('experiment_qc.log') @@ -167,21 +155,11 @@ def main(): # Run correlation mbs_filename = generate_read_summary(design_df) - check_correlation(mbs_filename) + check_spearman_correlation(mbs_filename) + check_pearson_correlation(mbs_filename) # Run coverage check_coverage(design_df) - # Run enrichment - if not atac: - new_design_df = update_controls(design_df) - for index, row in new_design_df.iterrows(): - check_enrichment( - row['sample_id'], - row['control_id'], - row['bam_reads'], - row['control_reads']) - - if __name__ == '__main__': main() diff --git a/workflow/tests/test_experiment_qc.py b/workflow/tests/test_experiment_qc.py index fdf36ea..2cccf00 100644 --- a/workflow/tests/test_experiment_qc.py +++ b/workflow/tests/test_experiment_qc.py @@ -23,14 +23,49 @@ def design_bam(): return design_df -@pytest.mark.integration -def test_experiment_qc_singleend(): +@pytest.mark.singleend_human +def test_coverage_singleend_human(): assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz')) - assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.png')) - assert os.path.exists(os.path.join(test_output_path, 'coverage.png')) + assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf')) + + +@pytest.mark.singleend_human +def test_spearman_singleend_human(): + assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf')) + + +@pytest.mark.singleend_human +def test_pearson_singleend_human(): + assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf')) + + +@pytest.mark.singleend_human +def test_fingerprint_singleend_human(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB170KFO_fingerprint.pdf')) + assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX_fingerprint.pdf')) + + +@pytest.mark.pairedend_mouse +def test_coverage_pairedend_mouse(): + assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz')) + assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf')) + + +@pytest.mark.pairedend_mouse +def test_spearman_pairedend_mouse(): + assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf')) + + +@pytest.mark.pairedend_mouse +def test_pearson_pairedend_mouse(): + assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf')) + + +@pytest.mark.pairedend_mouse +def test_fingerprint_pairedend_mouse(): + assert os.path.exists(os.path.join(test_output_path, 'ENCLB749GLW_fingerprint.pdf')) + assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP_fingerprint.pdf')) + + -@pytest.mark.integration -def test_experiment_qc_pairedend(): - # Do the same thing for paired end data - pass -- GitLab