fix experiment qc

89c92593 · Holly Ruess · a14bd619 · 89c92593 · 89c92593 · 89c92593
Commit 89c92593 authored 5 years ago by Holly Ruess
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -12,6 +12,7 @@ user_configuration:
  stage: unit
  script:
  - pytest -m unit
+  - pytest -m unit --cov=./workflow/scripts

 single_end_human:
  stage: integration

--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file.
 ### Fixed
 - Removed biosample, factor, treatment from design file
 - Updated documentation
+ - Output graphics are in pdf format

 ### Added
 - Changelog

--- a/README.md
+++ b/README.md
@@ -48,6 +48,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
    2. Trim adaptors with TrimGalore!
    3. Map reads with BWA, filter with SamTools, and sort with Sambamba
    4. Mark duplicates with Sambamba, Filter reads with SamTools, and calculate library complexity with SamTools and bedtools
+    5. QC metrics with deep tools


 ## Output Files
@@ -63,7 +64,11 @@ filterReads | *.dedup.bam | filtered bam file with duplicate reads removed
 filterReads | *.dedup.bam.bai | indexed filtered bam file
 filterReads | *.dedup.flagstat.qc | QC metrics of filtered bam file (mapping stats, samtools)
 filterReads | *.dedup.pbc.qc | QC metrics of library complexity
-
+experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample
+experimentQC | *_fingerprint.pdf | plot to determine if the antibody-treatment enriched sufficiently
+experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples
+experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between samples
+experimentQC | sample_mbs.npz | array of multiple BAM summaries


 ## Common Errors

--- a/workflow/main.nf
+++ b/workflow/main.nf
@@ -251,28 +251,35 @@ process filterReads {
 // Define channel collecting dedup reads into new design file
 dedupReads
 .map{ sampleId, bam, bai, experimentId, replicate ->
-"$sampleId\t$bam\t$bai\t$experimentId\t$replicate\n"}
-.collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\treplicate\n", storeDir:"$baseDir/output/design")
+"${sampleId}\t${bam}\t${bai}\t${experimentId}\t${replicate}\n"}
+.collectFile(name: 'design_dedup.tsv', seed: "sample_id\tbam_reads\tbam_index\texperiment_id\treplicate\n", storeDir: "${outDir}/design")
 .into { dedupDesign; preDiffDesign }

 // Quality Metrics using deeptools
 process experimentQC {

-  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  publishDir "${outDir}/${task.process}", mode: 'copy'

  input:
-
-  file dedupDesign
+    file dedupDesign

  output:
-
-  file '*.{png,npz}' into deepToolsStats
+    file '*.{png,npz}' into deepToolsStats
+    file('version_*.txt') into experimentQCVersions

  script:
-
-  """
-  python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign -a
-  """
+    if (params.astrocyte == true) {
+      """
+      module load python/3.6.1-2-anaconda
+      module load deeptools/2.5.0.1
+      python3 ${baseDir}/scripts/experiment_qc.py -d ${dedupDesign}
+      """
+    }
+    else {
+      """
+      python3 ${baseDir}/scripts/experiment_qc.py -d ${dedupDesign}
+      """
+    }

 }


--- a/workflow/scripts/experiment_qc.py
+++ b/workflow/scripts/experiment_qc.py
@@ -35,11 +35,6 @@ def get_args():
                        help="The design file to run QC (tsv format).",
                        required=True)

-    parser.add_argument('-a', '--atac',
-                        help="True/False if ATAC-seq or ChIP-seq.",
-                        default=False,
-                        action='store_true')
-
    args = parser.parse_args()
    return args

@@ -52,6 +47,15 @@ def check_tools():
    deeptools_path = shutil.which("deeptools")
    if deeptools_path:
        logger.info('Found deeptools: %s', deeptools_path)
+
+        # Get Version
+        deeptools_version_command = "deeptools --version"
+        deeptools_version = subprocess.check_output(deeptools_version_command, shell=True, stderr=subprocess.STDOUT)
+
+        # Write to file
+        deeptools_file = open("version_deeptools.txt", "wb")
+        deeptools_file.write(deeptools_version)
+        deeptools_file.close()
    else:
        logger.error('Missing deeptools')
        raise Exception('Missing deeptools')
@@ -77,10 +81,10 @@ def generate_read_summary(design):
    return mbs_filename


-def check_correlation(mbs):
+def check_spearman_correlation(mbs):
    '''Check Spearman pairwise correlation of samples based on read counts.'''

-    spearman_filename = 'heatmap_SpearmanCorr.png'
+    spearman_filename = 'heatmap_SpearmanCorr.pdf'
    spearman_params = "--corMethod spearman --skipZero" + \
                    " --plotTitle \"Spearman Correlation of Read Counts\"" + \
                    " --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
@@ -96,12 +100,31 @@ def check_correlation(mbs):
    out, err = spearman_correlation.communicate()


+def check_pearson_correlation(mbs):
+    '''Check Pearson pairwise correlation of samples based on read counts.'''
+
+    pearson_filename = 'heatmap_PearsonCorr.pdf'
+    pearson_params = "--corMethod pearson --skipZero" + \
+                    " --plotTitle \"Pearson Correlation of Read Counts\"" + \
+                    " --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
+
+    pearson_command = (
+        "plotCorrelation -in %s %s -o %s"
+        % (mbs, pearson_params, pearson_filename)
+    )
+
+    logger.info("Running deeptools with %s", pearson_command)
+
+    pearson_correlation = subprocess.Popen(pearson_command, shell=True)
+    out, err = pearson_correlation.communicate()
+
+
 def check_coverage(design):
    '''Asses the sequencing depth of samples.'''

    bam_files = ' '.join(design['bam_reads'])
    labels = ' '.join(design['sample_id'])
-    coverage_filename = 'coverage.png'
+    coverage_filename = 'coverage.pdf'
    coverage_params = "-n 1000000 --plotTitle \"Sample Coverage\"" + \
                    " --ignoreDuplicates --minMappingQuality 10"

@@ -116,44 +139,9 @@ def check_coverage(design):
    out, err = coverage_summary.communicate()


-def update_controls(design):
-    '''Update design file to append controls list.'''
-
-    logger.info("Running control file update.")
-
-    file_dict = design[['sample_id', 'bam_reads']] \
-                .set_index('sample_id').T.to_dict()
-
-    design['control_reads'] = design['control_id'] \
-                                .apply(lambda x: file_dict[x]['bam_reads'])
-
-    logger.info("Removing rows that are there own control.")
-
-    design = design[design['control_id'] != design['sample_id']]
-
-    return design
-
-
-def check_enrichment(sample_id, control_id, sample_reads, control_reads):
-    '''Asses the enrichment per sample.'''
-
-    fingerprint_filename = sample_id + '_fingerprint.png'
-
-    fingerprint_command = (
-        "plotFingerprint -b %s %s --labels %s %s --plotFile %s"
-        % (sample_reads, control_reads, sample_id, control_id, fingerprint_filename)
-        )
-
-    logger.info("Running deeptools with %s", fingerprint_command)
-
-    fingerprint_summary = subprocess.Popen(fingerprint_command, shell=True)
-    out, err = fingerprint_summary.communicate()
-
-
 def main():
    args = get_args()
    design = args.design
-    atac = args.atac

    # Create a file handler
    handler = logging.FileHandler('experiment_qc.log')
@@ -167,21 +155,11 @@ def main():

    # Run correlation
    mbs_filename = generate_read_summary(design_df)
-    check_correlation(mbs_filename)
+    check_spearman_correlation(mbs_filename)
+    check_pearson_correlation(mbs_filename)

    # Run coverage
    check_coverage(design_df)

-    # Run enrichment
-    if not atac:
-        new_design_df = update_controls(design_df)
-        for index, row in new_design_df.iterrows():
-            check_enrichment(
-                row['sample_id'],
-                row['control_id'],
-                row['bam_reads'],
-                row['control_reads'])
-
-
 if __name__ == '__main__':
    main()
--- a/workflow/tests/test_experiment_qc.py
+++ b/workflow/tests/test_experiment_qc.py
@@ -23,14 +23,49 @@ def design_bam():
    return design_df


-@pytest.mark.integration
-def test_experiment_qc_singleend():
+@pytest.mark.singleend_human
+def test_coverage_singleend_human():
    assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz'))
-    assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.png'))
-    assert os.path.exists(os.path.join(test_output_path, 'coverage.png'))
+    assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf'))
+
+
+@pytest.mark.singleend_human
+def test_spearman_singleend_human():
+    assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf'))
+
+
+@pytest.mark.singleend_human
+def test_pearson_singleend_human():
+    assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf'))
+
+
+@pytest.mark.singleend_human
+def test_fingerprint_singleend_human():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB170KFO_fingerprint.pdf'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX_fingerprint.pdf'))
+
+
+@pytest.mark.pairedend_mouse
+def test_coverage_pairedend_mouse():
+    assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz'))
+    assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf'))
+
+
+@pytest.mark.pairedend_mouse
+def test_spearman_pairedend_mouse():
+    assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf'))
+
+
+@pytest.mark.pairedend_mouse
+def test_pearson_pairedend_mouse():
+    assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf'))
+
+
+@pytest.mark.pairedend_mouse
+def test_fingerprint_pairedend_mouse():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB749GLW_fingerprint.pdf'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP_fingerprint.pdf'))
+
+


-@pytest.mark.integration
-def test_experiment_qc_pairedend():
-    # Do the same thing for paired end data
-    pass