Skip to content
Snippets Groups Projects
Commit 89c92593 authored by Holly Ruess's avatar Holly Ruess
Browse files

fix experiment qc

parent a14bd619
Branches
Tags
1 merge request!10Resolve "Fix Experiment QC"
Pipeline #5290 failed with stages
in 3 hours, 1 minute, and 37 seconds
......@@ -12,6 +12,7 @@ user_configuration:
stage: unit
script:
- pytest -m unit
- pytest -m unit --cov=./workflow/scripts
single_end_human:
stage: integration
......
......@@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file.
### Fixed
- Removed biosample, factor, treatment from design file
- Updated documentation
- Output graphics are in pdf format
### Added
- Changelog
......
......@@ -48,6 +48,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
2. Trim adaptors with TrimGalore!
3. Map reads with BWA, filter with SamTools, and sort with Sambamba
4. Mark duplicates with Sambamba, Filter reads with SamTools, and calculate library complexity with SamTools and bedtools
5. QC metrics with deep tools
## Output Files
......@@ -63,7 +64,11 @@ filterReads | *.dedup.bam | filtered bam file with duplicate reads removed
filterReads | *.dedup.bam.bai | indexed filtered bam file
filterReads | *.dedup.flagstat.qc | QC metrics of filtered bam file (mapping stats, samtools)
filterReads | *.dedup.pbc.qc | QC metrics of library complexity
experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample
experimentQC | *_fingerprint.pdf | plot to determine if the antibody-treatment enriched sufficiently
experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples
experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between samples
experimentQC | sample_mbs.npz | array of multiple BAM summaries
## Common Errors
......
......@@ -251,28 +251,35 @@ process filterReads {
// Define channel collecting dedup reads into new design file
dedupReads
.map{ sampleId, bam, bai, experimentId, replicate ->
"$sampleId\t$bam\t$bai\t$experimentId\t$replicate\n"}
.collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\treplicate\n", storeDir:"$baseDir/output/design")
"${sampleId}\t${bam}\t${bai}\t${experimentId}\t${replicate}\n"}
.collectFile(name: 'design_dedup.tsv', seed: "sample_id\tbam_reads\tbam_index\texperiment_id\treplicate\n", storeDir: "${outDir}/design")
.into { dedupDesign; preDiffDesign }
// Quality Metrics using deeptools
process experimentQC {
publishDir "$baseDir/output/${task.process}", mode: 'copy'
publishDir "${outDir}/${task.process}", mode: 'copy'
input:
file dedupDesign
file dedupDesign
output:
file '*.{png,npz}' into deepToolsStats
file '*.{png,npz}' into deepToolsStats
file('version_*.txt') into experimentQCVersions
script:
"""
python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign -a
"""
if (params.astrocyte == true) {
"""
module load python/3.6.1-2-anaconda
module load deeptools/2.5.0.1
python3 ${baseDir}/scripts/experiment_qc.py -d ${dedupDesign}
"""
}
else {
"""
python3 ${baseDir}/scripts/experiment_qc.py -d ${dedupDesign}
"""
}
}
......
......@@ -35,11 +35,6 @@ def get_args():
help="The design file to run QC (tsv format).",
required=True)
parser.add_argument('-a', '--atac',
help="True/False if ATAC-seq or ChIP-seq.",
default=False,
action='store_true')
args = parser.parse_args()
return args
......@@ -52,6 +47,15 @@ def check_tools():
deeptools_path = shutil.which("deeptools")
if deeptools_path:
logger.info('Found deeptools: %s', deeptools_path)
# Get Version
deeptools_version_command = "deeptools --version"
deeptools_version = subprocess.check_output(deeptools_version_command, shell=True, stderr=subprocess.STDOUT)
# Write to file
deeptools_file = open("version_deeptools.txt", "wb")
deeptools_file.write(deeptools_version)
deeptools_file.close()
else:
logger.error('Missing deeptools')
raise Exception('Missing deeptools')
......@@ -77,10 +81,10 @@ def generate_read_summary(design):
return mbs_filename
def check_correlation(mbs):
def check_spearman_correlation(mbs):
'''Check Spearman pairwise correlation of samples based on read counts.'''
spearman_filename = 'heatmap_SpearmanCorr.png'
spearman_filename = 'heatmap_SpearmanCorr.pdf'
spearman_params = "--corMethod spearman --skipZero" + \
" --plotTitle \"Spearman Correlation of Read Counts\"" + \
" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
......@@ -96,12 +100,31 @@ def check_correlation(mbs):
out, err = spearman_correlation.communicate()
def check_pearson_correlation(mbs):
'''Check Pearson pairwise correlation of samples based on read counts.'''
pearson_filename = 'heatmap_PearsonCorr.pdf'
pearson_params = "--corMethod pearson --skipZero" + \
" --plotTitle \"Pearson Correlation of Read Counts\"" + \
" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
pearson_command = (
"plotCorrelation -in %s %s -o %s"
% (mbs, pearson_params, pearson_filename)
)
logger.info("Running deeptools with %s", pearson_command)
pearson_correlation = subprocess.Popen(pearson_command, shell=True)
out, err = pearson_correlation.communicate()
def check_coverage(design):
'''Asses the sequencing depth of samples.'''
bam_files = ' '.join(design['bam_reads'])
labels = ' '.join(design['sample_id'])
coverage_filename = 'coverage.png'
coverage_filename = 'coverage.pdf'
coverage_params = "-n 1000000 --plotTitle \"Sample Coverage\"" + \
" --ignoreDuplicates --minMappingQuality 10"
......@@ -116,44 +139,9 @@ def check_coverage(design):
out, err = coverage_summary.communicate()
def update_controls(design):
'''Update design file to append controls list.'''
logger.info("Running control file update.")
file_dict = design[['sample_id', 'bam_reads']] \
.set_index('sample_id').T.to_dict()
design['control_reads'] = design['control_id'] \
.apply(lambda x: file_dict[x]['bam_reads'])
logger.info("Removing rows that are there own control.")
design = design[design['control_id'] != design['sample_id']]
return design
def check_enrichment(sample_id, control_id, sample_reads, control_reads):
'''Asses the enrichment per sample.'''
fingerprint_filename = sample_id + '_fingerprint.png'
fingerprint_command = (
"plotFingerprint -b %s %s --labels %s %s --plotFile %s"
% (sample_reads, control_reads, sample_id, control_id, fingerprint_filename)
)
logger.info("Running deeptools with %s", fingerprint_command)
fingerprint_summary = subprocess.Popen(fingerprint_command, shell=True)
out, err = fingerprint_summary.communicate()
def main():
args = get_args()
design = args.design
atac = args.atac
# Create a file handler
handler = logging.FileHandler('experiment_qc.log')
......@@ -167,21 +155,11 @@ def main():
# Run correlation
mbs_filename = generate_read_summary(design_df)
check_correlation(mbs_filename)
check_spearman_correlation(mbs_filename)
check_pearson_correlation(mbs_filename)
# Run coverage
check_coverage(design_df)
# Run enrichment
if not atac:
new_design_df = update_controls(design_df)
for index, row in new_design_df.iterrows():
check_enrichment(
row['sample_id'],
row['control_id'],
row['bam_reads'],
row['control_reads'])
if __name__ == '__main__':
main()
......@@ -23,14 +23,49 @@ def design_bam():
return design_df
@pytest.mark.integration
def test_experiment_qc_singleend():
@pytest.mark.singleend_human
def test_coverage_singleend_human():
assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz'))
assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.png'))
assert os.path.exists(os.path.join(test_output_path, 'coverage.png'))
assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf'))
@pytest.mark.singleend_human
def test_spearman_singleend_human():
assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf'))
@pytest.mark.singleend_human
def test_pearson_singleend_human():
assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf'))
@pytest.mark.singleend_human
def test_fingerprint_singleend_human():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB170KFO_fingerprint.pdf'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX_fingerprint.pdf'))
@pytest.mark.pairedend_mouse
def test_coverage_pairedend_mouse():
assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz'))
assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf'))
@pytest.mark.pairedend_mouse
def test_spearman_pairedend_mouse():
assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf'))
@pytest.mark.pairedend_mouse
def test_pearson_pairedend_mouse():
assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf'))
@pytest.mark.pairedend_mouse
def test_fingerprint_pairedend_mouse():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB749GLW_fingerprint.pdf'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP_fingerprint.pdf'))
@pytest.mark.integration
def test_experiment_qc_pairedend():
# Do the same thing for paired end data
pass
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment