Commit 89c92593 authored by Holly Ruess

fix experiment qc

parent a14bd619
1 merge request: !10 Resolve "Fix Experiment QC"
Pipeline #5290 failed in 3 hours, 1 minute, and 37 seconds
@@ -12,6 +12,7 @@ user_configuration:
   stage: unit
   script:
     - pytest -m unit
+    - pytest -m unit --cov=./workflow/scripts
 single_end_human:
   stage: integration
...
@@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file.
 ### Fixed
 - Removed biosample, factor, treatment from design file
 - Updated documentation
+- Output graphics are in pdf format

 ### Added
 - Changelog
...
@@ -48,6 +48,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
 2. Trim adaptors with TrimGalore!
 3. Map reads with BWA, filter with SamTools, and sort with Sambamba
 4. Mark duplicates with Sambamba, Filter reads with SamTools, and calculate library complexity with SamTools and bedtools
+5. QC metrics with deepTools

 ## Output Files
@@ -63,7 +64,11 @@ filterReads | *.dedup.bam | filtered bam file with duplicate reads removed
 filterReads | *.dedup.bam.bai | indexed filtered bam file
 filterReads | *.dedup.flagstat.qc | QC metrics of filtered bam file (mapping stats, samtools)
 filterReads | *.dedup.pbc.qc | QC metrics of library complexity
+experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample
+experimentQC | *_fingerprint.pdf | plot to determine if the antibody treatment enriched sufficiently
+experimentQC | heatmap_SpearmanCorr.pdf | plot of Spearman correlation between samples
+experimentQC | heatmap_PearsonCorr.pdf | plot of Pearson correlation between samples
+experimentQC | sample_mbs.npz | array of multiple BAM summaries

 ## Common Errors
...
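The experimentQC rows above add sample_mbs.npz next to the PDF plots. A minimal sketch (not part of this commit) for inspecting that archive, assuming it is the deepTools multiBamSummary output and that it stores 'matrix' and 'labels' arrays:

import numpy as np

# Hypothetical path; the pipeline publishes this file from the experimentQC process.
summary = np.load('sample_mbs.npz', allow_pickle=True)  # assumption: multiBamSummary archive
counts = summary['matrix']   # per-bin read counts, one column per sample (assumed key)
labels = summary['labels']   # sample identifiers matching the design file (assumed key)
print(labels, counts.shape)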
@@ -251,28 +251,35 @@ process filterReads {
 // Define channel collecting dedup reads into new design file
 dedupReads
   .map{ sampleId, bam, bai, experimentId, replicate ->
-    "$sampleId\t$bam\t$bai\t$experimentId\t$replicate\n"}
-  .collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\treplicate\n", storeDir:"$baseDir/output/design")
+    "${sampleId}\t${bam}\t${bai}\t${experimentId}\t${replicate}\n"}
+  .collectFile(name: 'design_dedup.tsv', seed: "sample_id\tbam_reads\tbam_index\texperiment_id\treplicate\n", storeDir: "${outDir}/design")
   .into { dedupDesign; preDiffDesign }

 // Quality Metrics using deeptools
 process experimentQC {

-  publishDir "$baseDir/output/${task.process}", mode: 'copy'
+  publishDir "${outDir}/${task.process}", mode: 'copy'

   input:
     file dedupDesign

   output:
     file '*.{png,npz}' into deepToolsStats
+    file('version_*.txt') into experimentQCVersions

   script:
+    if (params.astrocyte == true) {
     """
-    python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign -a
+    module load python/3.6.1-2-anaconda
+    module load deeptools/2.5.0.1
+    python3 ${baseDir}/scripts/experiment_qc.py -d ${dedupDesign}
     """
+    }
+    else {
+    """
+    python3 ${baseDir}/scripts/experiment_qc.py -d ${dedupDesign}
+    """
+    }
 }
...
@@ -35,11 +35,6 @@ def get_args():
                         help="The design file to run QC (tsv format).",
                         required=True)

-    parser.add_argument('-a', '--atac',
-                        help="True/False if ATAC-seq or ChIP-seq.",
-                        default=False,
-                        action='store_true')
-
     args = parser.parse_args()
     return args
@@ -52,6 +47,15 @@ def check_tools():
     deeptools_path = shutil.which("deeptools")
     if deeptools_path:
         logger.info('Found deeptools: %s', deeptools_path)
+
+        # Get Version
+        deeptools_version_command = "deeptools --version"
+        deeptools_version = subprocess.check_output(deeptools_version_command, shell=True, stderr=subprocess.STDOUT)
+
+        # Write to file
+        deeptools_file = open("version_deeptools.txt", "wb")
+        deeptools_file.write(deeptools_version)
+        deeptools_file.close()
     else:
         logger.error('Missing deeptools')
         raise Exception('Missing deeptools')
@@ -77,10 +81,10 @@ def generate_read_summary(design):
     return mbs_filename

-def check_correlation(mbs):
+def check_spearman_correlation(mbs):
     '''Check Spearman pairwise correlation of samples based on read counts.'''

-    spearman_filename = 'heatmap_SpearmanCorr.png'
+    spearman_filename = 'heatmap_SpearmanCorr.pdf'
     spearman_params = "--corMethod spearman --skipZero" + \
                       " --plotTitle \"Spearman Correlation of Read Counts\"" + \
                       " --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
@@ -96,12 +100,31 @@ def check_correlation(mbs):
     out, err = spearman_correlation.communicate()

+
+def check_pearson_correlation(mbs):
+    '''Check Pearson pairwise correlation of samples based on read counts.'''
+
+    pearson_filename = 'heatmap_PearsonCorr.pdf'
+    pearson_params = "--corMethod pearson --skipZero" + \
+                     " --plotTitle \"Pearson Correlation of Read Counts\"" + \
+                     " --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
+
+    pearson_command = (
+        "plotCorrelation -in %s %s -o %s"
+        % (mbs, pearson_params, pearson_filename)
+    )
+
+    logger.info("Running deeptools with %s", pearson_command)
+    pearson_correlation = subprocess.Popen(pearson_command, shell=True)
+    out, err = pearson_correlation.communicate()
+
+
 def check_coverage(design):
     '''Asses the sequencing depth of samples.'''

     bam_files = ' '.join(design['bam_reads'])
     labels = ' '.join(design['sample_id'])
-    coverage_filename = 'coverage.png'
+    coverage_filename = 'coverage.pdf'
     coverage_params = "-n 1000000 --plotTitle \"Sample Coverage\"" + \
                       " --ignoreDuplicates --minMappingQuality 10"
@@ -116,44 +139,9 @@ def check_coverage(design):
     out, err = coverage_summary.communicate()

-
-def update_controls(design):
-    '''Update design file to append controls list.'''
-
-    logger.info("Running control file update.")
-
-    file_dict = design[['sample_id', 'bam_reads']] \
-                .set_index('sample_id').T.to_dict()
-
-    design['control_reads'] = design['control_id'] \
-                                .apply(lambda x: file_dict[x]['bam_reads'])
-
-    logger.info("Removing rows that are there own control.")
-
-    design = design[design['control_id'] != design['sample_id']]
-
-    return design
-
-
-def check_enrichment(sample_id, control_id, sample_reads, control_reads):
-    '''Asses the enrichment per sample.'''
-
-    fingerprint_filename = sample_id + '_fingerprint.png'
-
-    fingerprint_command = (
-        "plotFingerprint -b %s %s --labels %s %s --plotFile %s"
-        % (sample_reads, control_reads, sample_id, control_id, fingerprint_filename)
-    )
-
-    logger.info("Running deeptools with %s", fingerprint_command)
-    fingerprint_summary = subprocess.Popen(fingerprint_command, shell=True)
-    out, err = fingerprint_summary.communicate()
-

 def main():
     args = get_args()
     design = args.design
-    atac = args.atac

     # Create a file handler
     handler = logging.FileHandler('experiment_qc.log')
@@ -167,21 +155,11 @@ def main():
     # Run correlation
     mbs_filename = generate_read_summary(design_df)
-    check_correlation(mbs_filename)
+    check_spearman_correlation(mbs_filename)
+    check_pearson_correlation(mbs_filename)

     # Run coverage
     check_coverage(design_df)

-    # Run enrichment
-    if not atac:
-        new_design_df = update_controls(design_df)
-        for index, row in new_design_df.iterrows():
-            check_enrichment(
-                row['sample_id'],
-                row['control_id'],
-                row['bam_reads'],
-                row['control_reads'])
-

 if __name__ == '__main__':
     main()
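With the -a/--atac flag gone, the script takes only the design file and runs four QC steps. A minimal sketch (not part of this commit) of calling those functions directly on a dedup design file, mirroring the call order in the new main(); the file name, import path, and pandas read are assumptions, and the deeptools command-line tools must be on PATH:

import pandas as pd

import experiment_qc  # assumes workflow/scripts is importable

# Columns follow the seed written by the Nextflow collectFile step:
# sample_id, bam_reads, bam_index, experiment_id, replicate.
design_df = pd.read_csv('design_dedup.tsv', sep='\t')

mbs = experiment_qc.generate_read_summary(design_df)  # sample_mbs.npz
experiment_qc.check_spearman_correlation(mbs)         # heatmap_SpearmanCorr.pdf
experiment_qc.check_pearson_correlation(mbs)          # heatmap_PearsonCorr.pdf
experiment_qc.check_coverage(design_df)               # coverage.pdf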
@@ -23,14 +23,49 @@ def design_bam():
     return design_df

-@pytest.mark.integration
-def test_experiment_qc_singleend():
+@pytest.mark.singleend_human
+def test_coverage_singleend_human():
     assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz'))
-    assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.png'))
-    assert os.path.exists(os.path.join(test_output_path, 'coverage.png'))
+    assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf'))
+
+
+@pytest.mark.singleend_human
+def test_spearman_singleend_human():
+    assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf'))
+
+
+@pytest.mark.singleend_human
+def test_pearson_singleend_human():
+    assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf'))
+
+
+@pytest.mark.singleend_human
+def test_fingerprint_singleend_human():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB170KFO_fingerprint.pdf'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB969KTX_fingerprint.pdf'))
+
+
+@pytest.mark.pairedend_mouse
+def test_coverage_pairedend_mouse():
+    assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz'))
+    assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf'))
+
+
+@pytest.mark.pairedend_mouse
+def test_spearman_pairedend_mouse():
+    assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf'))
+
+
+@pytest.mark.pairedend_mouse
+def test_pearson_pairedend_mouse():
+    assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf'))
+
+
+@pytest.mark.pairedend_mouse
+def test_fingerprint_pairedend_mouse():
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB749GLW_fingerprint.pdf'))
+    assert os.path.exists(os.path.join(test_output_path, 'ENCLB122XDP_fingerprint.pdf'))
-
-@pytest.mark.integration
-def test_experiment_qc_pairedend():
-    # Do the same thing for paired end data
-    pass
...