Skip to content
Snippets Groups Projects
Commit 2d2cd6a5 authored by Venkat Malladi's avatar Venkat Malladi
Browse files

Merge branch '17-FixExperimentQC' into 'master'

Resolve "Fix Experiment QC"

Closes #17

See merge request !10
parents a14bd619 ec9f9468
1 merge request!10Resolve "Fix Experiment QC"
Pipeline #5329 passed with stages
in 10 hours, 33 minutes, and 59 seconds
...@@ -12,6 +12,7 @@ user_configuration: ...@@ -12,6 +12,7 @@ user_configuration:
stage: unit stage: unit
script: script:
- pytest -m unit - pytest -m unit
- pytest -m unit --cov=./workflow/scripts
single_end_human: single_end_human:
stage: integration stage: integration
......
...@@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file. ...@@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file.
### Fixed ### Fixed
- Removed biosample, factor, treatment from design file - Removed biosample, factor, treatment from design file
- Updated documentation - Updated documentation
- Output graphics are in pdf format
### Added ### Added
- Changelog - Changelog
......
...@@ -48,6 +48,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git ...@@ -48,6 +48,7 @@ $ git clone git@git.biohpc.swmed.edu:BICF/Astrocyte/atacseq_analysis.git
2. Trim adaptors with TrimGalore! 2. Trim adaptors with TrimGalore!
3. Map reads with BWA, filter with SamTools, and sort with Sambamba 3. Map reads with BWA, filter with SamTools, and sort with Sambamba
4. Mark duplicates with Sambamba, Filter reads with SamTools, and calculate library complexity with SamTools and bedtools 4. Mark duplicates with Sambamba, Filter reads with SamTools, and calculate library complexity with SamTools and bedtools
5. QC metrics with deep tools
## Output Files ## Output Files
...@@ -63,7 +64,11 @@ filterReads | *.dedup.bam | filtered bam file with duplicate reads removed ...@@ -63,7 +64,11 @@ filterReads | *.dedup.bam | filtered bam file with duplicate reads removed
filterReads | *.dedup.bam.bai | indexed filtered bam file filterReads | *.dedup.bam.bai | indexed filtered bam file
filterReads | *.dedup.flagstat.qc | QC metrics of filtered bam file (mapping stats, samtools) filterReads | *.dedup.flagstat.qc | QC metrics of filtered bam file (mapping stats, samtools)
filterReads | *.dedup.pbc.qc | QC metrics of library complexity filterReads | *.dedup.pbc.qc | QC metrics of library complexity
experimentQC | coverage.pdf | plot to assess the sequencing depth of a given sample
experimentQC | *_fingerprint.pdf | plot to determine if the antibody-treatment enriched sufficiently
experimentQC | heatmeap_SpearmanCorr.pdf | plot of Spearman correlation between samples
experimentQC | heatmeap_PearsonCorr.pdf | plot of Pearson correlation between samples
experimentQC | sample_mbs.npz | array of multiple BAM summaries
## Common Errors ## Common Errors
......
...@@ -251,28 +251,35 @@ process filterReads { ...@@ -251,28 +251,35 @@ process filterReads {
// Define channel collecting dedup reads into new design file // Define channel collecting dedup reads into new design file
dedupReads dedupReads
.map{ sampleId, bam, bai, experimentId, replicate -> .map{ sampleId, bam, bai, experimentId, replicate ->
"$sampleId\t$bam\t$bai\t$experimentId\t$replicate\n"} "${sampleId}\t${bam}\t${bai}\t${experimentId}\t${replicate}\n"}
.collectFile(name:'design_dedup.tsv', seed:"sample_id\tbam_reads\tbam_index\texperiment_id\treplicate\n", storeDir:"$baseDir/output/design") .collectFile(name: 'design_dedup.tsv', seed: "sample_id\tbam_reads\tbam_index\texperiment_id\treplicate\n", storeDir: "${outDir}/design")
.into { dedupDesign; preDiffDesign } .into { dedupDesign; preDiffDesign }
// Quality Metrics using deeptools // Quality Metrics using deeptools
process experimentQC { process experimentQC {
publishDir "$baseDir/output/${task.process}", mode: 'copy' publishDir "${outDir}/${task.process}", mode: 'copy'
input: input:
file dedupDesign
file dedupDesign
output: output:
file '*.{pdf,npz}' into deepToolsStats
file '*.{png,npz}' into deepToolsStats file('version_*.txt') into experimentQCVersions
script: script:
if (params.astrocyte == true) {
""" """
python3 $baseDir/scripts/experiment_qc.py -d $dedupDesign -a module load python/3.6.1-2-anaconda
""" module load deeptools/2.5.0.1
python3 ${baseDir}/scripts/experiment_qc.py -d ${dedupDesign}
"""
}
else {
"""
python3 ${baseDir}/scripts/experiment_qc.py -d ${dedupDesign}
"""
}
} }
......
...@@ -35,11 +35,6 @@ def get_args(): ...@@ -35,11 +35,6 @@ def get_args():
help="The design file to run QC (tsv format).", help="The design file to run QC (tsv format).",
required=True) required=True)
parser.add_argument('-a', '--atac',
help="True/False if ATAC-seq or ChIP-seq.",
default=False,
action='store_true')
args = parser.parse_args() args = parser.parse_args()
return args return args
...@@ -52,6 +47,15 @@ def check_tools(): ...@@ -52,6 +47,15 @@ def check_tools():
deeptools_path = shutil.which("deeptools") deeptools_path = shutil.which("deeptools")
if deeptools_path: if deeptools_path:
logger.info('Found deeptools: %s', deeptools_path) logger.info('Found deeptools: %s', deeptools_path)
# Get Version
deeptools_version_command = "deeptools --version"
deeptools_version = subprocess.check_output(deeptools_version_command, shell=True, stderr=subprocess.STDOUT)
# Write to file
deeptools_file = open("version_deeptools.txt", "wb")
deeptools_file.write(deeptools_version)
deeptools_file.close()
else: else:
logger.error('Missing deeptools') logger.error('Missing deeptools')
raise Exception('Missing deeptools') raise Exception('Missing deeptools')
...@@ -77,10 +81,10 @@ def generate_read_summary(design): ...@@ -77,10 +81,10 @@ def generate_read_summary(design):
return mbs_filename return mbs_filename
def check_correlation(mbs): def check_spearman_correlation(mbs):
'''Check Spearman pairwise correlation of samples based on read counts.''' '''Check Spearman pairwise correlation of samples based on read counts.'''
spearman_filename = 'heatmap_SpearmanCorr.png' spearman_filename = 'heatmap_SpearmanCorr.pdf'
spearman_params = "--corMethod spearman --skipZero" + \ spearman_params = "--corMethod spearman --skipZero" + \
" --plotTitle \"Spearman Correlation of Read Counts\"" + \ " --plotTitle \"Spearman Correlation of Read Counts\"" + \
" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers" " --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
...@@ -96,12 +100,31 @@ def check_correlation(mbs): ...@@ -96,12 +100,31 @@ def check_correlation(mbs):
out, err = spearman_correlation.communicate() out, err = spearman_correlation.communicate()
def check_pearson_correlation(mbs):
'''Check Pearson pairwise correlation of samples based on read counts.'''
pearson_filename = 'heatmap_PearsonCorr.pdf'
pearson_params = "--corMethod pearson --skipZero" + \
" --plotTitle \"Pearson Correlation of Read Counts\"" + \
" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
pearson_command = (
"plotCorrelation -in %s %s -o %s"
% (mbs, pearson_params, pearson_filename)
)
logger.info("Running deeptools with %s", pearson_command)
pearson_correlation = subprocess.Popen(pearson_command, shell=True)
out, err = pearson_correlation.communicate()
def check_coverage(design): def check_coverage(design):
'''Asses the sequencing depth of samples.''' '''Asses the sequencing depth of samples.'''
bam_files = ' '.join(design['bam_reads']) bam_files = ' '.join(design['bam_reads'])
labels = ' '.join(design['sample_id']) labels = ' '.join(design['sample_id'])
coverage_filename = 'coverage.png' coverage_filename = 'coverage.pdf'
coverage_params = "-n 1000000 --plotTitle \"Sample Coverage\"" + \ coverage_params = "-n 1000000 --plotTitle \"Sample Coverage\"" + \
" --ignoreDuplicates --minMappingQuality 10" " --ignoreDuplicates --minMappingQuality 10"
...@@ -116,44 +139,9 @@ def check_coverage(design): ...@@ -116,44 +139,9 @@ def check_coverage(design):
out, err = coverage_summary.communicate() out, err = coverage_summary.communicate()
def update_controls(design):
'''Update design file to append controls list.'''
logger.info("Running control file update.")
file_dict = design[['sample_id', 'bam_reads']] \
.set_index('sample_id').T.to_dict()
design['control_reads'] = design['control_id'] \
.apply(lambda x: file_dict[x]['bam_reads'])
logger.info("Removing rows that are there own control.")
design = design[design['control_id'] != design['sample_id']]
return design
def check_enrichment(sample_id, control_id, sample_reads, control_reads):
'''Asses the enrichment per sample.'''
fingerprint_filename = sample_id + '_fingerprint.png'
fingerprint_command = (
"plotFingerprint -b %s %s --labels %s %s --plotFile %s"
% (sample_reads, control_reads, sample_id, control_id, fingerprint_filename)
)
logger.info("Running deeptools with %s", fingerprint_command)
fingerprint_summary = subprocess.Popen(fingerprint_command, shell=True)
out, err = fingerprint_summary.communicate()
def main(): def main():
args = get_args() args = get_args()
design = args.design design = args.design
atac = args.atac
# Create a file handler # Create a file handler
handler = logging.FileHandler('experiment_qc.log') handler = logging.FileHandler('experiment_qc.log')
...@@ -167,21 +155,11 @@ def main(): ...@@ -167,21 +155,11 @@ def main():
# Run correlation # Run correlation
mbs_filename = generate_read_summary(design_df) mbs_filename = generate_read_summary(design_df)
check_correlation(mbs_filename) check_spearman_correlation(mbs_filename)
check_pearson_correlation(mbs_filename)
# Run coverage # Run coverage
check_coverage(design_df) check_coverage(design_df)
# Run enrichment
if not atac:
new_design_df = update_controls(design_df)
for index, row in new_design_df.iterrows():
check_enrichment(
row['sample_id'],
row['control_id'],
row['bam_reads'],
row['control_reads'])
if __name__ == '__main__': if __name__ == '__main__':
main() main()
...@@ -23,14 +23,18 @@ def design_bam(): ...@@ -23,14 +23,18 @@ def design_bam():
return design_df return design_df
@pytest.mark.integration @pytest.mark.singleend_human
def test_experiment_qc_singleend(): def test_coverage_singleend_human():
assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz')) assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz'))
assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.png')) assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf'))
assert os.path.exists(os.path.join(test_output_path, 'coverage.png')) assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf'))
assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf'))
@pytest.mark.integration @pytest.mark.pairedend_mouse
def test_experiment_qc_pairedend(): def test_coverage_pairedend_mouse():
# Do the same thing for paired end data assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz'))
pass assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf'))
assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf'))
assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf'))
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment