Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
Showing with 755 additions and 260 deletions
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''Make YAML of software versions.'''
from __future__ import print_function
from collections import OrderedDict
import re
import os
import logging
import glob
import argparse
import numpy as np
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
SOFTWARE_REGEX = {
    'Nextflow': ['version_nextflow.txt', r"(\S+)"],
    'Trim Galore!': ['trimReads_vf/version_trimgalore.txt', r"version (\S+)"],
    'Cutadapt': ['trimReads_vf/version_cutadapt.txt', r"Version (\S+)"],
    'BWA': ['alignReads_vf/version_bwa.txt', r"Version: (\S+)"],
    'Samtools': ['alignReads_vf/version_samtools.txt', r"samtools (\S+)"],
    'Sambamba': ['filterReads_vf/version_sambamba.txt', r"sambamba (\S+)"],
    'BEDTools': ['convertReads_vf/version_bedtools.txt', r"bedtools v(\S+)"],
    'R': ['crossReads_vf/version_r.txt', r"R version (\S+)"],
    'SPP': ['crossReads_vf/version_spp.txt', r"\[1\] ‘(1.14)’"],
    'MACS2': ['callPeaksMACS_vf/version_macs.txt', r"macs2 (\S+)"],
    'bedGraphToBigWig': ['callPeaksMACS_vf/version_bedGraphToBigWig.txt', r"bedGraphToBigWig v (\S+)"],
    'ChIPseeker': ['peakAnnotation_vf/version_ChIPseeker.txt', r"Version (\S+)\""],
    'MEME-ChIP': ['motifSearch_vf/version_memechip.txt', r"Version (\S+)"],
    'DiffBind': ['diffPeaks_vf/version_DiffBind.txt', r"Version (\S+)\""],
    'deepTools': ['experimentQC_vf/version_deeptools.txt', r"deeptools (\S+)"],
    'MultiQC': ['version_multiqc.txt', r"multiqc, version (\S+)"],
}
def get_args():
    '''Define arguments.'''
    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-o', '--output',
                        help="The out file name.",
                        required=True)
    parser.add_argument('-t', '--test',
                        help='Used for testing purposes',
                        default=False,
                        action='store_true')
    args = parser.parse_args()
    return args


def check_files(files, test):
    '''Check if version files are found.'''
    logger.info("Running file check.")
    software_files = np.array(list(SOFTWARE_REGEX.values()))[:, 0]
    extra_files = set(files) - set(software_files)
    if len(extra_files) > 0 and test:
        logger.error('Missing regex: %s', list(extra_files))
        raise Exception("Missing regex: %s" % list(extra_files))
def main():
    args = get_args()
    output = args.output
    test = args.test
    out_filename = output + '_mqc.yaml'

    # Default every tool to "Not Run" until its version file is found
    results = OrderedDict()
    for tool in SOFTWARE_REGEX:
        results[tool] = '<span style="color:#999999;">Not Run</span>'

    # List all version files
    files = glob.glob('**/*.txt', recursive=True)

    # Check for version files
    check_files(files, test)

    # Search each file using its regex
    for tool, (version_file, regex) in SOFTWARE_REGEX.items():
        if os.path.isfile(version_file):
            with open(version_file) as handle:
                versions = handle.read()
            match = re.search(regex, versions)
            if match:
                results[tool] = "v{}".format(match.group(1))

    # Dump to YAML, writing through one handle instead of reopening the file per line
    with open(out_filename, "w") as yaml_file:
        print(
            '''
id: 'Software Versions'
section_name: 'Software Versions'
section_href: 'https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/'
plot_type: 'html'
description: 'are collected at run time from the software output.'
data: |
    <dl class="dl-horizontal">
''', file=yaml_file)
        for tool, version in results.items():
            print("        <dt>{}</dt><dd>{}</dd>".format(tool, version), file=yaml_file)
        print("    </dl>", file=yaml_file)


if __name__ == '__main__':
    main()
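For reference, each entry in SOFTWARE_REGEX pairs a version file with a pattern, and main() applies them exactly as in this minimal sketch; the file contents below are a hypothetical BWA banner, not pipeline output.

import re

versions = "Program: bwa\nVersion: 0.7.12-r1039\n"  # made-up file contents
match = re.search(r"Version: (\S+)", versions)
if match:
    print("v{}".format(match.group(1)))  # prints: v0.7.12-r1039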
......@@ -62,6 +62,15 @@ def check_tools():
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
......@@ -69,6 +78,18 @@ def check_tools():
sambamba_path = shutil.which("sambamba")
if sambamba_path:
logger.info('Found sambamba: %s', sambamba_path)
# Get Version
sambamba_version_command = "sambamba"
try:
subprocess.check_output(sambamba_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
sambamba_version = e.output
# Write to file
sambamba_file = open("version_sambamba.txt", "wb")
sambamba_file.write(sambamba_version)
sambamba_file.close()
else:
logger.error('Missing sambamba')
raise Exception('Missing sambamba')
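The version-capture blocks added above all repeat one pattern. A hypothetical helper covering both cases — tools that print a version on stdout, and tools like sambamba and bwa that print usage to stderr and exit nonzero — might look like this; a sketch, not part of the pipeline:

import subprocess

def capture_version(command, outfile):
    '''Run a version command and save its output, tolerating tools
    that print to stderr and exit nonzero (e.g. sambamba, bwa).'''
    try:
        output = subprocess.check_output(command, shell=True,
                                         stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as error:
        output = error.output
    with open(outfile, "wb") as handle:
        handle.write(output)

# e.g. capture_version("samtools --version", "version_samtools.txt")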
......@@ -76,6 +97,15 @@ def check_tools():
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
......@@ -106,7 +136,7 @@ def filter_mapped_pe(bam, bam_basename):
# Will produce name sorted BAM
"samtools sort -n -@ %d -o %s" % (cpu_count(), tmp_filt_bam_filename)])
if err:
logger.error("samtools filter error: %s" % (err))
logger.error("samtools filter error: %s", err)
# Remove orphan reads (pair was removed)
# and read pairs mapping to different chromosomes
......@@ -136,7 +166,7 @@ def filter_mapped_se(bam, bam_basename):
# not primary alignment, reads failing platform
# Remove low MAPQ reads
# Obtain name sorted BAM file
with open(filt_bam_filename, 'w') as fh:
with open(filt_bam_filename, 'w') as temp_file:
samtools_filter_command = (
"samtools view -F 1804 -q 30 -b %s"
% (bam)
......@@ -144,7 +174,7 @@ def filter_mapped_se(bam, bam_basename):
logger.info(samtools_filter_command)
subprocess.check_call(
shlex.split(samtools_filter_command),
stdout=fh)
stdout=temp_file)
return filt_bam_filename
......@@ -153,11 +183,11 @@ def dedup_mapped(bam, bam_basename, paired):
'''Use sambamba and samtools to remove duplicates.'''
# Markduplicates
dup_file_qc_filename = bam_basename + ".dup.qc"
dup_file_qc_filename = bam_basename + ".dedup.qc"
tmp_dup_mark_filename = bam_basename + ".dupmark.bam"
sambamba_params = "--hash-table-size=17592186044416" + \
" --overflow-list-size=20000000 --io-buffer-size=256"
with open(dup_file_qc_filename, 'w') as fh:
with open(dup_file_qc_filename, 'w') as temp_file:
sambamba_markdup_command = (
"sambamba markdup -t %d %s --tmpdir=%s %s %s"
% (cpu_count(), sambamba_params, os.getcwd(), bam, tmp_dup_mark_filename)
......@@ -165,11 +195,10 @@ def dedup_mapped(bam, bam_basename, paired):
logger.info(sambamba_markdup_command)
subprocess.check_call(
shlex.split(sambamba_markdup_command),
stderr=fh)
stderr=temp_file)
# Remove duplicates
final_bam_prefix = bam_basename + ".filt.nodup"
final_bam_prefix = bam_basename + ".dedup"
final_bam_filename = final_bam_prefix + ".bam"
if paired: # paired-end data
......@@ -179,11 +208,11 @@ def dedup_mapped(bam, bam_basename, paired):
samtools_dedupe_command = \
"samtools view -F 1804 -b %s" % (tmp_dup_mark_filename)
with open(final_bam_filename, 'w') as fh:
with open(final_bam_filename, 'w') as temp_file:
logger.info(samtools_dedupe_command)
subprocess.check_call(
shlex.split(samtools_dedupe_command),
stdout=fh)
stdout=temp_file)
# Index final bam file
sambamba_index_command = \
......@@ -192,12 +221,12 @@ def dedup_mapped(bam, bam_basename, paired):
subprocess.check_output(shlex.split(sambamba_index_command))
# Generate mapping statistics
final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"
with open(final_bam_file_mapstats_filename, 'w') as fh:
mapstats_filename = final_bam_prefix + ".flagstat.qc"
with open(mapstats_filename, 'w') as temp_file:
flagstat_command = "sambamba flagstat -t %d %s" \
% (cpu_count(), final_bam_filename)
logger.info(flagstat_command)
subprocess.check_call(shlex.split(flagstat_command), stdout=fh)
subprocess.check_call(shlex.split(flagstat_command), stdout=temp_file)
os.remove(bam)
return tmp_dup_mark_filename
......@@ -206,7 +235,7 @@ def dedup_mapped(bam, bam_basename, paired):
def compute_complexity(bam, paired, bam_basename):
'''Calculate library complexity.'''
pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc"
pbc_file_qc_filename = bam_basename + ".pbc.qc"
tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename)
# Sort by name
......@@ -215,6 +244,7 @@ def compute_complexity(bam, paired, bam_basename):
# Obtain unique count statistics
# PBC File output
# Sample Name[tab]
# TotalReadPairs [tab]
# DistinctReadPairs [tab]
# OneReadPair [tab]
......@@ -223,12 +253,12 @@ def compute_complexity(bam, paired, bam_basename):
# PBC1=OnePair/Distinct [tab]
# PBC2=OnePair/TwoPair
pbc_headers = ['TotalReadPairs',
'DistinctReadPairs',
'OneReadPair',
'TwoReadPairs',
'NRF',
'PBC1',
'PBC2']
'DistinctReadPairs',
'OneReadPair',
'TwoReadPairs',
'NRF',
'PBC1',
'PBC2']
if paired:
steps = [
......@@ -247,11 +277,15 @@ def compute_complexity(bam, paired, bam_basename):
])
out, err = utils.run_pipe(steps, tmp_pbc_file_qc_filename)
if err:
logger.error("PBC file error: %s" % (err))
logger.error("PBC file error: %s", err)
# Add headers
# Add Sample Name and headers
pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None,
names=pbc_headers)
names=pbc_headers)
pbc_file['Sample'] = bam_basename
pbc_headers_new = list(pbc_file)
pbc_headers_new.insert(0, pbc_headers_new.pop(pbc_headers_new.index('Sample')))
pbc_file = pbc_file[pbc_headers_new]
pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False)
os.remove(bam)
os.remove(bam + '.bai')
......
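For reference, the PBC quantities named in the header comment of compute_complexity() reduce to three ratios; a worked illustration with made-up counts (NRF = Distinct/Total per the standard ENCODE definition, PBC1 and PBC2 as stated in the comments above):

def pbc_metrics(total_pairs, distinct_pairs, one_pair, two_pair):
    '''NRF, PBC1 and PBC2 as written to the .pbc.qc file.'''
    nrf = distinct_pairs / total_pairs   # Non-Redundant Fraction
    pbc1 = one_pair / distinct_pairs     # PCR Bottlenecking Coefficient 1
    pbc2 = one_pair / two_pair           # PCR Bottlenecking Coefficient 2
    return nrf, pbc1, pbc2

# Made-up counts: pbc_metrics(100, 95, 92, 2) -> (0.95, 0.968..., 46.0)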
......@@ -9,7 +9,6 @@ import shutil
import shlex
import logging
from multiprocessing import cpu_count
import json
import utils
EPILOG = '''
......@@ -33,7 +32,7 @@ STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
......@@ -47,6 +46,10 @@ def get_args():
help="The bwa index of the reference genome.",
required=True)
parser.add_argument('-s', '--sample',
help="The name of the sample.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
......@@ -67,6 +70,18 @@ def check_tools():
bwa_path = shutil.which("bwa")
if bwa_path:
logger.info('Found bwa: %s', bwa_path)
# Get Version
bwa_version_command = "bwa"
try:
subprocess.check_output(bwa_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
bwa_version = e.output
# Write to file
bwa_file = open("version_bwa.txt", "wb")
bwa_file.write(bwa_version)
bwa_file.close()
else:
logger.error('Missing bwa')
raise Exception('Missing bwa')
......@@ -74,6 +89,15 @@ def check_tools():
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
......@@ -101,7 +125,7 @@ def generate_sa(fastq, reference):
def align_se(fastq, sai, reference, fastq_basename):
'''Use BWA to align SE data.'''
bam_filename = '%s.srt.bam' % (fastq_basename)
bam_filename = '%s.bam' % (fastq_basename)
steps = [
"bwa samse %s %s %s"
......@@ -112,7 +136,7 @@ def align_se(fastq, sai, reference, fastq_basename):
out, err = utils.run_pipe(steps)
if err:
logger.error("samse/samtools error: %s" % (err))
logger.error("samse/samtools error: %s", err)
return bam_filename
......@@ -122,21 +146,21 @@ def align_pe(fastq, sai, reference, fastq_basename):
sam_filename = "%s.sam" % (fastq_basename)
badcigar_filename = "%s.badReads" % (fastq_basename)
bam_filename = '%s.srt.bam' % (fastq_basename)
bam_filename = '%s.bam' % (fastq_basename)
# Remove read pairs with bad CIGAR strings and sort by position
steps = [
"bwa sampe -P %s %s %s %s %s"
% (reference, sai[0], sai[1],
fastq[0], fastq[1]),
"tee %s" % (sam_filename),
r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
"sort",
"uniq"]
"bwa sampe -P %s %s %s %s %s"
% (reference, sai[0], sai[1],
fastq[0], fastq[1]),
"tee %s" % (sam_filename),
r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
"sort",
"uniq"]
out, err = utils.run_pipe(steps, badcigar_filename)
if err:
logger.error("sampe error: %s" % (err))
logger.error("sampe error: %s", err)
steps = [
"cat %s" % (sam_filename),
......@@ -147,7 +171,7 @@ def align_pe(fastq, sai, reference, fastq_basename):
out, err = utils.run_pipe(steps)
if err:
logger.error("samtools error: %s" % (err))
logger.error("samtools error: %s", err)
return bam_filename
......@@ -157,6 +181,7 @@ def main():
paired = args.paired
fastq = args.fastq
reference = args.reference
sample = args.sample
# Create a file handler
handler = logging.FileHandler('map.log')
......@@ -167,31 +192,25 @@ def main():
# Run Suffix Array generation
sai = []
for fq in fastq:
sai_filename = generate_sa(fq, reference)
for fastq_file in fastq:
sai_filename = generate_sa(fastq_file, reference)
sai.append(sai_filename)
# Make file basename
fastq_basename = sample
# Run alignment for either PE or SE
if paired: # paired-end data
fastq_r1_basename = os.path.basename(
utils.strip_extensions(fastq[0], STRIP_EXTENSIONS))
fastq_r2_basename = os.path.basename(
utils.strip_extensions(fastq[1], STRIP_EXTENSIONS))
fastq_basename = fastq_r1_basename + fastq_r2_basename
bam_filename = align_pe(fastq, sai, reference, fastq_basename)
else:
fastq_basename = os.path.basename(
utils.strip_extensions(fastq[0], STRIP_EXTENSIONS))
bam_filename = align_se(fastq, sai, reference, fastq_basename)
bam_mapstats_filename = '%s.srt.bam.flagstat.qc' % (fastq_basename)
with open(bam_mapstats_filename, 'w') as fh:
bam_mapstats_filename = '%s.flagstat.qc' % (fastq_basename)
with open(bam_mapstats_filename, 'w') as temp_file:
subprocess.check_call(
shlex.split("samtools flagstat %s" % (bam_filename)),
stdout=fh)
stdout=temp_file)
# Remove sai files
for sai_file in sai:
......
......@@ -3,16 +3,14 @@
'''Call Motifs on called peaks.'''
import os
import sys
import re
from re import sub
import string
import argparse
import logging
import shutil
import subprocess
from multiprocessing import Pool
import pandas as pd
import utils
from multiprocessing import Pool
EPILOG = '''
For more details:
......@@ -27,6 +25,13 @@ logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.narrowPeak', '.replicated']
def get_args():
'''Define arguments.'''
......@@ -51,19 +56,66 @@ def get_args():
# Functions
def check_tools():
'''Checks for required components on the user system'''
logger.info('Checking for required libraries and components on this system')
meme_path = shutil.which("meme")
if meme_path:
logger.info('Found meme: %s', meme_path)
# Get Version
memechip_version_command = "meme-chip --version"
memechip_version = subprocess.check_output(memechip_version_command, shell=True)
# Write to file
meme_file = open("version_memechip.txt", "wb")
meme_file.write(b"Version %s" % (memechip_version))
meme_file.close()
else:
logger.error('Missing meme')
raise Exception('Missing meme')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def run_wrapper(args):
motif_search(*args)
motif_search(*args)
def motif_search(filename, genome, experiment, peak):
'''Run motif search on peaks.'''
file_basename = os.path.basename(
utils.strip_extensions(filename, STRIP_EXTENSIONS))
file_basename = os.path.basename(filename)
sorted_fn = 'sorted-%s' % (file_basename)
out_fa = '%s.fa' % (experiment)
out_motif = '%s_memechip' % (experiment)
# Sort Bed file and limit number of peaks
if peak == -1:
peak = utils.count_lines(filename)
peak_no = 'all'
else:
peak_no = peak
sorted_fn = '%s.%s.narrowPeak' % (file_basename, peak_no)
out, err = utils.run_pipe([
'sort -k %dgr,%dgr %s' % (5, 5, filename),
......@@ -73,9 +125,15 @@ def motif_search(filename, genome, experiment, peak):
out, err = utils.run_pipe([
'bedtools getfasta -fi %s -bed %s -fo %s' % (genome, sorted_fn, out_fa)])
#Call memechip
if err:
logger.error("bedtools error: %s", err)
# Call memechip
out, err = utils.run_pipe([
'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)])
if err:
logger.error("meme-chip error: %s", err)
def main():
args = get_args()
......@@ -83,14 +141,22 @@ def main():
genome = args.genome
peak = args.peak
# Create a file handler
handler = logging.FileHandler('motif.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Read files
design_df = pd.read_csv(design, sep='\t')
meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0])
work_pool = Pool(min(12,design_df.shape[0]))
return_list = work_pool.map(run_wrapper, meme_arglist)
meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0])
work_pool = Pool(min(12, design_df.shape[0]))
return_list = work_pool.map(run_wrapper, meme_arglist)
work_pool.close()
work_pool.join()
if __name__=="__main__":
main()
if __name__ == '__main__':
main()
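The STRIP_EXTENSIONS comment above notes that utils.strip_extensions strips from the right inward, which is why the right-most extensions are listed first; the helper itself is not part of this diff. A minimal sketch consistent with that comment:

def strip_extensions(filename, extensions):
    '''Strip listed extensions from the right inward, one pass each.'''
    basename = filename
    for extension in extensions:
        if basename.endswith(extension):
            basename = basename[:-len(extension)]
    return basename

# strip_extensions('sample.tagAlign.gz', ['.gz', '.tagAlign']) -> 'sample'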
......@@ -6,6 +6,7 @@ import os
import argparse
import logging
import shutil
import subprocess
import pandas as pd
import utils
......@@ -49,6 +50,15 @@ def check_tools():
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
......@@ -113,9 +123,9 @@ def overlap(experiment, design):
# with any one of the overlapping peak pairs >= 0.5
steps_true = ['intersectBed -wo -a %s -b %s' % (pool_peaks, true_rep_peaks[0]),
awk_command,
cut_command,
'sort -u']
awk_command,
cut_command,
'sort -u']
iter_true_peaks = iter(true_rep_peaks)
next(iter_true_peaks)
......@@ -123,13 +133,13 @@ def overlap(experiment, design):
if len(true_rep_peaks) > 1:
for true_peak in true_rep_peaks[1:]:
steps_true.extend(['intersectBed -wo -a stdin -b %s' % (true_peak),
awk_command,
cut_command,
'sort -u'])
awk_command,
cut_command,
'sort -u'])
out, err = utils.run_pipe(steps_true, outfile=overlap_tr_fn)
print("%d peaks overlap with both true replicates" %
(utils.count_lines(overlap_tr_fn)))
(utils.count_lines(overlap_tr_fn)))
# Find pooled peaks that overlap PseudoRep1 and PseudoRep2
# where overlap is defined as the fractional overlap
......@@ -146,7 +156,7 @@ def overlap(experiment, design):
out, err = utils.run_pipe(steps_pseudo, outfile=overlap_pr_fn)
print("%d peaks overlap with both pooled pseudoreplicates"
% (utils.count_lines(overlap_pr_fn)))
% (utils.count_lines(overlap_pr_fn)))
# Make union of peak lists
out, err = utils.run_pipe([
......@@ -154,7 +164,7 @@ def overlap(experiment, design):
'sort -u'
], overlapping_peaks_fn)
print("%d peaks overlap with true replicates or with pooled pseudorepliates"
% (utils.count_lines(overlapping_peaks_fn)))
% (utils.count_lines(overlapping_peaks_fn)))
# Make rejected peak list
out, err = utils.run_pipe([
......@@ -178,6 +188,9 @@ def main():
handler = logging.FileHandler('consensus_peaks.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Read files as dataframes
design_peaks_df = pd.read_csv(design, sep='\t')
design_files_df = pd.read_csv(files, sep='\t')
......@@ -187,7 +200,7 @@ def main():
# Make a design file for annotating Peaks
anno_cols = ['Condition', 'Peaks']
design_anno = pd.DataFrame(columns = anno_cols)
design_anno = pd.DataFrame(columns=anno_cols)
# Find consenus overlap peaks for each experiment
for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
......@@ -197,16 +210,16 @@ def main():
# Write out design files
design_diff.columns = ['SampleID',
'bamReads',
'Condition',
'Tissue',
'Factor',
'Treatment',
'Replicate',
'ControlID',
'bamControl',
'Peaks',
'PeakCaller']
'bamReads',
'Condition',
'Tissue',
'Factor',
'Treatment',
'Replicate',
'ControlID',
'bamControl',
'Peaks',
'PeakCaller']
design_diff.to_csv("design_diffPeaks.csv", header=True, sep=',', index=False)
design_anno.to_csv("design_annotatePeaks.tsv", header=True, sep='\t', index=False)
......
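The >= 0.5 fractional-overlap rule that overlap() applies above is implemented via awk_command (defined outside these hunks); restated in plain Python for clarity, as an illustration of the criterion rather than the pipeline's code:

def passes_fractional_overlap(len_a, len_b, overlap_len, cutoff=0.5):
    '''True if the overlap covers at least `cutoff` of either peak.'''
    return (overlap_len / len_a >= cutoff) or (overlap_len / len_b >= cutoff)

# e.g. a 200 bp and a 1000 bp peak sharing 150 bp pass (150/200 >= 0.5)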
......@@ -3,11 +3,14 @@
'''Generate pooled and pseudoreplicate from data.'''
import argparse
import utils
import logging
import os
import subprocess
import shutil
import shlex
import pandas as pd
import numpy as np
import os
import utils
EPILOG = '''
For more details:
......@@ -21,6 +24,12 @@ logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', '.bedpe']
def get_args():
'''Define arguments.'''
......@@ -87,21 +96,23 @@ def pool(tag_files, outfile, paired):
else:
file_extension = '.bedse.gz'
pooled_filename = outfile + file_extension
pool_basename = os.path.basename(
utils.strip_extensions(outfile, STRIP_EXTENSIONS))
pooled_filename = pool_basename + file_extension
# Merge files
out, err = utils.run_pipe([
'gzip -dc %s' % (' '.join(tag_files)),
'gzip -cn'],
outfile=pooled_filename)
'gzip -cn'], outfile=pooled_filename)
return pooled_filename
def bedpe_to_tagalign(tag_file, outfile):
'''Convert read pairs to reads itno standard tagAlign file.'''
'''Convert read pairs to reads into standard tagAlign file.'''
se_tag_filename = outfile + "bedse.tagAlign.gz"
se_tag_filename = outfile + ".tagAlign.gz"
# Convert read pairs to reads into standard tagAlign file
tag_steps = ["zcat -f %s" % (tag_file)]
......@@ -122,7 +133,7 @@ def self_psuedoreplication(tag_file, prefix, paired):
lines_per_rep = (no_lines+1)/2
# Make a dict of pseudoreplicate file names
pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.bedse.tagAlign.gz'
pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.tagAlign.gz'
for r in [0, 1]}
# Shuffle and split file into equal parts
......@@ -132,21 +143,25 @@ def self_psuedoreplication(tag_file, prefix, paired):
splits_prefix = 'temp_split'
out, err = utils.run_pipe([
'gzip -dc %s' % (tag_file),
'shuf --random-source=%s' % (tag_file),
'split -d -l %d - %s' % (lines_per_rep, splits_prefix)])
psuedo_command = 'bash -c "zcat {} | shuf --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {} | wc -c) -nosalt </dev/zero 2>/dev/null) | '
psuedo_command += 'split -d -l {} - {}."'
psuedo_command = psuedo_command.format(
tag_file,
tag_file,
int(lines_per_rep),
splits_prefix)
logger.info("Running psuedo with %s", psuedo_command)
subprocess.check_call(shlex.split(psuedo_command))
# Convert read pairs to reads into standard tagAlign file
for i, index in enumerate([0, 1]):
string_index = '0' + str(index)
string_index = '.0' + str(index)
steps = ['cat %s' % (splits_prefix + string_index)]
if paired:
steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
steps.extend(['gzip -cn'])
out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i])
os.remove(splits_prefix + string_index)
return pseudoreplicate_dict
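The openssl-seeded shuf added above exists to make the shuffle reproducible: the random stream is keyed on the input's byte count, so rerunning over the same file yields identical pseudoreplicates. An equivalent pure-Python illustration of that idea (not what the pipeline runs):

import gzip
import random

def deterministic_split(tag_file, splits_prefix):
    '''Reproducibly shuffle a gzipped tagAlign and split it in two.'''
    with gzip.open(tag_file, 'rt') as handle:
        lines = handle.readlines()
    # Seed on input size so reruns over the same file agree
    random.Random(len(lines)).shuffle(lines)
    half = (len(lines) + 1) // 2
    for index, chunk in enumerate([lines[:half], lines[half:]]):
        with open('%s.0%d' % (splits_prefix, index), 'w') as out:
            out.writelines(chunk)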
......@@ -213,7 +228,7 @@ def main():
# Update tagAlign with single end data
if paired:
design_new_df['tag_align'] = design_new_df['se_tag_align']
design_new_df.drop(labels = 'se_tag_align', axis = 1, inplace = True)
design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
design_new_df['replicate'] = design_new_df['replicate'].astype(str)
design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1'
......@@ -243,8 +258,6 @@ def main():
# Drop index column
design_new_df.drop(labels='index', axis=1, inplace=True)
else:
# Make pool of replicates
replicate_files = design_df.tag_align.unique()
......@@ -269,7 +282,7 @@ def main():
pool_pseudoreplicates_dict = {}
for index, row in pseudoreplicates_df.iterrows():
replicate_id = index + 1
pr_filename = experiment_id + ".pr" + str(replicate_id) + '.bedse.gz'
pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz'
pool_replicate = pool(row, pr_filename, False)
pool_pseudoreplicates_dict[replicate_id] = pool_replicate
......@@ -277,16 +290,16 @@ def main():
# Update tagAlign with single end data
if paired:
design_new_df['tag_align'] = design_new_df['se_tag_align']
design_new_df.drop(labels = 'se_tag_align', axis = 1, inplace = True)
design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
# Check controls against cutoff_ratio
# if so replace with pool_control
# unless single control was used
if not single_control:
path_to_pool_control = cwd + '/' + pool_control
if control_df.values.max() > 1.2:
if control_df.values.max() > cutoff_ratio:
logger.info("Number of reads in controls differ by " +
" > factor of %f. Using pooled controls." % (cutoff_ratio))
" > factor of %f. Using pooled controls." % (cutoff_ratio))
design_new_df['control_tag_align'] = path_to_pool_control
else:
for index, row in design_new_df.iterrows():
......@@ -302,14 +315,14 @@ def main():
if paired:
control = row['control_tag_align']
control_basename = os.path.basename(
utils.strip_extensions(control, ['.filt.nodup.bedpe.gz']))
control_tmp = bedpe_to_tagalign(control , "control_basename")
utils.strip_extensions(control, STRIP_EXTENSIONS))
control_tmp = bedpe_to_tagalign(control, control_basename)
path_to_control = cwd + '/' + control_tmp
design_new_df.loc[index, 'control_tag_align'] = \
path_to_control
else:
path_to_pool_control = pool_control
path_to_pool_control = pool_control
# Add in pseudo replicates
tmp_metadata = design_new_df.loc[0].copy()
......@@ -330,10 +343,9 @@ def main():
tmp_metadata['tag_align'] = path_to_file
design_new_df = design_new_df.append(tmp_metadata)
# Write out new dataframe
design_new_df.to_csv(experiment_id + '_ppr.tsv',
header=True, sep='\t', index=False)
header=True, sep='\t', index=False)
if __name__ == '__main__':
......
#!/usr/bin/python
# programmer : bbc
# usage:
import sys
import argparse as ap
import logging
import subprocess
import pandas as pd
from multiprocessing import Pool
logging.basicConfig(level=10)
def prepare_argparser():
description = "Make wig file for given bed using bam"
epilog = "For command line options of each command, type %(prog)s COMMAND -h"
argparser = ap.ArgumentParser(description=description, epilog = epilog)
argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input BAM file")
argparser.add_argument("-g","--genome",dest = "genome",type=str,required=True, help="genome", default="hg19")
#argparser.add_argument("-b","--bed",dest="bedfile",type=str,required=True, help = "Gene locus in bed format")
#argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"])
#argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header")
return(argparser)
def run_qc(files, controls, labels):
mbs_command = "multiBamSummary bins --bamfiles "+' '.join(files)+" -out sample_mbs.npz"
p = subprocess.Popen(mbs_command, shell=True)
#logging.debug(mbs_command)
p.communicate()
pcor_command = "plotCorrelation -in sample_mbs.npz --corMethod spearman --skipZeros --plotTitle \"Spearman Correlation of Read Counts\" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers -o experiment.deeptools.heatmap_spearmanCorr_readCounts_v2.png --labels "+" ".join(labels)
#logging.debug(pcor_command)
p = subprocess.Popen(pcor_command, shell=True)
p.communicate()
#plotCoverage
pcov_command = "plotCoverage -b "+" ".join(files)+" --plotFile experiment.deeptools_coverage.png -n 1000000 --plotTitle \"sample coverage\" --ignoreDuplicates --minMappingQuality 10"
p = subprocess.Popen(pcov_command, shell=True)
p.communicate()
#draw fingerprints plots
for treat,ctrl,name in zip(files,controls,labels):
fp_command = "plotFingerprint -b "+treat+" "+ctrl+" --labels "+name+" control --plotFile "+name+".deeptools_fingerprints.png"
p = subprocess.Popen(fp_command, shell=True)
p.communicate()
def bam2bw_wrapper(command):
p = subprocess.Popen(command, shell=True)
p.communicate()
def run_signal(files, labels, genome):
#compute matrix and draw profile and heatmap
gene_bed = genome + "/gene.bed"  # "/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/"+genome+"/gene.bed"
bw_commands = []
for f in files:
bw_commands.append("bamCoverage -bs 10 -b "+f+" -o "+f.replace("bam","bw"))
work_pool = Pool(min(len(files), 12))
work_pool.map(bam2bw_wrapper, bw_commands)
work_pool.close()
work_pool.join()
cm_command = "computeMatrix scale-regions -R "+gene_bed+" -a 3000 -b 3000 --regionBodyLength 5000 --skipZeros -S *.bw -o samples.deeptools_generegionscalematrix.gz"
p = subprocess.Popen(cm_command, shell=True)
p.communicate()
hm_command = "plotHeatmap -m samples.deeptools_generegionscalematrix.gz -out samples.deeptools_readsHeatmap.png"
p = subprocess.Popen(hm_command, shell=True)
p.communicate()
def run(dfile,genome):
#parse dfile, suppose data files are the same folder as design file
dfile = pd.read_csv(dfile)
#QC: multiBamSummary and plotCorrelation
run_qc(dfile['bamReads'], dfile['bamControl'], dfile['SampleID'])
#signal plots
run_signal(dfile['bamReads'],dfile['SampleID'],genome)
def main():
argparser = prepare_argparser()
args = argparser.parse_args()
run(args.infile, args.genome)
if __name__=="__main__":
main()
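run() above reads the design file with pd.read_csv and indexes its bamReads, bamControl and SampleID columns. A hypothetical design file (sample names, paths, and the output file name are all made up) could be built like this:

import pandas as pd

design = pd.DataFrame({
    'SampleID':   ['sampleA', 'sampleB'],
    'bamReads':   ['sampleA.dedup.bam', 'sampleB.dedup.bam'],
    'bamControl': ['controlA.dedup.bam', 'controlB.dedup.bam'],
})
design.to_csv('design_experimentQC.csv', index=False)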
......@@ -5,6 +5,7 @@
import subprocess
import argparse
import shutil
import os
import logging
EPILOG = '''
......@@ -32,6 +33,10 @@ def get_args():
nargs='+',
required=True)
parser.add_argument('-s', '--sample',
help="The name of the sample.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
......@@ -49,6 +54,15 @@ def check_tools():
trimgalore_path = shutil.which("trim_galore")
if trimgalore_path:
logger.info('Found trimgalore: %s', trimgalore_path)
# Get Version
trim_version_command = "trim_galore --version"
trimgalore_version = subprocess.check_output(trim_version_command, shell=True)
# Write to file
trimgalore_file = open("version_trimgalore.txt", "wb")
trimgalore_file.write(trimgalore_version)
trimgalore_file.close()
else:
logger.error('Missing trimgalore')
raise Exception('Missing trimgalore')
......@@ -56,11 +70,46 @@ def check_tools():
cutadapt_path = shutil.which("cutadapt")
if cutadapt_path:
logger.info('Found cutadapt: %s', cutadapt_path)
# Get Version
cutadapt_version_command = "cutadapt --version"
cutadapt_version = subprocess.check_output(cutadapt_version_command, shell=True)
# Write to file
cutadapt_file = open("version_cutadapt.txt", "wb")
cutadapt_file.write(b"Version %s" % (cutadapt_version))
cutadapt_file.close()
else:
logger.error('Missing cutadapt')
raise Exception('Missing cutadapt')
def rename_reads(fastq, sample, paired):
'''Rename fastq files by sample name.'''
# Get current directory to build paths
cwd = os.getcwd()
renamed_fastq = []
if paired: # paired-end data
# Set file names
renamed_fastq.append(cwd + '/' + sample + '_R1.fastq.gz')
renamed_fastq.append(cwd + '/' + sample + '_R2.fastq.gz')
# Create symbolic links
os.symlink(fastq[0], renamed_fastq[0])
os.symlink(fastq[1], renamed_fastq[1])
else:
# Set file names
renamed_fastq.append(cwd + '/' + sample + '_R1.fastq.gz')
# Create symbolic links
os.symlink(fastq[0], renamed_fastq[0])
return renamed_fastq
def trim_reads(fastq, paired):
'''Run trim_galore on 1 or 2 files.'''
......@@ -82,6 +131,7 @@ def trim_reads(fastq, paired):
def main():
args = get_args()
fastq = args.fastq
sample = args.sample
paired = args.paired
# Create a file handler
......@@ -91,8 +141,11 @@ def main():
# Check if tools are present
check_tools()
# Rename fastq files by sample
fastq_rename = rename_reads(fastq, sample, paired)
# Run trim_reads
trim_reads(fastq, paired)
trim_reads(fastq_rename, paired)
if __name__ == '__main__':
......
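For reference, a hypothetical call to the new rename_reads() above for paired-end input (paths and sample name invented):

renamed = rename_reads(
    ['/path/to/A_R1.fastq.gz', '/path/to/A_R2.fastq.gz'],
    'sampleA',
    True)
# -> ['<cwd>/sampleA_R1.fastq.gz', '<cwd>/sampleA_R2.fastq.gz'], created as symlinks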
......@@ -16,7 +16,6 @@ logger.propagate = True
def run_pipe(steps, outfile=None):
# TODO: capture stderr
from subprocess import Popen, PIPE
p = None
p_next = None
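The body of run_pipe is truncated in this hunk; a sketch of how such a step-chainer typically works — each step's stdout feeding the next step's stdin via Popen — is shown below. This is an illustration under assumptions, not the pipeline's exact implementation:

import shlex
from subprocess import Popen, PIPE

def run_pipe_sketch(steps, outfile=None):
    '''Chain shell steps together, optionally writing the final stdout to a file.'''
    processes = []
    previous_stdout = None
    for step in steps:
        processes.append(Popen(shlex.split(step),
                               stdin=previous_stdout, stdout=PIPE))
        previous_stdout = processes[-1].stdout
    out, err = processes[-1].communicate()
    if outfile:
        with open(outfile, 'wb') as handle:
            handle.write(out)
    return out, err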
......@@ -98,13 +97,13 @@ def rescale_scores(filename, scores_col, new_min=10, new_max=1000):
'head -n 1 %s' % (sorted_fn),
'cut -f %s' % (scores_col)])
max_score = float(out.strip())
logger.info("rescale_scores: max_score = %s" % (max_score))
logger.info("rescale_scores: max_score = %s", max_score)
out, err = run_pipe([
'tail -n 1 %s' % (sorted_fn),
'cut -f %s' % (scores_col)])
min_score = float(out.strip())
logger.info("rescale_scores: min_score = %s" % (min_score))
logger.info("rescale_scores: min_score = %s", min_score)
a = min_score
b = max_score
......
......@@ -5,6 +5,7 @@
import os
import argparse
import shutil
import subprocess
import logging
from multiprocessing import cpu_count
import utils
......@@ -22,6 +23,13 @@ logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', '.bedpe']
def get_args():
'''Define arguments.'''
......@@ -52,28 +60,51 @@ def check_tools():
r_path = shutil.which("R")
if r_path:
logger.info('Found R: %s', r_path)
# Get Version
r_version_command = "R --version"
r_version = subprocess.check_output(r_version_command, shell=True)
# Write to file
r_file = open("version_r.txt", "wb")
r_file.write(r_version)
r_file.close()
else:
logger.error('Missing R')
raise Exception('Missing R')
phantompeak_path = shutil.which("run_spp.R")
if phantompeak_path:
logger.info('Found phantompeak: %s', phantompeak_path)
# Get Version
spp_version_command = "R -e \"packageVersion('spp')\""
spp_version = subprocess.check_output(spp_version_command, shell=True)
# Write to file
spp_file = open("version_spp.txt", "wb")
spp_file.write(spp_version)
spp_file.close()
else:
logger.error('Missing phantompeak')
raise Exception('Missing phantompeak')
def xcor(tag, paired):
'''Use spp to calculate cross-correlation stats.'''
tag_basename = os.path.basename(utils.strip_extensions(tag, ['.gz']))
tag_basename = os.path.basename(utils.strip_extensions(tag, STRIP_EXTENSIONS))
uncompressed_tag_filename = tag_basename
# Subsample tagAlign file
NREADS = 15000000
number_reads = 15000000
subsampled_tag_filename = \
tag_basename + ".%d.tagAlign.gz" % (NREADS/1000000)
tag_basename + ".%d.tagAlign.gz" % (number_reads/1000000)
steps = [
'zcat %s' % (tag),
'grep -v "chrM"',
'shuf -n %d --random-source=%s' % (NREADS, tag)]
'shuf -n %d --random-source=%s' % (number_reads, tag)]
if paired:
steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
......@@ -83,8 +114,8 @@ def xcor(tag, paired):
out, err = utils.run_pipe(steps, outfile=subsampled_tag_filename)
# Calculate Cross-correlation QC scores
cc_scores_filename = subsampled_tag_filename + ".cc.qc"
cc_plot_filename = subsampled_tag_filename + ".cc.plot.pdf"
cc_scores_filename = tag_basename + ".cc.qc"
cc_plot_filename = tag_basename + ".cc.plot.pdf"
# CC_SCORE FILE format
# Filename <tab>
......@@ -122,7 +153,7 @@ def main():
check_tools()
# Calculate Cross-correlation
xcor_filename = xcor(tag, paired)
xcor(tag, paired)
if __name__ == '__main__':
......
......@@ -11,18 +11,34 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
@pytest.mark.singleend
def test_annotate_peaks_singleend():
def test_pie_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC.chipseeker_pie.pdf'))
@pytest.mark.singleend
def test_upsetplot_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC.chipseeker_upsetplot.pdf'))
@pytest.mark.singleend
def test_annotation_singleend():
annotation_file = test_output_path + 'ENCSR238SGC.chipseeker_annotation.tsv'
assert os.path.exists(annotation_file)
assert utils.count_lines(annotation_file) == 152840
assert utils.count_lines(annotation_file) == 152255
@pytest.mark.pairedend
def test_annotate_peaks_pairedend():
def test_pie_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.chipseeker_pie.pdf'))
@pytest.mark.pairedend
def test_upsetplot_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.chipseeker_upsetplot.pdf'))
@pytest.mark.pairedend
def test_annotation_pairedend():
annotation_file = test_output_path + 'ENCSR729LGA.chipseeker_annotation.tsv'
assert os.path.exists(annotation_file)
assert utils.count_lines(annotation_file) == 25391
assert utils.count_lines(annotation_file) == 25494
......@@ -9,16 +9,42 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
@pytest.mark.singleend
def test_call_peaks_macs_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.fc_signal.bw'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT.pvalue_signal.bw'))
peak_file = test_output_path + 'ENCLB144FDT_peaks.narrowPeak'
def test_fc_signal_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC/1/', 'ENCLB144FDT.fc_signal.bw'))
@pytest.mark.singleend
def test_pvalue_signal_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC/1/', 'ENCLB144FDT.pvalue_signal.bw'))
@pytest.mark.singleend
def test_peaks_xls_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC/1/', 'ENCLB144FDT_peaks.xls'))
@pytest.mark.singleend
def test_peaks_bed_singleend():
peak_file = test_output_path + 'ENCSR238SGC/1/' + 'ENCLB144FDT.narrowPeak'
assert utils.count_lines(peak_file) == 227389
@pytest.mark.pairedend
def test_call_peaks_macs_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX.fc_signal.bw'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX.pvalue_signal.bw'))
peak_file = test_output_path + 'ENCLB568IYX_peaks.narrowPeak'
assert utils.count_lines(peak_file) == 112652
def test_fc_signal_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA/2/', 'ENCLB568IYX.fc_signal.bw'))
@pytest.mark.pairedend
def test_pvalue_signal_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA/2/', 'ENCLB568IYX.pvalue_signal.bw'))
@pytest.mark.pairedend
def test_peaks_xls_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA/2/', 'ENCLB568IYX_peaks.xls'))
@pytest.mark.pairedend
def test_peaks_bed_pairedend():
peak_file = test_output_path + 'ENCSR729LGA/2/' + 'ENCLB568IYX.narrowPeak'
assert utils.count_lines(peak_file) == 113821
......@@ -63,6 +63,24 @@ def design_4(design):
return design
@pytest.fixture
def design_5(design):
# Update sample_id to have -, spaces or periods
design.loc[design['sample_id'] == 'A_1', 'sample_id'] = 'A 1'
design.loc[design['sample_id'] == 'A_2', 'sample_id'] = 'A.2'
design.loc[design['sample_id'] == 'B_1', 'sample_id'] = 'B-1'
return design
@pytest.fixture
def design_6(design):
# Update experiment_id to have -, spaces or periods
design.loc[design['sample_id'] == 'A_1', 'experiment_id'] = 'A ChIP'
design.loc[design['sample_id'] == 'A_2', 'experiment_id'] = 'A.ChIP'
design.loc[design['sample_id'] == 'B_1', 'experiment_id'] = 'B-ChIP'
return design
@pytest.fixture
def fastq_files_1(fastq_files):
# Drop B_2.fastq.gz
......@@ -115,10 +133,25 @@ def test_check_files_output_pairedend(design_3, fastq_files):
assert new_design.loc[0, 'fastq_read2'] == "/path/to/file/A_2.fastq.gz"
@pytest.mark.unit
def test_check_replicates(design_4):
paired = False
with pytest.raises(Exception) as excinfo:
new_design = check_design.check_replicates(design_4)
assert str(excinfo.value) == "Duplicate replicates in experiments: ['B']"
@pytest.mark.unit
def test_check_samples(design_5):
paired = False
with pytest.raises(Exception) as excinfo:
new_design = check_design.check_samples(design_5)
assert str(excinfo.value) == "Malformed samples from design file: ['A 1', 'A.2', 'B-1']"
@pytest.mark.unit
def test_check_experiments(design_6):
paired = False
with pytest.raises(Exception) as excinfo:
new_design = check_design.check_experiments(design_6)
assert str(excinfo.value) == "Malformed experiment from design file: ['A ChIP', 'A.ChIP', 'B-ChIP']"
......@@ -8,12 +8,20 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
@pytest.mark.singleend
def test_convert_reads_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.tagAlign.gz'))
assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bedse.gz'))
def test_tag_reads_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.tagAlign.gz'))
@pytest.mark.singleend
def test_bed_reads_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.bedse.gz'))
@pytest.mark.pairedend
def test_tag_reads_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX/ENCLB568IYX.tagAlign.gz'))
@pytest.mark.pairedend
def test_map_qc_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.tagAlign.gz'))
assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.bedpe.gz'))
def test_bed_reads_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX/ENCLB568IYX.bedpe.gz'))
......@@ -10,30 +10,65 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/diffPeaks/'
@pytest.mark.singleend
@pytest.mark.singleskip_true
def test_diff_peaks_singleend_single_rep():
assert os.path.isdir(test_output_path) == False
@pytest.mark.pairedend
def test_annotate_peaks_pairedend_single_rep():
def test_diff_peaks_pairedend_single_rep():
assert os.path.isdir(test_output_path) == False
@pytest.mark.singlediff
def test_diff_peaks_singleend_multiple_rep():
def test_heatmap_singleend_multiple_rep():
assert os.path.exists(os.path.join(test_output_path, 'heatmap.pdf'))
@pytest.mark.singlediff
def test_pca_singleend_multiple_rep():
assert os.path.exists(os.path.join(test_output_path, 'pca.pdf'))
@pytest.mark.singlediff
def test_normcount_singleend_multiple_rep():
assert os.path.exists(os.path.join(test_output_path, 'normcount_peaksets.txt'))
assert os.path.exists(os.path.join(test_output_path, 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.bed'))
diffbind_file = test_output_path + 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.csv'
@pytest.mark.singlediff
def test_diffbind_singleend_multiple_rep():
if os.path.isfile(os.path.join(test_output_path, 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.bed')):
assert os.path.exists(os.path.join(test_output_path, 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.bed'))
diffbind_file = test_output_path + 'ENCSR272GNQ_vs_ENCSR238SGC_diffbind.csv'
elif os.path.isfile(os.path.join(test_output_path, 'ENCSR238SGC_vs_ENCSR272GNQ_diffbind.bed')):
assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC_vs_ENCSR272GNQ_diffbind.bed'))
diffbind_file = test_output_path + 'ENCSR238SGC_vs_ENCSR272GNQ_diffbind.csv'
assert os.path.exists(diffbind_file)
assert utils.count_lines(diffbind_file) == 201039
assert utils.count_lines(diffbind_file) == 197471
@pytest.mark.paireddiff
def test_annotate_peaks_pairedend_single_rep():
def test_heatmap_pairedend_single_rep():
assert os.path.exists(os.path.join(test_output_path, 'heatmap.pdf'))
@pytest.mark.paireddiff
def test_pca_pairedend_single_rep():
assert os.path.exists(os.path.join(test_output_path, 'pca.pdf'))
@pytest.mark.paireddiff
def test_normcount_pairedend_single_rep():
assert os.path.exists(os.path.join(test_output_path, 'normcount_peaksets.txt'))
assert os.path.exists(os.path.join(test_output_path, 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.bed'))
diffbind_file = test_output_path + 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.csv'
@pytest.mark.paireddiff
def test_diffbind_pairedend_single_rep():
if os.path.isfile(os.path.join(test_output_path, 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.bed')):
assert os.path.exists(os.path.join(test_output_path, 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.bed'))
diffbind_file = test_output_path + 'ENCSR757EMK_vs_ENCSR729LGA_diffbind.csv'
elif os.path.isfile(os.path.join(test_output_path, 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.bed')):
assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.bed'))
diffbind_file = test_output_path + 'ENCSR729LGA_vs_ENCSR757EMK_diffbind.csv'
assert os.path.exists(diffbind_file)
assert utils.count_lines(diffbind_file) == 66201
......@@ -31,17 +31,44 @@ def test_check_update_controls(design_bam):
@pytest.mark.singleend
def test_experiment_qc_singleend():
def test_coverage_singleend():
assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz'))
assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.png'))
assert os.path.exists(os.path.join(test_output_path, 'coverage.png'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT_fingerprint.png'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI_fingerprint.png'))
assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf'))
@pytest.mark.singleend
def test_spearman_singleend():
assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf'))
@pytest.mark.singleend
def test_pearson_singleend():
assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf'))
@pytest.mark.singleend
def test_fingerprint_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB144FDT_fingerprint.pdf'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI_fingerprint.pdf'))
@pytest.mark.pairdend
def test_experiment_qc_pairedend():
def test_coverage_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'sample_mbs.npz'))
assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.png'))
assert os.path.exists(os.path.join(test_output_path, 'coverage.png'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX_fingerprint.png'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB637LZP_fingerprint.png'))
assert os.path.exists(os.path.join(test_output_path, 'coverage.pdf'))
@pytest.mark.pairdend
def test_spearman_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'heatmap_SpearmanCorr.pdf'))
@pytest.mark.pairdend
def test_pearson_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'heatmap_PearsonCorr.pdf'))
@pytest.mark.pairdend
def test_fingerprint_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX_fingerprint.pdf'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB637LZP_fingerprint.pdf'))
#!/usr/bin/env python3
import pytest
import os
import utils
import yaml
test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/multiqcReport/'
@pytest.mark.singleend
def test_software_references():
assert os.path.exists(os.path.join(test_output_path, 'software_references_mqc.yaml'))
@pytest.mark.singleend
def test_software_references_output():
software_references = os.path.join(test_output_path, 'software_references_mqc.yaml')
with open(software_references, 'r') as stream:
data_loaded = yaml.load(stream)
assert len(data_loaded['data'].split('<dt>')) == 17
#!/usr/bin/env python3
import pytest
import os
import utils
import yaml
test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/multiqcReport/'
@pytest.mark.singleend
def test_software_versions():
assert os.path.exists(os.path.join(test_output_path, 'software_versions_mqc.yaml'))
@pytest.mark.singleend
def test_software_versions_output():
software_versions = os.path.join(test_output_path, 'software_versions_mqc.yaml')
with open(software_versions, 'r') as stream:
data_loaded = yaml.load(stream)
assert len(data_loaded['data'].split('<dt>')) == 17
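The 17 in these assertions follows from the 16 tools in the versions table: splitting on '<dt>' yields one piece per tag plus the text before the first tag. A quick check:

parts = ('<dl>' + '<dt>tool</dt>' * 16 + '</dl>').split('<dt>')
assert len(parts) == 17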
......@@ -8,31 +8,49 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../output/filterReads/'
@pytest.mark.singleend
def test_dedup_files_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.dedup.bam'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.dedup.bam.bai'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.dedup.qc'))
@pytest.mark.singleend
def test_map_qc_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam'))
assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.filt.nodup.bam.bai'))
filtered_reads_report = test_output_path + 'ENCFF646LXU.filt.nodup.flagstat.qc'
filtered_reads_report = test_output_path + 'ENCLB831RUI/ENCLB831RUI.dedup.flagstat.qc'
samtools_report = open(filtered_reads_report).readlines()
assert '64962570 + 0 in total' in samtools_report[0]
assert '64962570 + 0 mapped (100.00%:N/A)' in samtools_report[4]
library_complexity = test_output_path + 'ENCFF646LXU.filt.nodup.pbc.qc'
@pytest.mark.singleend
def test_library_complexity_singleend():
library_complexity = test_output_path + 'ENCLB831RUI/ENCLB831RUI.pbc.qc'
df_library_complexity = pd.read_csv(library_complexity, sep='\t')
assert df_library_complexity["NRF"].iloc[0] == 0.926192
assert df_library_complexity["PBC1"].iloc[0] == 0.926775
assert df_library_complexity["PBC2"].iloc[0] == 13.706885
@pytest.mark.pairedend
def test_dedup_files_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX/ENCLB568IYX.dedup.bam'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX/ENCLB568IYX.dedup.bam.bai'))
assert os.path.exists(os.path.join(test_output_path, 'ENCLB568IYX/ENCLB568IYX.dedup.qc'))
@pytest.mark.pairedend
def test_map_qc_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.bam'))
assert os.path.exists(os.path.join(test_output_path, 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.bam.bai'))
filtered_reads_report = test_output_path + 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.flagstat.qc'
filtered_reads_report = test_output_path + 'ENCLB568IYX/ENCLB568IYX.dedup.flagstat.qc'
samtools_report = open(filtered_reads_report).readlines()
assert '47389080 + 0 in total' in samtools_report[0]
assert '47389080 + 0 mapped (100.00%:N/A)' in samtools_report[4]
library_complexity = test_output_path + 'ENCFF293YFE_val_2ENCFF330MCZ_val_1.filt.nodup.pbc.qc'
assert '47388510 + 0 in total' in samtools_report[0]
assert '47388510 + 0 mapped (100.00%:N/A)' in samtools_report[4]
@pytest.mark.pairedend
def test_library_complexity_pairedend():
library_complexity = test_output_path + 'ENCLB568IYX/ENCLB568IYX.pbc.qc'
df_library_complexity = pd.read_csv(library_complexity, sep='\t')
assert df_library_complexity["NRF"].iloc[0] == 0.947064
assert df_library_complexity["PBC1"].iloc[0] == 0.946724
assert df_library_complexity["PBC2"].iloc[0] == 18.643039
assert round(df_library_complexity["PBC1"].iloc[0],6) == 0.946723
assert round(df_library_complexity["PBC2"].iloc[0],6) == 18.642645
......@@ -9,8 +9,8 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
@pytest.mark.singleend
def test_map_reads_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCFF646LXU.srt.bam'))
aligned_reads_report = test_output_path + 'ENCFF646LXU.srt.bam.flagstat.qc'
assert os.path.exists(os.path.join(test_output_path, 'ENCLB831RUI/ENCLB831RUI.bam'))
aligned_reads_report = test_output_path + 'ENCLB831RUI/ENCLB831RUI.flagstat.qc'
samtools_report = open(aligned_reads_report).readlines()
assert '80795025 + 0 in total' in samtools_report[0]
assert '80050072 + 0 mapped (99.08% : N/A)' in samtools_report[4]
......@@ -18,8 +18,8 @@ def test_map_reads_singleend():
@pytest.mark.pairedend
def test_map_reads_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCFF002DTU_val_1ENCFF002EFI_val_2.srt.bam'))
aligned_reads_report = test_output_path + 'ENCFF002DTU_val_1ENCFF002EFI_val_2.srt.bam.flagstat.qc'
assert os.path.exists(os.path.join(test_output_path, 'ENCLB678IDC/ENCLB678IDC.bam'))
aligned_reads_report = test_output_path + 'ENCLB678IDC/ENCLB678IDC.flagstat.qc'
samtools_report = open(aligned_reads_report).readlines()
assert '72660890 + 0 in total' in samtools_report[0]
assert '72053925 + 0 mapped (99.16% : N/A)' in samtools_report[4]
......