diff --git a/alignment/convert_reads.py b/alignment/convert_reads.py new file mode 100644 index 0000000000000000000000000000000000000000..7d57c88b6e38ddf160c1ffae3095aeddc8d0f84d --- /dev/null +++ b/alignment/convert_reads.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 + +'''Convert tagAlign files for further processing.''' + +import os +import argparse +import shutil +import subprocess +import shlex +import logging +from multiprocessing import cpu_count +import sys +sys.path.append(os.path.abspath('../python_utils')) +import utils + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-b', '--bam', + help="The bam file to convert.", + required=True) + + parser.add_argument('-p', '--paired', + help="True/False if paired-end or single end.", + default=False, + action='store_true') + + args = parser.parse_args() + return args + +# Functions + + +def check_tools(): + '''Checks for required componenets on user system''' + + logger.info('Checking for required libraries and components on this system') + + bedtools_path = shutil.which("bedtools") + if bedtools_path: + logger.info('Found bedtools: %s', bedtools_path) + else: + logger.error('Missing bedtools') + raise Exception('Missing bedtools') + + samtools_path = shutil.which("samtools") + if samtools_path: + logger.info('Found samtools: %s', samtools_path) + else: + logger.error('Missing samtools') + raise Exception('Missing samtools') + + +def convert_mapped(bam, tag_filename): + '''Use bedtools to convert to tagAlign.''' + + out, err = utils.run_pipe([ + "bamToBed -i %s" % (bam), + r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""", + "gzip -nc"], outfile=tag_filename) + + +def convert_mapped_pe(bam, bam_basename): + '''Use bedtools to convert to tagAlign PE data.''' + + bedpe_filename = bam_basename + ".bedpe.gz" + + # Name sort bam to make BEDPE + nmsrt_bam_filename = bam_basename + ".nmsrt.bam" + samtools_sort_command = \ + "samtools sort -n -@%d -o %s %s" \ + % (cpu_count(), nmsrt_bam_filename, bam) + + logger.info(samtools_sort_command) + subprocess.check_output(shlex.split(samtools_sort_command)) + + out, err = utils.run_pipe([ + "bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename), + "gzip -nc"], outfile=bedpe_filename) + + +def main(): + args = get_args() + paired = args.paired + bam = args.bam + + # Create a file handler + handler = logging.FileHandler('convert_reads.log') + logger.addHandler(handler) + + # Check if tools are present + check_tools() + + # Convert PE or SE to tagAlign + bam_basename = os.path.basename( + utils.strip_extensions(bam, ['.bam'])) + + tag_filename = bam_basename + '.tagAlign.gz' + convert_mapped(bam, tag_filename) + + if paired: # paired-end data + convert_mapped_pe(bam, bam_basename) + else: + bedse_filename = bam_basename + ".bedse.gz" + shutil.copy(tag_filename, bedse_filename) + + +if __name__ == '__main__': + main() diff --git a/alignment/map_qc.py b/alignment/map_qc.py new file mode 100644 index 0000000000000000000000000000000000000000..7959973adffb8443b8cb9c0d99846b84d260aeff --- /dev/null +++ b/alignment/map_qc.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 + +'''Remove duplicates and filter unmapped reads.''' + +import os +import subprocess 
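For readers skimming convert_reads.py above, the bamToBed | awk | gzip chain that utils.run_pipe() assembles for single-end data can also be written as a plain subprocess pipeline. The sketch below is illustrative only: the function name and file paths are made up, and it assumes bedtools (bamToBed), awk and gzip are available on the PATH.

```python
import subprocess

def bam_to_tagalign_se(bam, tag_filename):
    """Minimal stand-in for convert_mapped(): BAM -> gzipped tagAlign."""
    with open(tag_filename, 'wb') as out:
        # bamToBed emits BED6; awk blanks the name/score columns to N/1000
        bam_to_bed = subprocess.Popen(['bamToBed', '-i', bam],
                                      stdout=subprocess.PIPE)
        awk = subprocess.Popen(
            ['awk', 'BEGIN{OFS="\\t"}{$4="N";$5="1000";print $0}'],
            stdin=bam_to_bed.stdout, stdout=subprocess.PIPE)
        gzip = subprocess.Popen(['gzip', '-nc'], stdin=awk.stdout, stdout=out)
        # close upstream handles so SIGPIPE propagates if gzip exits early
        bam_to_bed.stdout.close()
        awk.stdout.close()
        gzip.communicate()

# bam_to_tagalign_se('sample1.bam', 'sample1.tagAlign.gz')  # hypothetical paths
```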
+import argparse +import shutil +import shlex +import logging +from multiprocessing import cpu_count +import sys +import pandas as pd +sys.path.append(os.path.abspath('../python_utils')) +import utils + + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +# the order of this list is important. +# strip_extensions strips from the right inward, so +# the expected right-most extensions should appear first (like .gz) +# Modified from J. Seth Strattan +STRIP_EXTENSIONS = ['.bam', '.srt'] + + +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-b', '--bam', + help="The bam file to run filtering and qc on.", + required=True) + + parser.add_argument('-p', '--paired', + help="True/False if paired-end or single end.", + default=False, + action='store_true') + + args = parser.parse_args() + return args + +# Functions + + +def check_tools(): + '''Checks for required componenets on user system''' + + logger.info('Checking for required libraries and components on this system') + + samtools_path = shutil.which("samtools") + if samtools_path: + logger.info('Found samtools: %s', samtools_path) + else: + logger.error('Missing samtools') + raise Exception('Missing samtools') + + sambamba_path = shutil.which("sambamba") + if sambamba_path: + logger.info('Found sambamba: %s', sambamba_path) + else: + logger.error('Missing sambamba') + raise Exception('Missing sambamba') + + bedtools_path = shutil.which("bedtools") + if bedtools_path: + logger.info('Found bedtools: %s', bedtools_path) + else: + logger.error('Missing bedtools') + raise Exception('Missing bedtools') + + +def filter_mapped_pe(bam, bam_basename): + '''Use samtools to filter unmapped reads for PE data.''' + + filt_bam_prefix = bam_basename + ".filt.srt" + filt_bam_filename = filt_bam_prefix + ".bam" + tmp_filt_bam_prefix = "tmp.%s" % (filt_bam_prefix) + tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam" + + # Remove unmapped, mate unmapped + # not primary alignment, reads failing platform + # Remove low MAPQ reads + # Only keep properly paired reads + # Obtain name sorted BAM file + out, err = utils.run_pipe([ + # filter: -F 1804 FlAG bits to exclude; -f 2 FLAG bits to reqire; + # -q 30 exclude MAPQ < 30; -u uncompressed output + # exclude FLAG 1804: unmapped, next segment unmapped, secondary + # alignments, not passing platform q, PCR or optical duplicates + # require FLAG 2: properly aligned + "samtools view -F 1804 -f 2 -q 30 -u %s" % (bam), + # sort: -n sort by name; - take input from stdin; + # out to specified filename + # Will produce name sorted BAM + "samtools sort -n -@ %d -o %s" % (cpu_count(), tmp_filt_bam_filename)]) + if err: + logger.error("samtools filter error: %s" % (err)) + + # Remove orphan reads (pair was removed) + # and read pairs mapping to different chromosomes + # Obtain position sorted BAM + out, err = utils.run_pipe([ + # fill in mate coordinates, ISIZE and mate-related flags + # fixmate requires name-sorted alignment; -r removes secondary and + # unmapped (redundant here because already done above?) 
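+        # the -F 1804 mask used in these view commands is the sum of five SAM
+        # flag bits: 4 (read unmapped) + 8 (mate unmapped) + 256 (secondary
+        # alignment) + 512 (fails vendor QC) + 1024 (PCR/optical duplicate)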
+ # - send output to stdout + "samtools fixmate -r %s -" % (tmp_filt_bam_filename), + # repeat filtering after mate repair + "samtools view -F 1804 -f 2 -u -", + # produce the coordinate-sorted BAM + "samtools sort -@ %d -o %s" % (cpu_count(), filt_bam_filename)]) + + os.remove(tmp_filt_bam_filename) + return filt_bam_filename + + +def filter_mapped_se(bam, bam_basename): + '''Use samtools to filter unmapped reads for SE data.''' + + filt_bam_prefix = bam_basename + ".filt.srt" + filt_bam_filename = filt_bam_prefix + ".bam" + + # Remove unmapped, mate unmapped + # not primary alignment, reads failing platform + # Remove low MAPQ reads + # Obtain name sorted BAM file + with open(filt_bam_filename, 'w') as out_bam: + samtools_filter_command = ( + "samtools view -F 1804 -q 30 -b %s" + % (bam) + ) + logger.info(samtools_filter_command) + subprocess.check_call( + shlex.split(samtools_filter_command), + stdout=out_bam) + + return filt_bam_filename + + +def dedup_mapped(bam, bam_basename, paired): + '''Use sambamba and samtools to remove duplicates.''' + + # Markduplicates + dup_file_qc_filename = bam_basename + ".dup.qc" + tmp_dup_mark_filename = bam_basename + ".dupmark.bam" + sambamba_params = "--hash-table-size=17592186044416" + \ + " --overflow-list-size=20000000 --io-buffer-size=256" + with open(dup_file_qc_filename, 'w') as dup_qc: + sambamba_markdup_command = ( + "sambamba markdup -t %d %s --tmpdir=%s %s %s" + % (cpu_count(), sambamba_params, os.getcwd(), bam, tmp_dup_mark_filename) + ) + logger.info(sambamba_markdup_command) + subprocess.check_call( + shlex.split(sambamba_markdup_command), + stderr=dup_qc) + + + # Remove duplicates + final_bam_prefix = bam_basename + ".filt.nodup" + final_bam_filename = final_bam_prefix + ".bam" + + if paired: # paired-end data + samtools_dedupe_command = \ + "samtools view -F 1804 -f 2 -b %s" % (tmp_dup_mark_filename) + else: + samtools_dedupe_command = \ + "samtools view -F 1804 -b %s" % (tmp_dup_mark_filename) + + with open(final_bam_filename, 'w') as out_bam: + logger.info(samtools_dedupe_command) + subprocess.check_call( + shlex.split(samtools_dedupe_command), + stdout=out_bam) + + # Index final bam file + sambamba_index_command = \ + "samtools index -@ %d %s" % (cpu_count(), final_bam_filename) + logger.info(sambamba_index_command) + subprocess.check_output(shlex.split(sambamba_index_command)) + + # Generate mapping statistics + final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc" + with open(final_bam_file_mapstats_filename, 'w') as out_stats: + flagstat_command = "sambamba flagstat -t %d %s" \ + % (cpu_count(), final_bam_filename) + logger.info(flagstat_command) + subprocess.check_call(shlex.split(flagstat_command), stdout=out_stats) + + os.remove(bam) + return tmp_dup_mark_filename + + +def compute_complexity(bam, paired, bam_basename): + '''Calculate library complexity .''' + + pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc" + tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename) + + # Sort by name + # convert to bedPE and obtain fragment coordinates + # sort by position and strand + # Obtain unique count statistics + + # PBC File output + # TotalReadPairs [tab] + # DistinctReadPairs [tab] + # OneReadPair [tab] + # TwoReadPairs [tab] + # NRF=Distinct/Total [tab] + # PBC1=OnePair/Distinct [tab] + # PBC2=OnePair/TwoPair + pbc_headers = [ + 'TotalReadPairs', + 'DistinctReadPairs', + 'OneReadPair', + 'TwoReadPairs', + 'NRF', + 'PBC1', + 'PBC2'] + + if paired: + steps = [ + "samtools sort -@%d -n %s" % (cpu_count(), 
bam), + "bamToBed -bedpe -i stdin", + r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""] + else: + steps = [ + "bamToBed -i %s" % (bam), + r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""] + steps.extend([ + "grep -v 'chrM'", + "sort", + "uniq -c", + r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'""" + ]) + out, err = utils.run_pipe(steps, tmp_pbc_file_qc_filename) + if err: + logger.error("PBC file error: %s", err) + + # Add headers + pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None, + names=pbc_headers) + pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False) + os.remove(bam) + os.remove(bam + '.bai') + os.remove(tmp_pbc_file_qc_filename) + + +def main(): + args = get_args() + paired = args.paired + bam = args.bam + + # Create a file handler + handler = logging.FileHandler('map_qc.log') + logger.addHandler(handler) + + # Check if tools are present + check_tools() + + # Run filtering for either PE or SE + bam_basename = os.path.basename( + utils.strip_extensions(bam, STRIP_EXTENSIONS)) + if paired: # paired-end data + filter_bam_filename = filter_mapped_pe(bam, bam_basename) + + else: + filter_bam_filename = filter_mapped_se(bam, bam_basename) + + # Remove duplicates + markdup_bam_filename = dedup_mapped(filter_bam_filename, bam_basename, paired) + + # Compute library complexity + compute_complexity(markdup_bam_filename, paired, bam_basename) + + +if __name__ == '__main__': + main() diff --git a/alignment/map_reads.py b/alignment/map_reads.py new file mode 100644 index 0000000000000000000000000000000000000000..80f47089353b89f5b45a45ab5315440446c2c652 --- /dev/null +++ b/alignment/map_reads.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 + +'''Align reads to reference genome.''' + +import os +import subprocess +import argparse +import shutil +import shlex +import logging +from multiprocessing import cpu_count +import sys +sys.path.append(os.path.abspath('../python_utils')) +import utils + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + +# the order of this list is important. +# strip_extensions strips from the right inward, so +# the expected right-most extensions should appear first (like .gz) +# Modified from J. 
Seth Strattan +STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed'] + + +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-f', '--fastq', + help="The fastq file to run triming on.", + nargs='+', + required=True) + + parser.add_argument('-r', '--reference', + help="The bwa index of the reference genome.", + required=True) + + parser.add_argument('-p', '--paired', + help="True/False if paired-end or single end.", + default=False, + action='store_true') + + args = parser.parse_args() + return args + + +# Functions + + +def check_tools(): + '''Checks for required componenets on user system''' + + logger.info('Checking for required libraries and components on this system') + + bwa_path = shutil.which("bwa") + if bwa_path: + logger.info('Found bwa: %s', bwa_path) + else: + logger.error('Missing bwa') + raise Exception('Missing bwa') + + samtools_path = shutil.which("samtools") + if samtools_path: + logger.info('Found samtools: %s', samtools_path) + else: + logger.error('Missing samtools') + raise Exception('Missing samtools') + + +def generate_sa(fastq, reference): + '''Use BWA to generate Suffix Arrays.''' + + fastq_basename = os.path.basename(utils.strip_extensions(fastq, STRIP_EXTENSIONS)) + + bwa_aln_params = '-q 5 -l 32 -k 2' + + sai = '%s.sai' % (fastq_basename) + with open(sai, 'w') as sai_file: + bwa_command = "bwa aln %s -t %d %s %s" \ + % (bwa_aln_params, cpu_count(), + reference, fastq) + + logger.info("Running bwa with %s", bwa_command) + subprocess.check_call(shlex.split(bwa_command), stdout=sai_file) + + return sai + + +def align_se(fastq, sai, reference, fastq_basename): + '''Use BWA to align SE data.''' + + bam_filename = '%s.srt.bam' % (fastq_basename) + + steps = [ + "bwa samse %s %s %s" + % (reference, sai[0], fastq[0]), + "samtools view -@%d -Su -" % (cpu_count()), + "samtools sort -@%d -o %s" + % (cpu_count(), bam_filename)] + + out, err = utils.run_pipe(steps) + if err: + logger.error("samse/samtools error: %s", err) + + return bam_filename + + +def align_pe(fastq, sai, reference, fastq_basename): + '''Use BWA to align PE data.''' + + sam_filename = "%s.sam" % (fastq_basename) + badcigar_filename = "%s.badReads" % (fastq_basename) + bam_filename = '%s.srt.bam' % (fastq_basename) + + # Remove read pairs with bad CIGAR strings and sort by position + steps = [ + "bwa sampe -P %s %s %s %s %s" + % (reference, sai[0], sai[1], + fastq[0], fastq[1]), + "tee %s" % (sam_filename), + r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! 
/^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""", + "sort", + "uniq"] + + out, err = utils.run_pipe(steps, badcigar_filename) + if err: + logger.error("sampe error: %s", err) + + steps = [ + "cat %s" % (sam_filename), + "grep -v -F -f %s" % (badcigar_filename), + "samtools view -@%d -Su -" % (cpu_count()), + "samtools sort -@%d -o %s" + % (cpu_count(), bam_filename)] + + out, err = utils.run_pipe(steps) + if err: + logger.error("samtools error: %s", err) + + return bam_filename + + +def main(): + args = get_args() + paired = args.paired + fastq = args.fastq + reference = args.reference + + # Create a file handler + handler = logging.FileHandler('map.log') + logger.addHandler(handler) + + # Check if tools are present + check_tools() + + # Run Suffix Array generation + sai = [] + for fq in fastq: + sai_filename = generate_sa(fq, reference) + sai.append(sai_filename) + + # Run alignment for either PE or SE + if paired: # paired-end data + fastq_r1_basename = os.path.basename( + utils.strip_extensions(fastq[0], STRIP_EXTENSIONS)) + fastq_r2_basename = os.path.basename( + utils.strip_extensions(fastq[1], STRIP_EXTENSIONS)) + fastq_basename = fastq_r1_basename + fastq_r2_basename + + bam_filename = align_pe(fastq, sai, reference, fastq_basename) + + else: + fastq_basename = os.path.basename( + utils.strip_extensions(fastq[0], STRIP_EXTENSIONS)) + + bam_filename = align_se(fastq, sai, reference, fastq_basename) + + bam_mapstats_filename = '%s.srt.bam.flagstat.qc' % (fastq_basename) + with open(bam_mapstats_filename, 'w') as out_file: + subprocess.check_call( + shlex.split("samtools flagstat %s" % (bam_filename)), + stdout=out_file) + + # Remove sai files + for sai_file in sai: + os.remove(sai_file) + + +if __name__ == '__main__': + main() diff --git a/call_peaks/call_peaks_macs.py b/call_peaks/call_peaks_macs.py new file mode 100644 index 0000000000000000000000000000000000000000..154bdc668b95a854fb887702df30d4d6375a1565 --- /dev/null +++ b/call_peaks/call_peaks_macs.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python3 + +'''Generate peaks from data.''' + +import os +import argparse +import shutil +import logging +from multiprocessing import cpu_count +import sys +sys.path.append(os.path.abspath('../python_utils')) +import utils +sys.path.append(os.path.abspath('../cross_correlation')) +from xcor import xcor as calculate_xcor + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-t', '--tag', + help="The tagAlign file to perform peak calling on.", + required=True) + + parser.add_argument('-x', '--xcor', + help="The cross-correlation file (if already calculated).", + required=True) + + parser.add_argument('-c', '--con', + help="The control tagAling file used for peak calling.", + required=True) + + parser.add_argument('-s', '--sample', + help="The sample id to name the peak files.", + required=True) + + parser.add_argument('-g', '--genome', + help="The genome size of reference genome.", + required=True) + + parser.add_argument('-z', '--size', + help="The file with chromosome sizes of reference genome.", + required=True) + 
+ parser.add_argument('-p', '--paired', + help="True/False if paired-end or single end.", + default=False, + action='store_true') + + args = parser.parse_args() + return args + +# Functions + + +def check_tools(): + '''Checks for required componenets on user system''' + + logger.info('Checking for required libraries and components on this system') + + r_path = shutil.which("R") + if r_path: + logger.info('Found R: %s', r_path) + else: + logger.error('Missing R') + raise Exception('Missing R') + + macs_path = shutil.which("macs2") + if r_path: + logger.info('Found MACS2: %s', macs_path) + else: + logger.error('Missing MACS2') + raise Exception('Missing MACS2') + + bg_bw_path = shutil.which("bedGraphToBigWig") + if bg_bw_path: + logger.info('Found bedGraphToBigWig: %s', bg_bw_path) + else: + logger.error('Missing bedGraphToBigWig') + raise Exception('Missing bedGraphToBigWig') + + bedtools_path = shutil.which("bedtools") + if bedtools_path: + logger.info('Found bedtools: %s', bedtools_path) + else: + logger.error('Missing bedtools') + raise Exception('Missing bedtools') + + +def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes): + + # Extract the fragment length estimate from column 3 of the + # cross-correlation scores file + with open(xcor, 'r') as xcor_fh: + firstline = xcor_fh.readline() + frag_lengths = firstline.split()[2] # third column + fragment_length = frag_lengths.split(',')[0] # grab first value + logger.info("Fraglen %s" % (fragment_length)) + + + # Generate narrow peaks and preliminary signal tracks + + command = 'macs2 callpeak ' + \ + '-t %s -c %s ' % (experiment, control) + \ + '-f BED -n %s ' % (prefix) + \ + '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' % (genome_size, fragment_length) + + logger.info(command) + returncode = utils.block_on(command) + logger.info("MACS2 exited with returncode %d" % (returncode)) + assert returncode == 0, "MACS2 non-zero return" + + # MACS2 sometimes calls features off the end of chromosomes. + # Remove coordinates outside chromosome sizes + + narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix) + clipped_narrowpeak_fn = 'clipped-%s' % (narrowpeak_fn) + + + steps = ['slopBed -i %s -g %s -b 0' % (narrowpeak_fn, chrom_sizes), + 'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)] + + out, err = utils.run_pipe(steps) + + # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format + # (score must be <1000) + rescaled_narrowpeak_fn = utils.rescale_scores( + clipped_narrowpeak_fn, scores_col=5) + + # Sort by Col8 in descending order and replace long peak names in Column 4 + # with Peak_<peakRank> + steps = [ + 'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn), + r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""" + ] + + out, err = utils.run_pipe(steps, '%s' % (narrowpeak_fn)) + + # For Fold enrichment signal tracks + + # This file is a tab delimited file with 2 columns Col1 (chromosome name), + # Col2 (chromosome size in bp). 
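+    # For example, two illustrative chrom.sizes lines (tab separated):
+    #   chr1    248956422
+    #   chr2    242193529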
+ + command = 'macs2 bdgcmp ' + \ + '-t %s_treat_pileup.bdg ' % (prefix) + \ + '-c %s_control_lambda.bdg ' % (prefix) + \ + '-o %s_FE.bdg ' % (prefix) + \ + '-m FE' + + logger.info(command) + returncode = utils.block_on(command) + logger.info("MACS2 exited with returncode %d" % (returncode)) + assert returncode == 0, "MACS2 non-zero return" + + # Remove coordinates outside chromosome sizes (MACS2 bug) + fc_bedgraph_fn = '%s.fc.signal.bedgraph' % (prefix) + fc_bedgraph_sorted_fn = 'sorted-%s' % (fc_bedgraph_fn) + fc_signal_fn = "%s.fc_signal.bw" % (prefix) + steps = ['slopBed -i %s_FE.bdg -g %s -b 0' % (prefix, chrom_sizes), + 'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)] + + out, err = utils.run_pipe(steps) + + # Sort file + out, err = utils.run_pipe([ + 'bedSort %s %s' % (fc_bedgraph_fn, fc_bedgraph_sorted_fn)]) + + # Convert bedgraph to bigwig + command = 'bedGraphToBigWig ' + \ + '%s ' % (fc_bedgraph_sorted_fn) + \ + '%s ' % (chrom_sizes) + \ + '%s' % (fc_signal_fn) + + logger.info(command) + returncode = utils.block_on(command) + logger.info("bedGraphToBigWig exited with returncode %d" % (returncode)) + assert returncode == 0, "bedGraphToBigWig non-zero return" + + # For -log10(p-value) signal tracks + + # Compute sval = + # min(no. of reads in ChIP, no. of reads in control) / 1,000,000 + out, err = utils.run_pipe(['gzip -dc %s' % (experiment), 'wc -l']) + chip_reads = out.strip() + out, err = utils.run_pipe(['gzip -dc %s' % (control), 'wc -l']) + control_reads = out.strip() + sval = str(min(float(chip_reads), float(control_reads)) / 1000000) + + logger.info("chip_reads = %s, control_reads = %s, sval = %s" % + (chip_reads, control_reads, sval)) + + command = 'macs2 bdgcmp ' + \ + '-t %s_treat_pileup.bdg ' % (prefix) + \ + '-c %s_control_lambda.bdg ' % (prefix) + \ + '-o %s_ppois.bdg ' % (prefix) + \ + '-m ppois -S %s' % (sval) + + logger.info(command) + returncode = utils.block_on(command) + assert returncode == 0, "MACS2 non-zero return" + + # Remove coordinates outside chromosome sizes (MACS2 bug) + pvalue_bedgraph_fn = '%s.pval.signal.bedgraph' % (prefix) + pvalue_bedgraph_sorted_fn = 'sort-%s' % (pvalue_bedgraph_fn) + pvalue_signal_fn = "%s.pvalue_signal.bw" % (prefix) + steps = ['slopBed -i %s_ppois.bdg -g %s -b 0' % (prefix, chrom_sizes), + 'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)] + + out, err = utils.run_pipe(steps) + + # Sort file + out, err = utils.run_pipe([ + 'bedSort %s %s' % (fc_bedgraph_fn, pvalue_bedgraph_sorted_fn)]) + + # Convert bedgraph to bigwig + command = 'bedGraphToBigWig ' + \ + '%s ' % (pvalue_bedgraph_sorted_fn) + \ + '%s ' % (chrom_sizes) + \ + '%s' % (pvalue_signal_fn) + + logger.info(command) + returncode = utils.block_on(command) + logger.info("bedGraphToBigWig exited with returncode %d" % (returncode)) + assert returncode == 0, "bedGraphToBigWig non-zero return" + + # Remove temporary files + os.remove(clipped_narrowpeak_fn) + os.remove(rescaled_narrowpeak_fn) + + +def main(): + args = get_args() + tag = args.tag + xcor = args.xcor + con = args.con + sample = args.sample + genome_size = args.genome + chrom_size = args.size + paired = args.paired + + # Create a file handler + handler = logging.FileHandler('call_peaks.log') + logger.addHandler(handler) + + # Check if tools are present + check_tools() + + # Calculate Cross-correlation if not already calcualted + if xcor == 'Calculate': + xcor_file = calculate_xcor(tag, paired) + else: + xcor_file = xcor + + # Call Peaks using MACS2 + call_peaks_macs(tag, xcor_file, con, sample, 
genome_size, chrom_size) + + +if __name__ == '__main__': + main() diff --git a/call_peaks/overlap_peaks.py b/call_peaks/overlap_peaks.py new file mode 100644 index 0000000000000000000000000000000000000000..9bfd19c8edf5ddb8022009dcf555c4eb7390c31f --- /dev/null +++ b/call_peaks/overlap_peaks.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 + +'''Generate naive overlap peak files and design file for downstream processing.''' + +import os +import argparse +import logging +import shutil +import pandas as pd +import sys +sys.path.append(os.path.abspath('../python_utils')) +import utils + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-d', '--design', + help="The design file of peaks (tsv format).", + required=True) + + parser.add_argument('-f', '--files', + help="The design file of with bam files (tsv format).", + required=True) + + args = parser.parse_args() + return args + + +def check_tools(): + '''Checks for required componenets on user system.''' + + logger.info('Checking for required libraries and components on this system') + + bedtools_path = shutil.which("bedtools") + if bedtools_path: + logger.info('Found bedtools: %s', bedtools_path) + else: + logger.error('Missing bedtools') + raise Exception('Missing bedtools') + + +def update_design(design): + '''Update design file for diffBind and remove controls.''' + + logger.info("Running control file update.") + + file_dict = design[['sample_id', 'bam_reads']] \ + .set_index('sample_id').T.to_dict() + + design['control_bam_reads'] = design['control_id'] \ + .apply(lambda x: file_dict[x]['bam_reads']) + + logger.info("Removing rows that are there own control.") + + design = design[design['control_id'] != design['sample_id']] + + logger.info("Removing columns that are there own control.") + + design = design.drop('bam_index', axis=1) + + logger.info("Adding peaks column.") + + design = design.assign(peak='', peak_caller='bed') + + return design + + +def overlap(experiment, design): + '''Calculate the overlap of peaks''' + + logger.info("Determining consenus peaks for experiment %s.", experiment) + + # Output File names + peak_type = 'narrowPeak' + overlapping_peaks_fn = '%s.replicated.%s' % (experiment, peak_type) + rejected_peaks_fn = '%s.rejected.%s' % (experiment, peak_type) + + # Intermediate File names + overlap_tr_fn = 'replicated_tr.%s' % (peak_type) + overlap_pr_fn = 'replicated_pr.%s' % (peak_type) + + # Assign Pooled and Psuedoreplicate peaks + pool_peaks = design.loc[design.replicate == 'pooled', 'peaks'].values[0] + pr1_peaks = design.loc[design.replicate == '1_pr', 'peaks'].values[0] + pr2_peaks = design.loc[design.replicate == '2_pr', 'peaks'].values[0] + + # Remove non true replicate rows + not_replicates = ['1_pr', '2_pr', 'pooled'] + design_true_reps = design[~design['replicate'].isin(not_replicates)] + true_rep_peaks = design_true_reps.peaks.unique() + + # Find overlaps + awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'""" + cut_command = 'cut -f 1-10' + + # Find pooled peaks that overlap Rep1 and Rep2 + # where overlap is defined as the fractional overlap + # with any one of the overlapping peak 
pairs >= 0.5 + + steps_true = ['intersectBed -wo -a %s -b %s' % (pool_peaks, true_rep_peaks[0]), + awk_command, + cut_command, + 'sort -u'] + + iter_true_peaks = iter(true_rep_peaks) + next(iter_true_peaks) + + if len(true_rep_peaks) > 1: + for true_peak in true_rep_peaks[1:]: + steps_true.extend(['intersectBed -wo -a stdin -b %s' % (true_peak), + awk_command, + cut_command, + 'sort -u']) + + out, err = utils.run_pipe(steps_true, outfile=overlap_tr_fn) + print("%d peaks overlap with both true replicates" % + (utils.count_lines(overlap_tr_fn))) + + # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 + # where overlap is defined as the fractional overlap + # with any one of the overlapping peak pairs >= 0.5 + + steps_pseudo = ['intersectBed -wo -a %s -b %s' % (pool_peaks, pr1_peaks), + awk_command, + cut_command, + 'sort -u', + 'intersectBed -wo -a stdin -b %s' % (pr2_peaks), + awk_command, + cut_command, + 'sort -u'] + + out, err = utils.run_pipe(steps_pseudo, outfile=overlap_pr_fn) + print("%d peaks overlap with both pooled pseudoreplicates" + % (utils.count_lines(overlap_pr_fn))) + + # Make union of peak lists + out, err = utils.run_pipe([ + 'cat %s %s' % (overlap_tr_fn, overlap_pr_fn), + 'sort -u' + ], overlapping_peaks_fn) + print("%d peaks overlap with true replicates or with pooled pseudorepliates" + % (utils.count_lines(overlapping_peaks_fn))) + + # Make rejected peak list + out, err = utils.run_pipe([ + 'intersectBed -wa -v -a %s -b %s' % (pool_peaks, overlapping_peaks_fn) + ], rejected_peaks_fn) + print("%d peaks were rejected" % (utils.count_lines(rejected_peaks_fn))) + + # Remove temporary files + os.remove(overlap_tr_fn) + os.remove(overlap_pr_fn) + + return overlapping_peaks_fn + + +def main(): + args = get_args() + design = args.design + files = args.files + + # Create a file handler + handler = logging.FileHandler('consensus_peaks.log') + logger.addHandler(handler) + + # Read files as dataframes + design_peaks_df = pd.read_csv(design, sep='\t') + design_files_df = pd.read_csv(files, sep='\t') + + # Make a design file for + design_diff = update_design(design_files_df) + + # Find consenus overlap peaks for each experiment + for experiment, df_experiment in design_peaks_df.groupby('experiment_id'): + replicated_peak = overlap(experiment, df_experiment) + design_diff.loc[design_diff.experiment_id == experiment, "peak"] = replicated_peak + + # Write out file + design_diff.columns = ['SampleID', + 'bamReads', + 'Condition', + 'Tissue', + 'Factor', + 'Treatment', + 'Replicate', + 'ControlID', + 'bamControl', + 'Peaks', + 'PeakCaller'] + + design_diff.to_csv("design_diffPeaks.tsv", header=True, sep='\t', index=False) + + +if __name__ == '__main__': + main() diff --git a/call_peaks/pool_and_psuedoreplicate.py b/call_peaks/pool_and_psuedoreplicate.py new file mode 100644 index 0000000000000000000000000000000000000000..6c2d0cc35acb60ea8630b953a8f50d42906ff9f0 --- /dev/null +++ b/call_peaks/pool_and_psuedoreplicate.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 + +'''Generate pooled and pseudoreplicate from data.''' + +import argparse +import logging +import pandas as pd +import numpy as np +import os +import sys +sys.path.append(os.path.abspath('../python_utils')) +import utils + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + 
description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-d', '--design', + help="The design file to make experiemnts (tsv format).", + required=True) + + parser.add_argument('-p', '--paired', + help="True/False if paired-end or single end.", + default=False, + action='store_true') + + parser.add_argument('-c', '--cutoff', + help="Cutoff ratio used for choosing controls.", + default=1.2) + + args = parser.parse_args() + return args + + +def check_replicates(design): + '''Check the number of replicates for the experiment.''' + + no_rep = design.shape[0] + + return no_rep + + +def check_controls(design): + '''Check the number of controls for the experiment.''' + + no_controls = len(design.control_tag_align.unique()) + + return no_controls + + +def get_read_count_ratio(design): + '''Get the ratio of read counts for unique controls.''' + + controls = design.control_tag_align.unique() + control_dict = {} + for con in controls: + no_reads = utils.count_lines(con) + control_dict[con] = no_reads + + control_matrix = {c: control_dict for c in controls} + control_df = pd.DataFrame.from_dict(control_matrix) + + control_ratio = control_df.divide(list(control_dict.values()), axis=0) + return control_ratio + + +def pool(tag_files, outfile, paired): + '''Pool files together.''' + + if paired: + file_extension = '.bedpe.gz' + else: + file_extension = '.bedse.gz' + + pooled_filename = outfile + file_extension + + # Merge files + out, err = utils.run_pipe([ + 'gzip -dc %s' % (' '.join(tag_files)), + 'gzip -cn'], outfile=pooled_filename) + + return pooled_filename + + +def self_psuedoreplication(tag_file, prefix, paired): + '''Make 2 self-psuedoreplicates.''' + + # Get total number of reads + no_lines = utils.count_lines(tag_file) + + # Number of lines to split into + lines_per_rep = (no_lines+1)/2 + + # Make an array of number of psuedoreplicatesfile names + pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.bedse.tagAlign.gz' + for r in [0, 1]} + + # Shuffle and split file into equal parts + # by using the input to seed shuf we ensure multiple runs with the same + # input will produce the same output + # Produces two files named splits_prefix0n, n=1,2 + + splits_prefix = 'temp_split' + + out, err = utils.run_pipe([ + 'gzip -dc %s' % (tag_file), + 'shuf --random-source=%s' % (tag_file), + 'split -d -l %d - %s' % (lines_per_rep, splits_prefix)]) + + # Convert read pairs to reads into standard tagAlign file + + for i, index in enumerate([0, 1]): + string_index = '0' + str(index) + steps = ['cat %s' % (splits_prefix + string_index)] + if paired: + steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""]) + steps.extend(['gzip -cn']) + out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i]) + os.remove(splits_prefix + string_index) + + return pseudoreplicate_dict + + +def main(): + args = get_args() + paired = args.paired + design = args.design + cutoff_ratio = args.cutoff + + # Create a file handler + handler = logging.FileHandler('experiment_generation.log') + logger.addHandler(handler) + + # Read files as dataframes + design_df = pd.read_csv(design, sep='\t') + + # Get current directory to build paths + cwd = os.getcwd() + + # Check Number of replicates and replicates + no_reps = check_replicates(design_df) + no_unique_controls = check_controls(design_df) + + if no_reps == 1: + logger.info("No other replicate specified " + "so processing as an unreplicated 
experiment.") + replicated = False + + else: + logger.info("Multiple replicates specified " + "so processing as a replicated experiment.") + replicated = True + + if no_unique_controls == 1 and replicated: + logger.info("Only a single control was specified " + "so using same control for replicates, pool and psuedoreplicates.") + single_control = True + else: + logger.info("Will merge only unique controls for pooled.") + single_control = False + + # Pool the controls for checking + if not single_control: + control_df = get_read_count_ratio(design_df) + control_files = design_df.control_tag_align.unique() + pool_control = pool(control_files, "pool_control", paired) + else: + pool_control = design_df.control_tag_align.unique()[0] + + # Psuedoreplicate and update design accordingly + if not replicated: + + # Duplicate rows and update for pool and psuedoreplicates + experiment_id = design_df.at[0, 'experiment_id'] + replicate = design_df.at[0, 'replicate'] + design_new_df = design_df.loc[np.repeat(design_df.index, 4)].reset_index() + design_new_df['replicate'] = design_new_df['replicate'].astype(str) + design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1' + design_new_df.at[1, 'replicate'] = '1_pr' + design_new_df.at[1, 'xcor'] = 'Calculate' + design_new_df.at[2, 'sample_id'] = experiment_id + '_pr2' + design_new_df.at[2, 'replicate'] = '2_pr' + design_new_df.at[2, 'xcor'] = 'Calculate' + design_new_df.at[3, 'sample_id'] = experiment_id + '_pooled' + design_new_df.at[3, 'replicate'] = 'pooled' + design_new_df.at[3, 'xcor'] = 'Calculate' + + # Make 2 self psuedoreplicates + self_pseudoreplicates_dict = {} + for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']): + replicate_prefix = experiment_id + '_' + str(rep) + self_pseudoreplicates_dict = \ + self_psuedoreplication(tag_file, replicate_prefix, paired) + + # Update design to include new self pseudo replicates + for rep, pseudorep_file in self_pseudoreplicates_dict.items(): + path_to_file = cwd + '/' + pseudorep_file + replicate = rep + 1 + design_new_df.loc[replicate, 'tag_align'] = path_to_file + + # Drop index column + design_new_df.drop(labels='index', axis=1, inplace=True) + + else: + # Make pool of replicates + replicate_files = design_df.tag_align.unique() + experiment_id = design_df.at[0, 'experiment_id'] + pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired) + + # Make self psuedoreplicates equivalent to number of replicates + pseudoreplicates_dict = {} + for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']): + replicate_prefix = experiment_id + '_' + str(rep) + pr_dict = self_psuedoreplication(tag_file, replicate_prefix, paired) + pseudoreplicates_dict[rep] = pr_dict + + # Merge self psuedoreplication for each true replicate + pseudoreplicates_df = pd.DataFrame.from_dict(pseudoreplicates_dict) + pool_pseudoreplicates_dict = {} + for index, row in pseudoreplicates_df.iterrows(): + replicate_id = index + 1 + pr_filename = experiment_id + ".pr" + str(replicate_id) + '.bedse.gz' + pool_replicate = pool(row, pr_filename, False) + pool_pseudoreplicates_dict[replicate_id] = pool_replicate + + design_new_df = design_df + # Check controls against cutoff_ratio + # if so replace with pool_control + # unless single control was used + + if not single_control: + path_to_pool_control = cwd + '/' + pool_control + if control_df.values.max() > 1.2: + logger.info("Number of reads in controls differ by " + + " > factor of %f. Using pooled controls." 
% (cutoff_ratio)) + design_new_df['control_tag_align'] = path_to_pool_control + else: + for index, row in design_new_df.iterrows(): + exp_no_reads = utils.count_lines(row['tag_align']) + con_no_reads = utils.count_lines(row['control_tag_align']) + if con_no_reads < exp_no_reads: + logger.info("Fewer reads in control than experiment." + + "Using pooled controls for replicate %s." + % row['replicate']) + design_new_df.loc[index, 'control_tag_align'] = \ + path_to_pool_control + else: + path_to_pool_control = pool_control + + # Add in pseudo replicates + tmp_metadata = design_new_df.loc[0].copy() + tmp_metadata['control_tag_align'] = path_to_pool_control + for rep, pseudorep_file in pool_pseudoreplicates_dict.items(): + tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep) + tmp_metadata['replicate'] = str(rep) + '_pr' + tmp_metadata['xcor'] = 'Calculate' + path_to_file = cwd + '/' + pseudorep_file + tmp_metadata['tag_align'] = path_to_file + design_new_df = design_new_df.append(tmp_metadata) + + # Add in pool experiment + tmp_metadata['sample_id'] = experiment_id + '_pooled' + tmp_metadata['replicate'] = 'pooled' + tmp_metadata['xcor'] = 'Calculate' + path_to_file = cwd + '/' + pool_experiment + tmp_metadata['tag_align'] = path_to_file + design_new_df = design_new_df.append(tmp_metadata) + + # Write out new dataframe + design_new_df.to_csv(experiment_id + '_ppr.tsv', + header=True, sep='\t', index=False) + + +if __name__ == '__main__': + main() diff --git a/cross_correlation/xcor.py b/cross_correlation/xcor.py new file mode 100644 index 0000000000000000000000000000000000000000..672fdbb4f14bc8edc64c018e6991a8c7b0d47be2 --- /dev/null +++ b/cross_correlation/xcor.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 + +'''Compute cross-correlation analysis.''' + +import os +import argparse +import shutil +import logging +from multiprocessing import cpu_count +import sys +sys.path.append(os.path.abspath('../python_utils')) +import utils + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-t', '--tag', + help="The tagAlign file to qc on.", + required=True) + + parser.add_argument('-p', '--paired', + help="True/False if paired-end or single end.", + default=False, + action='store_true') + + args = parser.parse_args() + return args + +# Functions + + +def check_tools(): + '''Checks for required componenets on user system''' + + logger.info('Checking for required libraries and components on this system') + + r_path = shutil.which("R") + if r_path: + logger.info('Found R: %s', r_path) + else: + logger.error('Missing R') + raise Exception('Missing R') + + +def xcor(tag, paired): + '''Use spp to calculate cross-correlation stats.''' + + tag_basename = os.path.basename(utils.strip_extensions(tag, ['.gz'])) + + uncompressed_tag_filename = tag_basename + out, err = utils.run_pipe([ + 'gzip -dc %s > ' % (tag)], outfile=uncompressed_tag_filename) + + # Subsample tagAlign file + numReads = 15000000 + + subsampled_tag_filename = \ + tag_basename + ".%d.tagAlign.gz" % (numReads/1000000) + steps = [ + 'grep -v "chrM" %s' % (uncompressed_tag_filename), + 'shuf -n %d --random-source=%s' % (numReads, uncompressed_tag_filename) + ] + if paired: + 
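+        # paired-end input: reset the name and score columns to "N" / "1000",
+        # mirroring the tagAlign fix-up done in convert_reads.py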
steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""]) + + steps.extend(['gzip -nc']) + + out, err = utils.run_pipe(steps, outfile=subsampled_tag_filename) + + # Calculate Cross-correlation QC scores + cc_scores_filename = subsampled_tag_filename + ".cc.qc" + cc_plot_filename = subsampled_tag_filename + ".cc.plot.pdf" + + # CC_SCORE FILE format + # Filename <tab> + # numReads <tab> + # estFragLen <tab> + # corr_estFragLen <tab> + # PhantomPeak <tab> + # corr_phantomPeak <tab> + # argmin_corr <tab> + # min_corr <tab> + # phantomPeakCoef <tab> + # relPhantomPeakCoef <tab> + # QualityTag + + run_spp_command = shutil.which("run_spp.R") + out, err = utils.run_pipe([ + "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s" % + (run_spp_command, subsampled_tag_filename, cpu_count(), + cc_plot_filename, cc_scores_filename) + ]) + + return cc_scores_filename + + +def main(): + args = get_args() + paired = args.paired + tag = args.tag + + # Create a file handler + handler = logging.FileHandler('xcor.log') + logger.addHandler(handler) + + # Check if tools are present + check_tools() + + # Calculate Cross-correlation + xcor_filename = xcor(tag, paired) + + +if __name__ == '__main__': + main() diff --git a/design_file/check_design.py b/design_file/check_design.py new file mode 100644 index 0000000000000000000000000000000000000000..913e766c1663f79fcec17086c1879c6c184569f5 --- /dev/null +++ b/design_file/check_design.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 + +'''Check if design file is correctly formatted and matches files list.''' + +import argparse +import logging +import pandas as pd + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-d', '--design', + help="The design file to run QC (tsv format).", + required=True) + + parser.add_argument('-f', '--fastq', + help="File with list of fastq files (tsv format).", + required=True) + + parser.add_argument('-p', '--paired', + help="True/False if paired-end or single end.", + default=False, + action='store_true') + + args = parser.parse_args() + return args + + +def check_design_headers(design, paired): + '''Check if design file conforms to sequencing type.''' + + # Default headers + design_template = [ + 'sample_id', + 'experiment_id', + 'biosample', + 'factor', + 'treatment', + 'replicate', + 'control_id', + 'fastq_read1'] + + design_headers = list(design.columns.values) + + if paired: # paired-end data + design_template.extend(['fastq_read2']) + + # Check if headers + logger.info("Running header check.") + + missing_headers = set(design_template) - set(design_headers) + + if len(missing_headers) > 0: + logger.error('Missing column headers: %s', list(missing_headers)) + raise Exception("Missing column headers: %s" % list(missing_headers)) + + +def check_controls(design): + '''Check if design file has the correct control mapping.''' + + logger.info("Running control check.") + + missing_controls = set(design['control_id']) - set(design['sample_id']) + + if len(missing_controls) > 0: + logger.error('Missing control experiments: %s', list(missing_controls)) + raise Exception("Missing control experiments: %s" % + list(missing_controls)) + + +def check_replicates(design): + '''Check 
if design file has unique replicate numbersfor an experiment.''' + + logger.info("Running replicate check.") + + experiment_replicates = design.groupby('experiment_id')['replicate'] \ + .apply(list) + + duplicated_replicates = [] + for experiment in experiment_replicates.index.values: + replicates = experiment_replicates[experiment] + unique_replicates = set(replicates) + if len(replicates) != len(unique_replicates): + duplicated_replicates.append(experiment) + + if len(duplicated_replicates) > 0: + logger.error('Duplicate replicates in experiments: %s', list(duplicated_replicates)) + raise Exception("Duplicate replicates in experiments: %s" % + list(duplicated_replicates)) + + +def check_files(design, fastq, paired): + '''Check if design file has the files found.''' + + logger.info("Running file check.") + + if paired: # paired-end data + files = list(design['fastq_read1']) + list(design['fastq_read2']) + else: # single-end data + files = design['fastq_read1'] + + files_found = fastq['name'] + + missing_files = set(files) - set(files_found) + + if len(missing_files) > 0: + logger.error('Missing files from design file: %s', list(missing_files)) + raise Exception("Missing files from design file: %s" % + list(missing_files)) + else: + file_dict = fastq.set_index('name').T.to_dict() + + design['fastq_read1'] = design['fastq_read1'] \ + .apply(lambda x: file_dict[x]['path']) + if paired: # paired-end data + design['fastq_read2'] = design['fastq_read2'] \ + .apply(lambda x: file_dict[x]['path']) + return design + + +def main(): + args = get_args() + design = args.design + fastq = args.fastq + paired = args.paired + + # Create a file handler + handler = logging.FileHandler('design.log') + logger.addHandler(handler) + + # Read files as dataframes + design_df = pd.read_csv(design, sep='\t') + fastq_df = pd.read_csv(fastq, sep='\t', names=['name', 'path']) + + # Check design file + check_design_headers(design_df, paired) + check_controls(design_df) + check_replicates(design_df) + new_design_df = check_files(design_df, fastq_df, paired) + + # Write out new design file + new_design_df.to_csv('design.tsv', header=True, sep='\t', index=False) + + +if __name__ == '__main__': + main() diff --git a/design_file/experiment_design.py b/design_file/experiment_design.py new file mode 100644 index 0000000000000000000000000000000000000000..f527b46d32f3dda20be3c84a4a8db822e9cb7310 --- /dev/null +++ b/design_file/experiment_design.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +'''Generate experiment design files for downstream processing.''' + +import argparse +import logging +import pandas as pd + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-d', '--design', + help="The design file to make experiemnts (tsv format).", + required=True) + + args = parser.parse_args() + return args + + +def update_controls(design): + '''Update design file to append controls list.''' + + logger.info("Running control file update.") + + file_dict = design[['sample_id', 'tag_align']] \ + .set_index('sample_id').T.to_dict() + + design['control_tag_align'] = design['control_id'] \ + .apply(lambda x: file_dict[x]['tag_align']) + + logger.info("Removing rows that are there own 
control.") + + design = design[design['control_id'] != design['sample_id']] + + return design + + +def make_experiment_design(design): + '''Make design file by grouping for each experiment''' + + logger.info("Running experiment design generation.") + + for experiment, df_experiment in design.groupby('experiment_id'): + experiment_file = experiment + '.tsv' + df_experiment.to_csv(experiment_file, header=True, sep='\t', index=False) + + +def main(): + args = get_args() + design = args.design + + # Create a file handler + handler = logging.FileHandler('experiment_generation.log') + logger.addHandler(handler) + + # Read files as dataframes + design_df = pd.read_csv(design, sep='\t') + + # Update design file for check_controls + new_design_df = update_controls(design_df) + + # write out experiment design files + make_experiment_design(new_design_df) + + +if __name__ == '__main__': + main() diff --git a/preproc_fastq/trim_reads.py b/preproc_fastq/trim_reads.py new file mode 100644 index 0000000000000000000000000000000000000000..036c5f700eb61011cce4008230b60c634ea1c582 --- /dev/null +++ b/preproc_fastq/trim_reads.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 + +'''Trim low quality reads and remove sequences less than 35 base pairs.''' + +import subprocess +import argparse +import shutil +import logging + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-f', '--fastq', + help="The fastq file to run triming on.", + nargs='+', + required=True) + + parser.add_argument('-p', '--paired', + help="True/False if paired-end or single end.", + default=False, + action='store_true') + + args = parser.parse_args() + return args + + +def check_tools(): + '''Checks for required componenets on user system.''' + + logger.info('Checking for required libraries and components on this system') + + trimgalore_path = shutil.which("trim_galore") + if trimgalore_path: + logger.info('Found trimgalore: %s', trimgalore_path) + else: + logger.error('Missing trimgalore') + raise Exception('Missing trimgalore') + + cutadapt_path = shutil.which("cutadapt") + if cutadapt_path: + logger.info('Found cutadapt: %s', cutadapt_path) + else: + logger.error('Missing cutadapt') + raise Exception('Missing cutadapt') + + +def trim_reads(fastq, paired): + '''Run trim_galore on 1 or 2 files.''' + + if paired: # paired-end data + trim_params = '--paired -q 25 --illumina --gzip --length 35' + trim_command = "trim_galore %s %s %s " \ + % (trim_params, fastq[0], fastq[1]) + else: + trim_params = '-q 25 --illumina --gzip --length 35' + trim_command = "trim_galore %s %s " \ + % (trim_params, fastq[0]) + + logger.info("Running trim_galore with %s", trim_command) + + trim = subprocess.Popen(trim_command, shell=True) + out, err = trim.communicate() + + +def main(): + args = get_args() + fastq = args.fastq + paired = args.paired + + # Create a file handler + handler = logging.FileHandler('trim.log') + logger.addHandler(handler) + + # Check if tools are present + check_tools() + + # Run trim_reads + trim_reads(fastq, paired) + + +if __name__ == '__main__': + main() diff --git a/python_utils/utils.py b/python_utils/utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..a50167367eec13fc0548bfbffbfa49fa8db58c60 --- /dev/null +++ b/python_utils/utils.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 + +'''General utilities.''' + + +import shlex +import logging +import subprocess +import sys +import os + + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = True + + +def run_pipe(steps, outfile=None): + # TODO: capture stderr + from subprocess import Popen, PIPE + p = None + p_next = None + first_step_n = 1 + last_step_n = len(steps) + for n, step in enumerate(steps, start=first_step_n): + logger.debug("step %d: %s" % (n, step)) + if n == first_step_n: + if n == last_step_n and outfile: # one-step pipeline with outfile + with open(outfile, 'w') as fh: + print("one step shlex: %s to file: %s" % (shlex.split(step), outfile)) + p = Popen(shlex.split(step), stdout=fh) + break + print("first step shlex to stdout: %s" % (shlex.split(step))) + + p = Popen(shlex.split(step), stdout=PIPE) + elif n == last_step_n and outfile: # only treat the last step specially if you're sending stdout to a file + with open(outfile, 'w') as fh: + print("last step shlex: %s to file: %s" % (shlex.split(step), outfile)) + p_last = Popen(shlex.split(step), stdin=p.stdout, stdout=fh) + p.stdout.close() + p = p_last + else: # handles intermediate steps and, in the case of a pipe to stdout, the last step + print("intermediate step %d shlex to stdout: %s" % (n, shlex.split(step))) + p_next = Popen(shlex.split(step), stdin=p.stdout, stdout=PIPE) + p.stdout.close() + p = p_next + out, err = p.communicate() + return out, err + + +def block_on(command): + process = subprocess.Popen(shlex.split(command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE) + for line in iter(process.stdout.readline, b''): + sys.stdout.write(line.decode('utf-8')) + process.communicate() + return process.returncode + + +def strip_extensions(filename, extensions): + '''Strips extensions to get basename of file.''' + + basename = filename + for extension in extensions: + basename = basename.rpartition(extension)[0] or basename + + return basename + + +def count_lines(filename): + import mimetypes + compressed_mimetypes = [ + "compress", + "bzip2", + "gzip" + ] + mime_type = mimetypes.guess_type(filename)[1] + if mime_type in compressed_mimetypes: + catcommand = 'gzip -dc' + else: + catcommand = 'cat' + out, err = run_pipe([ + '%s %s' % (catcommand, filename), + 'wc -l' + ]) + return int(out) + + +def rescale_scores(filename, scores_col, new_min=10, new_max=1000): + sorted_fn = 'sorted-%s' % (filename) + rescaled_fn = 'rescaled-%s' % (filename) + + out, err = run_pipe([ + 'sort -k %dgr,%dgr %s' % (scores_col, scores_col, filename), + r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""], + sorted_fn) + + out, err = run_pipe([ + 'head -n 1 %s' % (sorted_fn), + 'cut -f %s' % (scores_col)]) + max_score = float(out.strip()) + logger.info("rescale_scores: max_score = %s" % (max_score)) + + out, err = run_pipe([ + 'tail -n 1 %s' % (sorted_fn), + 'cut -f %s' % (scores_col)]) + min_score = float(out.strip()) + logger.info("rescale_scores: min_score = %s" % (min_score)) + + a = min_score + b = max_score + x = new_min + y = new_max + + if min_score == max_score: # give all peaks new_min + rescale_formula = "x" + else: # n is the unscaled score from scores_col + rescale_formula = "((n-a)*(y-x)/(b-a))+x" + + out, err = run_pipe( + [ + 'cat %s' % (sorted_fn), + r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}""" + % (scores_col, a, b, x, y) 
+ + r"""{$%d=int(%s) ; print $0}'""" + % (scores_col, rescale_formula) + ], + rescaled_fn) + + os.remove(sorted_fn) + + return rescaled_fn diff --git a/quality_metrics/experiment_qc.py b/quality_metrics/experiment_qc.py new file mode 100644 index 0000000000000000000000000000000000000000..bdad4e97bad160ee09baa7f80ad41f87723c9d4a --- /dev/null +++ b/quality_metrics/experiment_qc.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 + +'''Experiment correlation and enrichment of reads over genome-wide bins.''' + + +import argparse +import logging +import subprocess +import shutil +from multiprocessing import cpu_count +import pandas as pd + + +EPILOG = ''' +For more details: + %(prog)s --help +''' + +# SETTINGS + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) +logger.propagate = False +logger.setLevel(logging.INFO) + + +def get_args(): + '''Define arguments.''' + + parser = argparse.ArgumentParser( + description=__doc__, epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-d', '--design', + help="The design file to run QC (tsv format).", + required=True) + + args = parser.parse_args() + return args + + +def check_tools(): + '''Checks for required componenets on user system.''' + + logger.info('Checking for required libraries and components on this system') + + deeptools_path = shutil.which("deeptools") + if deeptools_path: + logger.info('Found deeptools: %s', deeptools_path) + else: + logger.error('Missing deeptools') + raise Exception('Missing deeptools') + + +def generate_read_summary(design): + '''Generate summary of data based on read counts.''' + + bam_files = ' '.join(design['bam_reads']) + labels = ' '.join(design['sample_id']) + mbs_filename = 'sample_mbs.npz' + + mbs_command = ( + "multiBamSummary bins -p %d --bamfiles %s --labels %s -out %s" + % (cpu_count(), bam_files, labels, mbs_filename) + ) + + logger.info("Running deeptools with %s", mbs_command) + + read_summary = subprocess.Popen(mbs_command, shell=True) + out, err = read_summary.communicate() + + return mbs_filename + + +def check_correlation(mbs): + '''Check Spearman pairwise correlation of samples based on read counts.''' + + spearman_filename = 'heatmap_SpearmanCorr.png' + spearman_params = "--corMethod spearman --skipZero" + \ + " --plotTitle \"Spearman Correlation of Read Counts\"" + \ + " --whatToPlot heatmap --colorMap RdYlBu --plotNumbers" + + spearman_command = ( + "plotCorrelation -in %s %s -o %s" + % (mbs, spearman_params, spearman_filename) + ) + + logger.info("Running deeptools with %s", spearman_command) + + spearman_correlation = subprocess.Popen(spearman_command, shell=True) + out, err = spearman_correlation.communicate() + + +def check_coverage(design): + '''Asses the sequencing depth of samples.''' + + bam_files = ' '.join(design['bam_reads']) + labels = ' '.join(design['sample_id']) + coverage_filename = 'coverage.png' + coverage_params = "-n 1000000 --plotTitle \"Sample Coverage\"" + \ + " --ignoreDuplicates --minMappingQuality 10" + + coverage_command = ( + "plotCoverage -b %s --labels %s %s --plotFile %s" + % (bam_files, labels, coverage_params, coverage_filename) + ) + + logger.info("Running deeptools with %s", coverage_command) + + coverage_summary = subprocess.Popen(coverage_command, shell=True) + out, err = coverage_summary.communicate() + + +def update_controls(design): + '''Update design file to append controls list.''' + + logger.info("Running control file update.") + + file_dict = design[['sample_id', 'bam_reads']] \ + 
+
+
+def update_controls(design):
+    '''Update the design file to append the control reads for each sample.'''
+
+    logger.info("Running control file update.")
+
+    file_dict = design[['sample_id', 'bam_reads']] \
+        .set_index('sample_id').T.to_dict()
+
+    design['control_reads'] = design['control_id'] \
+        .apply(lambda x: file_dict[x]['bam_reads'])
+
+    logger.info("Removing rows that are their own control.")
+
+    design = design[design['control_id'] != design['sample_id']]
+
+    return design
+
+
+def check_enrichment(sample_id, control_id, sample_reads, control_reads):
+    '''Assess the enrichment per sample.'''
+
+    fingerprint_filename = sample_id + '_fingerprint.png'
+
+    fingerprint_command = (
+        "plotFingerprint -b %s %s --labels %s %s --plotFile %s"
+        % (sample_reads, control_reads, sample_id, control_id, fingerprint_filename)
+    )
+
+    logger.info("Running deeptools with %s", fingerprint_command)
+
+    fingerprint_summary = subprocess.Popen(fingerprint_command, shell=True)
+    out, err = fingerprint_summary.communicate()
+
+
+def main():
+    args = get_args()
+    design = args.design
+
+    # Create a file handler
+    handler = logging.FileHandler('experiment_qc.log')
+    logger.addHandler(handler)
+
+    # Check if tools are present
+    check_tools()
+
+    # Read files
+    design_df = pd.read_csv(design, sep='\t')
+
+    # Run correlation
+    mbs_filename = generate_read_summary(design_df)
+    check_correlation(mbs_filename)
+
+    # Run coverage
+    check_coverage(design_df)
+
+    # Run enrichment
+    new_design_df = update_controls(design_df)
+    for index, row in new_design_df.iterrows():
+        check_enrichment(
+            row['sample_id'],
+            row['control_id'],
+            row['bam_reads'],
+            row['control_reads'])
+
+
+if __name__ == '__main__':
+    main()
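
For illustration, a sketch of what update_controls() above does to a toy design (the sample IDs and BAM paths are hypothetical, and update_controls is assumed to be in scope):

    import pandas as pd

    design = pd.DataFrame({
        'sample_id':  ['H3K4me3_1', 'Input_1'],
        'control_id': ['Input_1', 'Input_1'],
        'bam_reads':  ['H3K4me3_1.bam', 'Input_1.bam']})

    new_design = update_controls(design)
    # H3K4me3_1 now carries control_reads='Input_1.bam'; the Input_1 row is
    # dropped because it is its own control, so plotFingerprint runs once,
    # for H3K4me3_1 against Input_1.
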
diff --git a/quality_metrics/xcor.py b/quality_metrics/xcor.py
new file mode 100644
index 0000000000000000000000000000000000000000..672fdbb4f14bc8edc64c018e6991a8c7b0d47be2
--- /dev/null
+++ b/quality_metrics/xcor.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+
+'''Compute cross-correlation analysis.'''
+
+import os
+import argparse
+import shutil
+import logging
+from multiprocessing import cpu_count
+import sys
+sys.path.append(os.path.abspath('../python_utils'))
+import utils
+
+EPILOG = '''
+For more details:
+        %(prog)s --help
+'''
+
+# SETTINGS
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.NullHandler())
+logger.propagate = False
+logger.setLevel(logging.INFO)
+
+
+def get_args():
+    '''Define arguments.'''
+
+    parser = argparse.ArgumentParser(
+        description=__doc__, epilog=EPILOG,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-t', '--tag',
+                        help="The tagAlign file to QC on.",
+                        required=True)
+
+    parser.add_argument('-p', '--paired',
+                        help="True/False if paired-end or single end.",
+                        default=False,
+                        action='store_true')
+
+    args = parser.parse_args()
+    return args
+
+# Functions
+
+
+def check_tools():
+    '''Checks for required components on user system.'''
+
+    logger.info('Checking for required libraries and components on this system')
+
+    r_path = shutil.which("R")
+    if r_path:
+        logger.info('Found R: %s', r_path)
+    else:
+        logger.error('Missing R')
+        raise Exception('Missing R')
+
+
+def xcor(tag, paired):
+    '''Use spp to calculate cross-correlation stats.'''
+
+    tag_basename = os.path.basename(utils.strip_extensions(tag, ['.gz']))
+
+    # Decompress the tagAlign; run_pipe writes the step's stdout to the outfile
+    uncompressed_tag_filename = tag_basename
+    out, err = utils.run_pipe([
+        'gzip -dc %s' % (tag)], outfile=uncompressed_tag_filename)
+
+    # Subsample tagAlign file
+    num_reads = 15000000
+
+    subsampled_tag_filename = \
+        tag_basename + ".%d.tagAlign.gz" % (num_reads // 1000000)
+    steps = [
+        'grep -v "chrM" %s' % (uncompressed_tag_filename),
+        'shuf -n %d --random-source=%s' % (num_reads, uncompressed_tag_filename)
+    ]
+    if paired:  # reset the name and score fields to the single-end tagAlign convention
+        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
+
+    steps.extend(['gzip -nc'])
+
+    out, err = utils.run_pipe(steps, outfile=subsampled_tag_filename)
+
+    # Calculate cross-correlation QC scores
+    cc_scores_filename = subsampled_tag_filename + ".cc.qc"
+    cc_plot_filename = subsampled_tag_filename + ".cc.plot.pdf"
+
+    # CC_SCORE FILE format
+    # Filename <tab>
+    # numReads <tab>
+    # estFragLen <tab>
+    # corr_estFragLen <tab>
+    # PhantomPeak <tab>
+    # corr_phantomPeak <tab>
+    # argmin_corr <tab>
+    # min_corr <tab>
+    # phantomPeakCoef <tab>
+    # relPhantomPeakCoef <tab>
+    # QualityTag
+
+    run_spp_command = shutil.which("run_spp.R")
+    out, err = utils.run_pipe([
+        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s" %
+        (run_spp_command, subsampled_tag_filename, cpu_count(),
+         cc_plot_filename, cc_scores_filename)
+    ])
+
+    return cc_scores_filename
+
+
+def main():
+    args = get_args()
+    paired = args.paired
+    tag = args.tag
+
+    # Create a file handler
+    handler = logging.FileHandler('xcor.log')
+    logger.addHandler(handler)
+
+    # Check if tools are present
+    check_tools()
+
+    # Calculate cross-correlation
+    xcor_filename = xcor(tag, paired)
+
+
+if __name__ == '__main__':
+    main()
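
The .cc.qc file written by run_spp.R is a single tab-separated record in the field order listed in the comment block above (phantomPeakCoef and relPhantomPeakCoef are commonly reported as NSC and RSC). A minimal parsing sketch; read_cc_scores is a hypothetical helper, and estFragLen is assumed to possibly hold several comma-separated estimates:

    def read_cc_scores(cc_scores_filename):
        '''Parse the one-line cross-correlation QC record into a dict.'''
        with open(cc_scores_filename) as fh:
            fields = fh.readline().strip().split('\t')
        return {
            'filename': fields[0],
            'num_reads': int(fields[1]),
            'est_frag_len': int(fields[2].split(',')[0]),  # keep the first estimate
            'phantom_peak_coef': float(fields[8]),     # NSC
            'rel_phantom_peak_coef': float(fields[9]),  # RSC
            'quality_tag': int(fields[10]),
        }
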