#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Generate peaks from data.'''
import os
import argparse
import shutil
import subprocess
import logging
import utils
from xcor import xcor as calculate_xcor
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-t', '--tag',
help="The tagAlign file to perform peak calling on.",
required=True)
parser.add_argument('-x', '--xcor',
help="The cross-correlation file (if already calculated).",
required=True)
parser.add_argument('-c', '--con',
help="The control tagAling file used for peak calling.",
required=True)
parser.add_argument('-s', '--sample',
help="The sample id to name the peak files.",
required=True)
parser.add_argument('-g', '--genome',
help="The genome size of reference genome.",
required=True)
parser.add_argument('-z', '--size',
help="The file with chromosome sizes of reference genome.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required components on user system'''
logger.info('Checking for required libraries and components on this system')
macs_path = shutil.which("macs2")
if macs_path:
logger.info('Found MACS2: %s', macs_path)
# Get Version
macs_version_command = "macs2 --version"
macs_version = subprocess.check_output(macs_version_command, shell=True, stderr=subprocess.STDOUT)
# Write to file
macs_file = open("version_macs.txt", "wb")
macs_file.write(macs_version)
macs_file.close()
else:
logger.error('Missing MACS2')
raise Exception('Missing MACS2')
bg_bw_path = shutil.which("bedGraphToBigWig")
if bg_bw_path:
logger.info('Found bedGraphToBigWig: %s', bg_bw_path)
# Get Version
bg_bw_version_command = "bedGraphToBigWig"
try:
bg_bw_version = subprocess.check_output(bg_bw_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
# bedGraphToBigWig prints its usage and exits non-zero when run with no arguments
bg_bw_version = e.output
# Write to file
bg_bw_file = open("version_bedGraphToBigWig.txt", "wb")
bg_bw_file.write(bg_bw_version)
bg_bw_file.close()
else:
logger.error('Missing bedGraphToBigWig')
raise Exception('Missing bedGraphToBigWig')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes):
'''Call peaks and signal tracks'''
# Extract the fragment length estimate from column 3 of the
# cross-correlation scores file
with open(xcor, 'r') as xcor_fh:
firstline = xcor_fh.readline()
frag_lengths = firstline.split()[2] # third column
frag_lengths_array = frag_lengths.split(',')
fragment_length = 0
fragment = False
# Loop through all values of fragment length
for fragment_length in frag_lengths_array:
logger.info("Fraglen %s", fragment_length)
if int(fragment_length) > 50:
fragment = True
break
if not fragment:
logger.info('Error in cross-correlation analysis: %s', frag_lengths_array)
raise Exception("Error in cross-correlation analysis: %s" % frag_lengths_array)
# Generate narrow peaks and preliminary signal tracks
command = 'macs2 callpeak ' + \
'-t %s -c %s ' % (experiment, control) + \
'-f BED -n %s ' % (prefix) + \
'-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' % (genome_size, fragment_length)
logger.info(command)
returncode = utils.block_on(command)
logger.info("MACS2 exited with returncode %d", returncode)
assert returncode == 0, "MACS2 non-zero return"
# MACS2 sometimes calls features off the end of chromosomes.
# Remove coordinates outside chromosome sizes
int_narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
narrowpeak_fn = '%s.narrowPeak' % (prefix)
clipped_narrowpeak_fn = 'clipped-%s' % (narrowpeak_fn)
steps = ['slopBed -i %s -g %s -b 0' % (int_narrowpeak_fn, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]
out, err = utils.run_pipe(steps)
# Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
# (score must be <1000)
rescaled_narrowpeak_fn = utils.rescale_scores(
clipped_narrowpeak_fn, scores_col=5)
# Sort by Col8 in descending order and replace long peak names in Column 4
# with Peak_<peakRank>
steps = [
'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'"""
]
out, err = utils.run_pipe(steps, '%s' % (narrowpeak_fn))
# For fold enrichment signal tracks
# The chromosome sizes file is tab delimited with 2 columns: Col1 (chromosome name),
# Col2 (chromosome size in bp).
command = 'macs2 bdgcmp ' + \
'-t %s_treat_pileup.bdg ' % (prefix) + \
'-c %s_control_lambda.bdg ' % (prefix) + \
'-o %s_FE.bdg ' % (prefix) + \
'-m FE'
logger.info(command)
returncode = utils.block_on(command)
logger.info("MACS2 exited with returncode %d", returncode)
assert returncode == 0, "MACS2 non-zero return"
# Remove coordinates outside chromosome sizes (MACS2 bug)
fc_bedgraph_fn = '%s.fc.signal.bedgraph' % (prefix)
fc_bedgraph_sorted_fn = 'sorted-%s' % (fc_bedgraph_fn)
fc_signal_fn = "%s.fc_signal.bw" % (prefix)
steps = ['slopBed -i %s_FE.bdg -g %s -b 0' % (prefix, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)]
out, err = utils.run_pipe(steps)
# Sort file
out, err = utils.run_pipe([
'bedSort %s %s' % (fc_bedgraph_fn, fc_bedgraph_sorted_fn)])
# Convert bedgraph to bigwig
command = 'bedGraphToBigWig ' + \
'%s ' % (fc_bedgraph_sorted_fn) + \
'%s ' % (chrom_sizes) + \
'%s' % (fc_signal_fn)
logger.info(command)
returncode = utils.block_on(command)
logger.info("bedGraphToBigWig exited with returncode %d", returncode)
assert returncode == 0, "bedGraphToBigWig non-zero return"
# For -log10(p-value) signal tracks
# Compute sval =
# min(no. of reads in ChIP, no. of reads in control) / 1,000,000
out, err = utils.run_pipe(['gzip -dc %s' % (experiment), 'wc -l'])
chip_reads = out.strip()
out, err = utils.run_pipe(['gzip -dc %s' % (control), 'wc -l'])
control_reads = out.strip()
sval = str(min(float(chip_reads), float(control_reads)) / 1000000)
logger.info("chip_reads = %s, control_reads = %s, sval = %s" %
(chip_reads, control_reads, sval))
command = 'macs2 bdgcmp ' + \
'-t %s_treat_pileup.bdg ' % (prefix) + \
'-c %s_control_lambda.bdg ' % (prefix) + \
'-o %s_ppois.bdg ' % (prefix) + \
'-m ppois -S %s' % (sval)
logger.info(command)
returncode = utils.block_on(command)
assert returncode == 0, "MACS2 non-zero return"
# Remove coordinates outside chromosome sizes (MACS2 bug)
pvalue_bedgraph_fn = '%s.pval.signal.bedgraph' % (prefix)
pvalue_bedgraph_sorted_fn = 'sort-%s' % (pvalue_bedgraph_fn)
pvalue_signal_fn = "%s.pvalue_signal.bw" % (prefix)
steps = ['slopBed -i %s_ppois.bdg -g %s -b 0' % (prefix, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)]
out, err = utils.run_pipe(steps)
# Sort file
out, err = utils.run_pipe([
'bedSort %s %s' % (pvalue_bedgraph_fn, pvalue_bedgraph_sorted_fn)])
# Convert bedgraph to bigwig
command = 'bedGraphToBigWig ' + \
'%s ' % (pvalue_bedgraph_sorted_fn) + \
'%s ' % (chrom_sizes) + \
'%s' % (pvalue_signal_fn)
logger.info(command)
returncode = utils.block_on(command)
logger.info("bedGraphToBigWig exited with returncode %d", returncode)
assert returncode == 0, "bedGraphToBigWig non-zero return"
# Remove temporary files
os.remove(clipped_narrowpeak_fn)
os.remove(rescaled_narrowpeak_fn)
os.remove(int_narrowpeak_fn)
def main():
args = get_args()
tag = args.tag
xcor = args.xcor
con = args.con
sample = args.sample
genome_size = args.genome
chrom_size = args.size
paired = args.paired
# Create a file handler
handler = logging.FileHandler('call_peaks.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Calculate cross-correlation if not already calculated
if xcor == 'Calculate':
xcor_file = calculate_xcor(tag, paired)
else:
xcor_file = xcor
# Call Peaks using MACS2
call_peaks_macs(tag, xcor_file, con, sample, genome_size, chrom_size)
if __name__ == '__main__':
main()
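For reference, the fragment-length selection above scans the comma-separated estimates in column 3 of the cross-correlation scores file and keeps the first one greater than 50 bp. A minimal standalone sketch of that rule; the input line here is fabricated, not real xcor output:

# Minimal sketch of the fragment-length selection; the line below is fabricated.
xcor_line = "sample.tagAlign.gz\t10000000\t40,185,200"

frag_lengths = xcor_line.split()[2]  # third column, comma-separated estimates
fragment_length = next((f for f in frag_lengths.split(',') if int(f) > 50), None)
if fragment_length is None:
    raise ValueError("Error in cross-correlation analysis: %s" % frag_lengths)
print(fragment_length)  # -> 185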
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Check if design file is correctly formatted and matches files list.'''
import argparse
import logging
import pandas as pd
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file to run QC (tsv format).",
required=True)
parser.add_argument('-f', '--fastq',
help="File with list of fastq files (tsv format).",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
def check_design_headers(design, paired):
'''Check if design file conforms to sequencing type.'''
# Default headers
design_template = [
'sample_id',
'experiment_id',
'biosample',
'factor',
'treatment',
'replicate',
'control_id',
'fastq_read1']
design_headers = list(design.columns.values)
if paired: # paired-end data
design_template.extend(['fastq_read2'])
# Check for missing headers
logger.info("Running header check.")
missing_headers = set(design_template) - set(design_headers)
if len(missing_headers) > 0:
logger.error('Missing column headers: %s', list(missing_headers))
raise Exception("Missing column headers: %s" % list(missing_headers))
def check_samples(design):
'''Check if design file has the correct sample name mapping.'''
logger.info("Running sample check.")
samples = design.groupby('sample_id') \
.apply(list)
malformed_samples = []
chars = set('-.')
for sample in samples.index.values:
if any(char.isspace() for char in sample) or any(char in chars for char in sample):
malformed_samples.append(sample)
if len(malformed_samples) > 0:
logger.error('Malformed samples from design file: %s', list(malformed_samples))
raise Exception("Malformed samples from design file: %s" %
list(malformed_samples))
def check_experiments(design):
'''Check if design file has the correct experiment name mapping.'''
logger.info("Running experiment check.")
experiments = design.groupby('experiment_id') \
.apply(list)
malformed_experiments = []
chars = set('-.')
for experiment in experiments.index.values:
if any(char.isspace() for char in experiment) or any(char in chars for char in experiment):
malformed_experiments.append(experiment)
if len(malformed_experiments) > 0:
logger.error('Malformed experiment from design file: %s', list(malformed_experiments))
raise Exception("Malformed experiment from design file: %s" %
list(malformed_experiments))
def check_controls(design):
'''Check if design file has the correct control mapping.'''
logger.info("Running control check.")
missing_controls = set(design['control_id']) - set(design['sample_id'])
if len(missing_controls) > 0:
logger.error('Missing control experiments: %s', list(missing_controls))
raise Exception("Missing control experiments: %s" %
list(missing_controls))
def check_replicates(design):
'''Check if design file has unique replicate numbers for an experiment.'''
logger.info("Running replicate check.")
experiment_replicates = design.groupby('experiment_id')['replicate'] \
.apply(list)
duplicated_replicates = []
for experiment in experiment_replicates.index.values:
replicates = experiment_replicates[experiment]
unique_replicates = set(replicates)
if len(replicates) != len(unique_replicates):
duplicated_replicates.append(experiment)
if len(duplicated_replicates) > 0:
logger.error('Duplicate replicates in experiments: %s', list(duplicated_replicates))
raise Exception("Duplicate replicates in experiments: %s" %
list(duplicated_replicates))
def check_files(design, fastq, paired):
'''Check if design file has the files found.'''
logger.info("Running file check.")
if paired: # paired-end data
files = list(design['fastq_read1']) + list(design['fastq_read2'])
else: # single-end data
files = design['fastq_read1']
files_found = fastq['name']
missing_files = set(files) - set(files_found)
if len(missing_files) > 0:
logger.error('Missing files from design file: %s', list(missing_files))
raise Exception("Missing files from design file: %s" %
list(missing_files))
else:
file_dict = fastq.set_index('name').T.to_dict()
design['fastq_read1'] = design['fastq_read1'] \
.apply(lambda x: file_dict[x]['path'])
if paired: # paired-end data
design['fastq_read2'] = design['fastq_read2'] \
.apply(lambda x: file_dict[x]['path'])
return design
def main():
args = get_args()
design = args.design
fastq = args.fastq
paired = args.paired
# Create a file handler
handler = logging.FileHandler('design.log')
logger.addHandler(handler)
# Read files as dataframes
design_df = pd.read_csv(design, sep='\t')
fastq_df = pd.read_csv(fastq, sep='\t', names=['name', 'path'])
# Check design file
check_design_headers(design_df, paired)
check_controls(design_df)
check_replicates(design_df)
new_design_df = check_files(design_df, fastq_df, paired)
# Write out new design file
new_design_df.to_csv('design.tsv', header=True, sep='\t', index=False)
if __name__ == '__main__':
main()
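A small sketch of the replicate-uniqueness rule enforced by check_replicates(), run on an in-memory toy design table rather than a real design file:

import pandas as pd

# Toy design; experiment 'B' deliberately repeats replicate 1.
design = pd.DataFrame({
    'experiment_id': ['A', 'A', 'B', 'B'],
    'replicate': [1, 2, 1, 1]})

replicates = design.groupby('experiment_id')['replicate'].apply(list)
duplicated = [exp for exp, reps in replicates.items() if len(reps) != len(set(reps))]
print(duplicated)  # -> ['B']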
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Convert tagAlign files for further processing.'''
import os
import argparse
import shutil
import subprocess
import shlex
import logging
from multiprocessing import cpu_count
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-b', '--bam',
help="The bam file to convert.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required components on user system'''
logger.info('Checking for required libraries and components on this system')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
def convert_mapped(bam, tag_filename):
'''Use bedtools to convert to tagAlign.'''
out, err = utils.run_pipe([
"bamToBed -i %s" % (bam),
r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
"gzip -nc"], outfile=tag_filename)
def convert_mapped_pe(bam, bam_basename):
'''Use bedtools to convert to tagAlign PE data.'''
bedpe_filename = bam_basename + ".bedpe.gz"
# Name sort bam to make BEDPE
nmsrt_bam_filename = bam_basename + ".nmsrt.bam"
samtools_sort_command = \
"samtools sort -n -@%d -o %s %s" \
% (cpu_count(), nmsrt_bam_filename, bam)
logger.info(samtools_sort_command)
subprocess.check_output(shlex.split(samtools_sort_command))
out, err = utils.run_pipe([
"bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename),
"gzip -nc"], outfile=bedpe_filename)
def main():
args = get_args()
paired = args.paired
bam = args.bam
# Create a file handler
handler = logging.FileHandler('convert_reads.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Convert PE or SE to tagAlign
bam_basename = os.path.basename(
utils.strip_extensions(bam, ['.bam', '.dedup']))
tag_filename = bam_basename + '.tagAlign.gz'
convert_mapped(bam, tag_filename)
if paired: # paired-end data
convert_mapped_pe(bam, bam_basename)
else:
bedse_filename = bam_basename + ".bedse.gz"
shutil.copy(tag_filename, bedse_filename)
if __name__ == '__main__':
main()
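The awk step inside convert_mapped() blanks the read name and fixes the score, which is what turns BED6 records into tagAlign records. A hedged sketch of the same transform on one fabricated BED line:

# Fabricated BED6 line as bamToBed might emit it (tab-separated).
bed_line = "chr1\t100\t136\tread_1/1\t37\t+"

fields = bed_line.split('\t')
fields[3] = 'N'     # tagAlign drops the read name
fields[4] = '1000'  # fixed score, matching the awk above
print('\t'.join(fields))  # -> chr1  100  136  N  1000  +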
#!/bin/Rscript
#*
#* --------------------------------------------------------------------------
#* Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
#* --------------------------------------------------------------------------
#*
# Load libraries
library("DiffBind")
# Create parser object
args <- commandArgs(trailingOnly=TRUE)
# Check input args
if (length(args) != 1) {
stop("Usage: diff_peaks.R annotate_design.tsv ", call.=FALSE)
}
# Output version of DiffBind
diffbind_version = packageVersion('DiffBind')
write.table(paste("Version", diffbind_version), file = "version_DiffBind.txt", sep = "\t",
row.names = FALSE, col.names = FALSE)
# Build DBA object from design file
data <- dba(sampleSheet=args[1])
data <- dba.count(data)
data <- dba.contrast(data, minMembers = 2, categories=DBA_CONDITION)
data <- dba.analyze(data)
# Draw figures
pdf("heatmap.pdf")
plot(data)
dev.off()
pdf("diffbind.samples.pca.pdf")
pdf("pca.pdf")
dba.plotPCA(data, DBA_TISSUE, label=DBA_CONDITION)
dev.off()
# Save peak reads count
normcount <- dba.peakset(data, bRetrieve=T)
write.table(as.data.frame(normcount),"diffbind.normcount.txt",sep="\t",quote=F,row.names=F)
write.table(as.data.frame(normcount),"normcount_peaksets.txt",sep="\t",quote=F,row.names=F)
# Retrieving the differentially bound sites
# Make new design file for peakAnnotation at the same time
new_SampleID = c()
new_Peaks = c()
for (i in c(1:length(data$contrasts))) {
contrast_bed_name = paste(data$contrasts[[i]]$name1,"vs",
data$contrasts[[i]]$name2,"diffbind.bed",sep="_")
contrast_name = paste(data$contrasts[[i]]$name1,"vs",
data$contrasts[[i]]$name2,"diffbind.csv",sep="_")
new_SampleID = c(new_SampleID,paste(data$contrasts[[i]]$name1,"vs",data$contrasts[[i]]$name2,sep="_"))
new_Peaks = c(new_Peaks, contrast_bed_name)
report <- dba.report(data, contrast=i, th=1, bCount=TRUE)
report <- as.data.frame(report)
print(head(report))
colnames(report)[1:5]<-c("chrom","peak_start","peak_stop","peak_width","peak_strand")
#print(head(report))
write.table(report,contrast_name,sep="\t",quote=F,row.names=F)
write.table(report[,c(1:3)],contrast_bed_name,sep="\t",quote=F,row.names=F, col.names=F)
}
# Write new design file
newdesign = data.frame(SampleID=new_SampleID, Peaks=new_Peaks)
write.csv(newdesign,"diffpeak.design",row.names=F,quote=F)
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Generate experiment design files for downstream processing.'''
import argparse
import logging
import pandas as pd
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file to make experiemnts (tsv format).",
required=True)
args = parser.parse_args()
return args
def update_controls(design):
'''Update design file to append controls list.'''
logger.info("Running control file update.")
file_dict = design[['sample_id', 'tag_align']] \
.set_index('sample_id').T.to_dict()
design['control_tag_align'] = design['control_id'] \
.apply(lambda x: file_dict[x]['tag_align'])
logger.info("Removing rows that are there own control.")
design = design[design['control_id'] != design['sample_id']]
return design
def make_experiment_design(design):
'''Make design file by grouping for each experiment'''
logger.info("Running experiment design generation.")
for experiment, df_experiment in design.groupby('experiment_id'):
experiment_file = experiment + '.tsv'
df_experiment.to_csv(experiment_file, header=True, sep='\t', index=False)
def main():
args = get_args()
design = args.design
# Create a file handler
handler = logging.FileHandler('experiment_generation.log')
logger.addHandler(handler)
# Read files as dataframes
design_df = pd.read_csv(design, sep='\t')
# Update design file for check_controls
new_design_df = update_controls(design_df)
# write out experiment design files
make_experiment_design(new_design_df)
if __name__ == '__main__':
main()
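A compact sketch of what update_controls() does to a design table: look up each row's control tagAlign path by control_id, then drop rows that are their own control (toy data):

import pandas as pd

# Toy design: 'input1' is its own control and should be dropped.
design = pd.DataFrame({
    'sample_id': ['chip1', 'input1'],
    'control_id': ['input1', 'input1'],
    'tag_align': ['chip1.tagAlign.gz', 'input1.tagAlign.gz']})

paths = design.set_index('sample_id')['tag_align'].to_dict()
design['control_tag_align'] = design['control_id'].map(paths)
design = design[design['control_id'] != design['sample_id']]
print(design[['sample_id', 'control_tag_align']])
# -> chip1  input1.tagAlign.gz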
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Experiment correlation and enrichment of reads over genome-wide bins.'''
import argparse
import logging
import subprocess
import shutil
from multiprocessing import cpu_count
import pandas as pd
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file to run QC (tsv format).",
required=True)
parser.add_argument('-e', '--extension',
help="Number of base pairs to extend the reads",
type=int,
required=True)
args = parser.parse_args()
return args
def check_tools():
'''Checks for required components on user system.'''
logger.info('Checking for required libraries and components on this system')
deeptools_path = shutil.which("deeptools")
if deeptools_path:
logger.info('Found deeptools: %s', deeptools_path)
# Get Version
deeptools_version_command = "deeptools --version"
deeptools_version = subprocess.check_output(deeptools_version_command, shell=True, stderr=subprocess.STDOUT)
# Write to file
deeptools_file = open("version_deeptools.txt", "wb")
deeptools_file.write(deeptools_version)
deeptools_file.close()
else:
logger.error('Missing deeptools')
raise Exception('Missing deeptools')
def generate_read_summary(design, extension):
'''Generate summary of data based on read counts.'''
bam_files = ' '.join(design['bam_reads'])
labels = ' '.join(design['sample_id'])
mbs_filename = 'sample_mbs.npz'
mbs_command = (
"multiBamSummary bins -p %d --bamfiles %s --extendReads %d --labels %s -out %s"
% (cpu_count(), bam_files, extension, labels, mbs_filename)
)
logger.info("Running deeptools with %s", mbs_command)
read_summary = subprocess.Popen(mbs_command, shell=True)
out, err = read_summary.communicate()
return mbs_filename
def check_spearman_correlation(mbs):
'''Check Spearman pairwise correlation of samples based on read counts.'''
spearman_filename = 'heatmap_SpearmanCorr.pdf'
spearman_params = "--corMethod spearman --skipZero" + \
" --plotTitle \"Spearman Correlation of Read Counts\"" + \
" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
spearman_command = (
"plotCorrelation -in %s %s -o %s"
% (mbs, spearman_params, spearman_filename)
)
logger.info("Running deeptools with %s", spearman_command)
spearman_correlation = subprocess.Popen(spearman_command, shell=True)
out, err = spearman_correlation.communicate()
def check_pearson_correlation(mbs):
'''Check Pearson pairwise correlation of samples based on read counts.'''
pearson_filename = 'heatmap_PearsonCorr.pdf'
pearson_params = "--corMethod pearson --skipZero" + \
" --plotTitle \"Pearson Correlation of Read Counts\"" + \
" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers"
pearson_command = (
"plotCorrelation -in %s %s -o %s"
% (mbs, pearson_params, pearson_filename)
)
logger.info("Running deeptools with %s", pearson_command)
pearson_correlation = subprocess.Popen(pearson_command, shell=True)
out, err = pearson_correlation.communicate()
def check_coverage(design, extension):
'''Assess the sequencing depth of samples.'''
bam_files = ' '.join(design['bam_reads'])
labels = ' '.join(design['sample_id'])
coverage_filename = 'coverage.pdf'
coverage_params = "-n 1000000 --plotTitle \"Sample Coverage\"" + \
" --ignoreDuplicates --minMappingQuality 10"
coverage_command = (
"plotCoverage -b %s --extendReads %d --labels %s %s --plotFile %s"
% (bam_files, extension, labels, coverage_params, coverage_filename)
)
logger.info("Running deeptools with %s", coverage_command)
coverage_summary = subprocess.Popen(coverage_command, shell=True)
out, err = coverage_summary.communicate()
def update_controls(design):
'''Update design file to append controls list.'''
logger.info("Running control file update.")
file_dict = design[['sample_id', 'bam_reads']] \
.set_index('sample_id').T.to_dict()
design['control_reads'] = design['control_id'] \
.apply(lambda x: file_dict[x]['bam_reads'])
logger.info("Removing rows that are there own control.")
design = design[design['control_id'] != design['sample_id']]
return design
def check_enrichment(sample_id, control_id, sample_reads, control_reads, extension):
'''Assess the enrichment per sample.'''
fingerprint_filename = sample_id + '_fingerprint.pdf'
fingerprint_command = (
"plotFingerprint -b %s %s --extendReads %d --labels %s %s --plotFile %s"
% (sample_reads, control_reads, extension, sample_id, control_id, fingerprint_filename)
)
logger.info("Running deeptools with %s", fingerprint_command)
fingerprint_summary = subprocess.Popen(fingerprint_command, shell=True)
out, err = fingerprint_summary.communicate()
def main():
args = get_args()
design = args.design
extension = args.extension
# Create a file handler
handler = logging.FileHandler('experiment_qc.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Read files
design_df = pd.read_csv(design, sep='\t')
# Run correlation
mbs_filename = generate_read_summary(design_df, extension)
check_spearman_correlation(mbs_filename)
check_pearson_correlation(mbs_filename)
# Run coverage
check_coverage(design_df, extension)
# Run enrichment
new_design_df = update_controls(design_df)
for index, row in new_design_df.iterrows():
check_enrichment(
row['sample_id'],
row['control_id'],
row['bam_reads'],
row['control_reads'],
extension)
if __name__ == '__main__':
main()
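One caveat with the QC functions above: subprocess.Popen(...).communicate() discards the exit status, so a failed deepTools run passes silently. A hedged alternative sketch using subprocess.run (capture_output requires Python 3.7+); the command shown is only a harmless stand-in for the plot commands above:

import subprocess

# Stand-in deepTools invocation; any of the plot commands above fits this pattern.
result = subprocess.run("plotCoverage --version", shell=True,
                        capture_output=True, text=True)
if result.returncode != 0:
    raise RuntimeError("deepTools call failed: %s" % result.stderr)
print(result.stdout.strip())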
#!/usr/bin/env python
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Make header for HTML of references.'''
import argparse
import subprocess
import shlex
import logging
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-r', '--reference',
help="The reference file (markdown format).",
required=True)
parser.add_argument('-o', '--output',
help="The out file name.",
default='references')
args = parser.parse_args()
return args
def main():
args = get_args()
reference = args.reference
output = args.output
out_filename = output + '_mqc.yaml'
# Header for HTML
print('''
id: 'Software References'
section_name: 'Software References'
description: 'This section describes references for the tools used.'
plot_type: 'html'
data: |
'''
, file = open(out_filename, "w")
)
# Turn Markdown into HTML
references_html = 'bash -c "pandoc -p {} | sed \'s/^/ /\' >> {}"'
references_html = references_html.format(reference, out_filename)
subprocess.check_call(shlex.split(references_html))
if __name__ == '__main__':
main()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Make YAML of software versions.'''
from __future__ import print_function
from collections import OrderedDict
import re
import os
import logging
import glob
import argparse
import numpy as np
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
SOFTWARE_REGEX = {
'Nextflow': ['version_nextflow.txt', r"(\S+)"],
'Trim Galore!': ['trimReads_vf/version_trimgalore.txt', r"version (\S+)"],
'Cutadapt': ['trimReads_vf/version_cutadapt.txt', r"Version (\S+)"],
'BWA': ['alignReads_vf/version_bwa.txt', r"Version: (\S+)"],
'Samtools': ['alignReads_vf/version_samtools.txt', r"samtools (\S+)"],
'Sambamba': ['filterReads_vf/version_sambamba.txt', r"sambamba (\S+)"],
'BEDTools': ['convertReads_vf/version_bedtools.txt', r"bedtools v(\S+)"],
'R': ['crossReads_vf/version_r.txt', r"R version (\S+)"],
'SPP': ['crossReads_vf/version_spp.txt', r"\[1\] ‘(1.14)’"],
'MACS2': ['callPeaksMACS_vf/version_macs.txt', r"macs2 (\S+)"],
'bedGraphToBigWig': ['callPeaksMACS_vf/version_bedGraphToBigWig.txt', r"bedGraphToBigWig v (\S+)"],
'ChIPseeker': ['peakAnnotation_vf/version_ChIPseeker.txt', r"Version (\S+)\""],
'MEME-ChIP': ['motifSearch_vf/version_memechip.txt', r"Version (\S+)"],
'DiffBind': ['diffPeaks_vf/version_DiffBind.txt', r"Version (\S+)\""],
'deepTools': ['experimentQC_vf/version_deeptools.txt', r"deeptools (\S+)"],
'Python': ['version_python.txt', r"Python (\S+)"],
'MultiQC': ['version_multiqc.txt', r"multiqc, version (\S+)"],
}
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-o', '--output',
help="The out file name.",
required=True)
parser.add_argument('-t', '--test',
help='Used for testing purposes',
default=False,
action='store_true')
args = parser.parse_args()
return args
def check_files(files, test):
'''Check if version files are found.'''
logger.info("Running file check.")
software_files = np.array(list(SOFTWARE_REGEX.values()))[:,0]
extra_files = set(files) - set(software_files)
if len(extra_files) > 0 and test:
logger.error('Missing regex: %s', list(extra_files))
raise Exception("Missing regex: %s" % list(extra_files))
def main():
args = get_args()
output = args.output
test = args.test
out_filename = output + '_mqc.yaml'
results = OrderedDict()
results['Nextflow'] = '<span style="color:#999999;\">Not Run</span>'
results['Trim Galore!'] = '<span style="color:#999999;\">Not Run</span>'
results['Cutadapt'] = '<span style="color:#999999;\">Not Run</span>'
results['BWA'] = '<span style="color:#999999;\">Not Run</span>'
results['Samtools'] = '<span style="color:#999999;\">Not Run</span>'
results['Sambamba'] = '<span style="color:#999999;\">Not Run</span>'
results['BEDTools'] = '<span style="color:#999999;\">Not Run</span>'
results['R'] = '<span style="color:#999999;\">Not Run</span>'
results['SPP'] = '<span style="color:#999999;\">Not Run</span>'
results['MACS2'] = '<span style="color:#999999;\">Not Run</span>'
results['bedGraphToBigWig'] = '<span style="color:#999999;\">Not Run</span>'
results['ChIPseeker'] = '<span style="color:#999999;\">Not Run</span>'
results['MEME-ChIP'] = '<span style="color:#999999;\">Not Run</span>'
results['DiffBind'] = '<span style="color:#999999;\">Not Run</span>'
results['deepTools'] = '<span style="color:#999999;\">Not Run</span>'
results['MultiQC'] = '<span style="color:#999999;\">Not Run</span>'
results['Python'] = '<span style="color:#999999;\">Not Run</span>'
# list all files
files = glob.glob('**/*.txt', recursive=True)
# Check for version files:
check_files(files, test)
# Search each file using its regex
for k, v in SOFTWARE_REGEX.items():
if os.path.isfile(v[0]):
with open(v[0]) as x:
versions = x.read()
match = re.search(v[1], versions)
if match:
results[k] = "v{}".format(match.group(1))
# Dump to YAML
print(
'''
id: 'Software Versions'
section_name: 'Software Versions'
section_href: 'https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/'
plot_type: 'html'
description: 'are collected at run time from the software output.'
data: |
<dl class="dl-horizontal">
'''
, file = open(out_filename, "w"))
for k, v in results.items():
print(" <dt>{}</dt><dd>{}</dd>".format(k, v), file = open(out_filename, "a"))
print(" </dl>", file = open(out_filename, "a"))
if __name__ == '__main__':
main()
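To see how the SOFTWARE_REGEX table above is meant to behave, here is a tiny sketch that applies one registered pattern to a fabricated samtools version banner:

import re

# Fabricated version output; the pattern is the one registered for Samtools above.
versions = "samtools 1.9\nUsing htslib 1.9"
match = re.search(r"samtools (\S+)", versions)
if match:
    print("v{}".format(match.group(1)))  # -> v1.9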
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Remove duplicates and filter unmapped reads.'''
import os
import subprocess
import argparse
import shutil
import shlex
import logging
from multiprocessing import cpu_count
import utils
import pandas as pd
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.bam', '.srt']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-b', '--bam',
help="The bam file to run filtering and qc on.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required components on user system'''
logger.info('Checking for required libraries and components on this system')
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
sambamba_path = shutil.which("sambamba")
if sambamba_path:
logger.info('Found sambamba: %s', sambamba_path)
# Get Version
sambamba_version_command = "sambamba"
try:
sambamba_version = subprocess.check_output(sambamba_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
# sambamba prints its usage and exits non-zero when run with no arguments
sambamba_version = e.output
# Write to file
sambamba_file = open("version_sambamba.txt", "wb")
sambamba_file.write(sambamba_version)
sambamba_file.close()
else:
logger.error('Missing sambamba')
raise Exception('Missing sambamba')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def filter_mapped_pe(bam, bam_basename):
'''Use samtools to filter unmapped reads for PE data.'''
filt_bam_prefix = bam_basename + ".filt.srt"
filt_bam_filename = filt_bam_prefix + ".bam"
tmp_filt_bam_prefix = "tmp.%s" % (filt_bam_prefix)
tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"
# Remove unmapped, mate unmapped
# not primary alignment, reads failing platform
# Remove low MAPQ reads
# Only keep properly paired reads
# Obtain name sorted BAM file
out, err = utils.run_pipe([
# filter: -F 1804 FLAG bits to exclude; -f 2 FLAG bits to require;
# -q 30 exclude MAPQ < 30; -u uncompressed output
# exclude FLAG 1804: unmapped, next segment unmapped, secondary
# alignments, not passing platform q, PCR or optical duplicates
# require FLAG 2: properly aligned
"samtools view -F 1804 -f 2 -q 30 -u %s" % (bam),
# sort: -n sort by name; - take input from stdin;
# out to specified filename
# Will produce name sorted BAM
"samtools sort -n -@ %d -o %s" % (cpu_count(), tmp_filt_bam_filename)])
if err:
logger.error("samtools filter error: %s", err)
# Remove orphan reads (pair was removed)
# and read pairs mapping to different chromosomes
# Obtain position sorted BAM
out, err = utils.run_pipe([
# fill in mate coordinates, ISIZE and mate-related flags
# fixmate requires name-sorted alignment; -r removes secondary and
# unmapped (redundant here because already done above?)
# - send output to stdout
"samtools fixmate -r %s -" % (tmp_filt_bam_filename),
# repeat filtering after mate repair
"samtools view -F 1804 -f 2 -u -",
# produce the coordinate-sorted BAM
"samtools sort -@ %d -o %s" % (cpu_count(), filt_bam_filename)])
os.remove(tmp_filt_bam_filename)
return filt_bam_filename
def filter_mapped_se(bam, bam_basename):
'''Use samtools to filter unmapped reads for SE data.'''
filt_bam_prefix = bam_basename + ".filt.srt"
filt_bam_filename = filt_bam_prefix + ".bam"
# Remove unmapped, mate unmapped
# not primary alignment, reads failing platform
# Remove low MAPQ reads
# Obtain name sorted BAM file
with open(filt_bam_filename, 'w') as temp_file:
samtools_filter_command = (
"samtools view -F 1804 -q 30 -b %s"
% (bam)
)
logger.info(samtools_filter_command)
subprocess.check_call(
shlex.split(samtools_filter_command),
stdout=temp_file)
return filt_bam_filename
def dedup_mapped(bam, bam_basename, paired):
'''Use sambamba and samtools to remove duplicates.'''
# Markduplicates
dup_file_qc_filename = bam_basename + ".dedup.qc"
tmp_dup_mark_filename = bam_basename + ".dupmark.bam"
sambamba_params = "--hash-table-size=17592186044416" + \
" --overflow-list-size=20000000 --io-buffer-size=256"
with open(dup_file_qc_filename, 'w') as temp_file:
sambamba_markdup_command = (
"sambamba markdup -t %d %s --tmpdir=%s %s %s"
% (cpu_count(), sambamba_params, os.getcwd(), bam, tmp_dup_mark_filename)
)
logger.info(sambamba_markdup_command)
subprocess.check_call(
shlex.split(sambamba_markdup_command),
stderr=temp_file)
# Remove duplicates
final_bam_prefix = bam_basename + ".dedup"
final_bam_filename = final_bam_prefix + ".bam"
if paired: # paired-end data
samtools_dedupe_command = \
"samtools view -F 1804 -f 2 -b %s" % (tmp_dup_mark_filename)
else:
samtools_dedupe_command = \
"samtools view -F 1804 -b %s" % (tmp_dup_mark_filename)
with open(final_bam_filename, 'w') as temp_file:
logger.info(samtools_dedupe_command)
subprocess.check_call(
shlex.split(samtools_dedupe_command),
stdout=temp_file)
# Index final bam file
samtools_index_command = \
"samtools index -@ %d %s" % (cpu_count(), final_bam_filename)
logger.info(samtools_index_command)
subprocess.check_output(shlex.split(samtools_index_command))
# Generate mapping statistics
mapstats_filename = final_bam_prefix + ".flagstat.qc"
with open(mapstats_filename, 'w') as temp_file:
flagstat_command = "sambamba flagstat -t %d %s" \
% (cpu_count(), final_bam_filename)
logger.info(flagstat_command)
subprocess.check_call(shlex.split(flagstat_command), stdout=temp_file)
os.remove(bam)
return tmp_dup_mark_filename
def compute_complexity(bam, paired, bam_basename):
'''Calculate library complexity.'''
pbc_file_qc_filename = bam_basename + ".pbc.qc"
tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename)
# Sort by name
# convert to bedPE and obtain fragment coordinates
# sort by position and strand
# Obtain unique count statistics
# PBC File output
# Sample Name[tab]
# TotalReadPairs [tab]
# DistinctReadPairs [tab]
# OneReadPair [tab]
# TwoReadPairs [tab]
# NRF=Distinct/Total [tab]
# PBC1=OnePair/Distinct [tab]
# PBC2=OnePair/TwoPair
pbc_headers = ['TotalReadPairs',
'DistinctReadPairs',
'OneReadPair',
'TwoReadPairs',
'NRF',
'PBC1',
'PBC2']
if paired:
steps = [
"samtools sort -@%d -n %s" % (cpu_count(), bam),
"bamToBed -bedpe -i stdin",
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
else:
steps = [
"bamToBed -i %s" % (bam),
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
steps.extend([
"grep -v 'chrM'",
"sort",
"uniq -c",
r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'"""
])
out, err = utils.run_pipe(steps, tmp_pbc_file_qc_filename)
if err:
logger.error("PBC file error: %s", err)
# Add Sample Name and headers
pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None,
names=pbc_headers)
pbc_file['Sample'] = bam_basename
pbc_headers_new = list(pbc_file)
pbc_headers_new.insert(0, pbc_headers_new.pop(pbc_headers_new.index('Sample')))
pbc_file = pbc_file[pbc_headers_new]
pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False)
os.remove(bam)
os.remove(bam + '.bai')
os.remove(tmp_pbc_file_qc_filename)
def main():
args = get_args()
paired = args.paired
bam = args.bam
# Create a file handler
handler = logging.FileHandler('map_qc.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Run filtering for either PE or SE
bam_basename = os.path.basename(
utils.strip_extensions(bam, STRIP_EXTENSIONS))
if paired: # paired-end data
filter_bam_filename = filter_mapped_pe(bam, bam_basename)
else:
filter_bam_filename = filter_mapped_se(bam, bam_basename)
# Remove duplicates
markdup_bam_filename = dedup_mapped(filter_bam_filename, bam_basename, paired)
# Compute library complexity
compute_complexity(markdup_bam_filename, paired, bam_basename)
if __name__ == '__main__':
main()
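The awk one-liner in compute_complexity() counts, over distinct read positions, the totals behind the PBC metrics: NRF = distinct/total, PBC1 = once/distinct, PBC2 = once/twice. A worked sketch of the same arithmetic on toy positions:

from collections import Counter

# Toy read positions (chrom, start, strand); the duplicate mimics a PCR copy.
positions = [('chr1', 100, '+'), ('chr1', 100, '+'),
             ('chr1', 200, '+'), ('chr2', 50, '-')]

counts = Counter(positions)
mt = sum(counts.values())                        # total reads
m0 = len(counts)                                 # distinct positions
m1 = sum(1 for c in counts.values() if c == 1)   # positions seen once
m2 = sum(1 for c in counts.values() if c == 2)   # positions seen twice
print("NRF=%.2f PBC1=%.2f PBC2=%.2f" % (m0 / mt, m1 / m0, m1 / m2))
# -> NRF=0.75 PBC1=0.67 PBC2=2.00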
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Align reads to reference genome.'''
import os
import subprocess
import argparse
import shutil
import shlex
import logging
from multiprocessing import cpu_count
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-f', '--fastq',
help="The fastq file to run triming on.",
nargs='+',
required=True)
parser.add_argument('-r', '--reference',
help="The bwa index of the reference genome.",
required=True)
parser.add_argument('-s', '--sample',
help="The name of the sample.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required components on user system'''
logger.info('Checking for required libraries and components on this system')
bwa_path = shutil.which("bwa")
if bwa_path:
logger.info('Found bwa: %s', bwa_path)
# Get Version
bwa_version_command = "bwa"
try:
bwa_version = subprocess.check_output(bwa_version_command, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
# bwa prints its usage and exits non-zero when run with no arguments
bwa_version = e.output
# Write to file
bwa_file = open("version_bwa.txt", "wb")
bwa_file.write(bwa_version)
bwa_file.close()
else:
logger.error('Missing bwa')
raise Exception('Missing bwa')
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
# Get Version
samtools_version_command = "samtools --version"
samtools_version = subprocess.check_output(samtools_version_command, shell=True)
# Write to file
samtools_file = open("version_samtools.txt", "wb")
samtools_file.write(samtools_version)
samtools_file.close()
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
def generate_sa(fastq, reference):
'''Use BWA to generate Suffix Arrays.'''
fastq_basename = os.path.basename(utils.strip_extensions(fastq, STRIP_EXTENSIONS))
bwa_aln_params = '-q 5 -l 32 -k 2'
sai = '%s.sai' % (fastq_basename)
with open(sai, 'w') as sai_file:
bwa_command = "bwa aln %s -t %d %s %s" \
% (bwa_aln_params, cpu_count(),
reference, fastq)
logger.info("Running bwa with %s", bwa_command)
subprocess.check_call(shlex.split(bwa_command), stdout=sai_file)
return sai
def align_se(fastq, sai, reference, fastq_basename):
'''Use BWA to align SE data.'''
bam_filename = '%s.bam' % (fastq_basename)
steps = [
"bwa samse %s %s %s"
% (reference, sai[0], fastq[0]),
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
% (cpu_count(), bam_filename)]
out, err = utils.run_pipe(steps)
if err:
logger.error("samse/samtools error: %s", err)
return bam_filename
def align_pe(fastq, sai, reference, fastq_basename):
'''Use BWA to align PE data.'''
sam_filename = "%s.sam" % (fastq_basename)
badcigar_filename = "%s.badReads" % (fastq_basename)
bam_filename = '%s.bam' % (fastq_basename)
# Remove read pairs with bad CIGAR strings and sort by position
steps = [
"bwa sampe -P %s %s %s %s %s"
% (reference, sai[0], sai[1],
fastq[0], fastq[1]),
"tee %s" % (sam_filename),
r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
"sort",
"uniq"]
out, err = utils.run_pipe(steps, badcigar_filename)
if err:
logger.error("sampe error: %s", err)
steps = [
"cat %s" % (sam_filename),
"grep -v -F -f %s" % (badcigar_filename),
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
% (cpu_count(), bam_filename)]
out, err = utils.run_pipe(steps)
if err:
logger.error("samtools error: %s", err)
return bam_filename
def main():
args = get_args()
paired = args.paired
fastq = args.fastq
reference = args.reference
sample = args.sample
# Create a file handler
handler = logging.FileHandler('map.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Run Suffix Array generation
sai = []
for fastq_file in fastq:
sai_filename = generate_sa(fastq_file, reference)
sai.append(sai_filename)
# Make file basename
fastq_basename = sample
# Run alignment for either PE or SE
if paired: # paired-end data
bam_filename = align_pe(fastq, sai, reference, fastq_basename)
else:
bam_filename = align_se(fastq, sai, reference, fastq_basename)
bam_mapstats_filename = '%s.flagstat.qc' % (fastq_basename)
with open(bam_mapstats_filename, 'w') as temp_file:
subprocess.check_call(
shlex.split("samtools flagstat %s" % (bam_filename)),
stdout=temp_file)
# Genome/bad fastq file check: fail if the mapped-read percentage is too low
file_check = open(bam_mapstats_filename).readlines()
percent = file_check[4].split('(')[1]
percent = percent.split('%')[0]
if float(percent) < 10:
raise Exception('Mapped reads too low: check for correct reference genome')
# Remove sai files
for sai_file in sai:
os.remove(sai_file)
if __name__ == '__main__':
main()
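The awk filter in align_pe() marks a read as bad when its CIGAR operation lengths, after removing deletions, do not sum to the sequence length. The same test in plain Python on fabricated CIGAR strings:

import re

def cigar_matches_seqlen(cigar, seqlen):
    """Mirror the awk check: drop D operations, then sum the operation lengths."""
    cigar = re.sub(r"[0-9]+D", "", cigar)
    return sum(int(n) for n in re.findall(r"([0-9]+)[A-Z]", cigar)) == seqlen

print(cigar_matches_seqlen("20M5D30M", 50))  # True: deletions consume no read bases
print(cigar_matches_seqlen("20M5I30M", 50))  # False: 55 summed vs a 50 bp read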
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Call Motifs on called peaks.'''
import os
import argparse
import logging
import shutil
import subprocess
from multiprocessing import Pool
import pandas as pd
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.narrowPeak', '.replicated']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file to run motif search.",
required=True)
parser.add_argument('-g', '--genome',
help="The genome FASTA file.",
required=True)
parser.add_argument('-p', '--peak',
help="The number of peaks to use.",
required=True)
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required components on user system'''
logger.info('Checking for required libraries and components on this system')
meme_path = shutil.which("meme")
if meme_path:
logger.info('Found meme: %s', meme_path)
# Get Version
memechip_version_command = "meme-chip --version"
memechip_version = subprocess.check_output(memechip_version_command, shell=True)
# Write to file
meme_file = open("version_memechip.txt", "wb")
meme_file.write(b"Version %s" % (memechip_version))
meme_file.close()
else:
logger.error('Missing meme')
raise Exception('Missing meme')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def run_wrapper(args):
motif_search(*args)
def motif_search(filename, genome, experiment, peak):
'''Run motif search on peaks.'''
file_basename = os.path.basename(
utils.strip_extensions(filename, STRIP_EXTENSIONS))
out_fa = '%s.fa' % (experiment)
out_motif = '%s_memechip' % (experiment)
# Sort Bed file and limit number of peaks
if peak == -1:
peak = utils.count_lines(filename)
peak_no = 'all'
else:
peak_no = peak
sorted_fn = '%s.%s.narrowPeak' % (file_basename, peak_no)
out, err = utils.run_pipe([
'sort -k %dgr,%dgr %s' % (5, 5, filename),
'head -n %s' % (peak)], outfile=sorted_fn)
# Get fasta file
out, err = utils.run_pipe([
'bedtools getfasta -fi %s -bed %s -fo %s' % (genome, sorted_fn, out_fa)])
if err:
logger.error("bedtools error: %s", err)
# Call memechip
out, err = utils.run_pipe([
'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)])
if err:
logger.error("meme-chip error: %s", err)
def main():
args = get_args()
design = args.design
genome = args.genome
peak = args.peak
# Create a file handler
handler = logging.FileHandler('motif.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Read files
design_df = pd.read_csv(design, sep='\t')
meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0])
work_pool = Pool(min(12, design_df.shape[0]))
return_list = work_pool.map(run_wrapper, meme_arglist)
work_pool.close()
work_pool.join()
if __name__ == '__main__':
main()
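motif_search() limits input to the highest-scoring peaks with 'sort -k5gr' piped to 'head -n N'. An equivalent pandas sketch on a toy narrowPeak-like frame (column names are illustrative):

import pandas as pd

# Toy peaks; 'score' stands in for narrowPeak column 5.
peaks = pd.DataFrame({
    'chrom': ['chr1', 'chr1', 'chr2'],
    'start': [100, 500, 900],
    'end': [200, 600, 1000],
    'name': ['Peak_1', 'Peak_2', 'Peak_3'],
    'score': [850, 990, 120]})

top = peaks.sort_values('score', ascending=False).head(2)
print(top['name'].tolist())  # -> ['Peak_2', 'Peak_1']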
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Generate naive overlap peak files and design file for downstream processing.'''
import os
import argparse
import logging
import shutil
import subprocess
import pandas as pd
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file of peaks (tsv format).",
required=True)
parser.add_argument('-f', '--files',
help="The design file of with bam files (tsv format).",
required=True)
args = parser.parse_args()
return args
def check_tools():
'''Checks for required components on user system.'''
logger.info('Checking for required libraries and components on this system')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
# Get Version
bedtools_version_command = "bedtools --version"
bedtools_version = subprocess.check_output(bedtools_version_command, shell=True)
# Write to file
bedtools_file = open("version_bedtools.txt", "wb")
bedtools_file.write(bedtools_version)
bedtools_file.close()
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def update_design(design):
'''Update design file for DiffBind and remove controls.'''
logger.info("Running control file update.")
file_dict = design[['sample_id', 'bam_reads']] \
.set_index('sample_id').T.to_dict()
design['control_bam_reads'] = design['control_id'] \
.apply(lambda x: file_dict[x]['bam_reads'])
logger.info("Removing rows that are there own control.")
design = design[design['control_id'] != design['sample_id']]
logger.info("Removing columns that are there own control.")
design = design.drop('bam_index', axis=1)
logger.info("Adding peaks column.")
design = design.assign(peak='', peak_caller='bed')
return design
def overlap(experiment, design):
'''Calculate the overlap of peaks'''
logger.info("Determining consenus peaks for experiment %s.", experiment)
# Output File names
peak_type = 'narrowPeak'
overlapping_peaks_fn = '%s.replicated.%s' % (experiment, peak_type)
rejected_peaks_fn = '%s.rejected.%s' % (experiment, peak_type)
# Intermediate File names
overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
overlap_pr_fn = 'replicated_pr.%s' % (peak_type)
# Assign pooled and pseudoreplicate peaks
pool_peaks = design.loc[design.replicate == 'pooled', 'peaks'].values[0]
pr1_peaks = design.loc[design.replicate == '1_pr', 'peaks'].values[0]
pr2_peaks = design.loc[design.replicate == '2_pr', 'peaks'].values[0]
# Remove non true replicate rows
not_replicates = ['1_pr', '2_pr', 'pooled']
design_true_reps = design[~design['replicate'].isin(not_replicates)]
true_rep_peaks = design_true_reps.peaks.unique()
# Find overlaps
awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
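# With two 10-column narrowPeak inputs, intersectBed -wo appends the
# overlap width as column 21; s1 and s2 are the two peak widths, so a
# pair is kept when at least half of either peak is covered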
cut_command = 'cut -f 1-10'
# Find pooled peaks that overlap Rep1 and Rep2
# where overlap is defined as the fractional overlap
# with any one of the overlapping peak pairs >= 0.5
steps_true = ['intersectBed -wo -a %s -b %s' % (pool_peaks, true_rep_peaks[0]),
awk_command,
cut_command,
'sort -u']
if len(true_rep_peaks) > 1:
for true_peak in true_rep_peaks[1:]:
steps_true.extend(['intersectBed -wo -a stdin -b %s' % (true_peak),
awk_command,
cut_command,
'sort -u'])
out, err = utils.run_pipe(steps_true, outfile=overlap_tr_fn)
print("%d peaks overlap with both true replicates" %
(utils.count_lines(overlap_tr_fn)))
# Find pooled peaks that overlap PseudoRep1 and PseudoRep2
# where overlap is defined as the fractional overlap
# with any one of the overlapping peak pairs >= 0.5
steps_pseudo = ['intersectBed -wo -a %s -b %s' % (pool_peaks, pr1_peaks),
awk_command,
cut_command,
'sort -u',
'intersectBed -wo -a stdin -b %s' % (pr2_peaks),
awk_command,
cut_command,
'sort -u']
out, err = utils.run_pipe(steps_pseudo, outfile=overlap_pr_fn)
print("%d peaks overlap with both pooled pseudoreplicates"
% (utils.count_lines(overlap_pr_fn)))
# Make union of peak lists
out, err = utils.run_pipe([
'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
'sort -u'
], overlapping_peaks_fn)
print("%d peaks overlap with true replicates or with pooled pseudorepliates"
% (utils.count_lines(overlapping_peaks_fn)))
# Make rejected peak list
out, err = utils.run_pipe([
'intersectBed -wa -v -a %s -b %s' % (pool_peaks, overlapping_peaks_fn)
], rejected_peaks_fn)
print("%d peaks were rejected" % (utils.count_lines(rejected_peaks_fn)))
# Remove temporary files
os.remove(overlap_tr_fn)
os.remove(overlap_pr_fn)
return os.path.abspath(overlapping_peaks_fn)
def main():
args = get_args()
design = args.design
files = args.files
# Create a file handler
handler = logging.FileHandler('consensus_peaks.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Read files as dataframes
design_peaks_df = pd.read_csv(design, sep='\t')
design_files_df = pd.read_csv(files, sep='\t')
# Make a design file for differential binding
design_diff = update_design(design_files_df)
# Make a design file for annotating Peaks
anno_cols = ['Condition', 'Peaks']
design_anno = pd.DataFrame(columns=anno_cols)
# Find consensus overlap peaks for each experiment
for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
replicated_peak = overlap(experiment, df_experiment)
design_diff.loc[design_diff.experiment_id == experiment, "peak"] = replicated_peak
design_anno.loc[experiment] = [experiment, replicated_peak]
# Write out design files
design_diff.columns = ['SampleID',
'bamReads',
'Condition',
'Tissue',
'Factor',
'Treatment',
'Replicate',
'ControlID',
'bamControl',
'Peaks',
'PeakCaller']
design_diff.to_csv("design_diffPeaks.csv", header=True, sep=',', index=False)
design_anno.to_csv("design_annotatePeaks.tsv", header=True, sep='\t', index=False)
# Write the unique conditions
unique_experiments = pd.DataFrame(design_diff['Condition'].unique().tolist(), columns=['Condition'])
unique_experiments.to_csv('unique_experiments.csv', index=False)
if __name__ == '__main__':
main()
#!/bin/bash
#plot_profile.sh
script_name="plot_profile.sh"
#Help function
usage() {
echo "-h --Help documentation for $script_name"
echo "-g --File path to gtf/bed files"
echo "Example: $script_name -g 'genome.gtf'"
exit 1
}
raise()
{
echo "${1}" >&2
}
check_tools() {
raise "
Checking for required libraries and components on this system
"
deeptools --version &> version_deeptools.txt
if [ $? -gt 0 ]
then
raise "Missing deeptools"
return 1
fi
}
compute_matrix() {
raise "
Computing matrix on ${1} using ${2}
"
computeMatrix reference-point \
--referencePoint TSS \
-S ${1} \
-R ${2} \
--skipZeros \
-o computeMatrix.gz \
-p max/2
if [ $? -gt 0 ]
then
raise "Problem building matrix"
return 1
fi
}
plot_profile() {
raise "
Plotting profile
"
plotProfile -m computeMatrix.gz \
-out plotProfile.png
if [ $? -gt 0 ]
then
raise "Problem plotting"
return 1
fi
}
run_main() {
# Parsing options
OPTIND=1 # Reset OPTIND
while getopts :g:h opt
do
case $opt in
g) gtf=$OPTARG;;
h) usage;;
esac
done
shift $(($OPTIND -1))
# Check for mandatory options
if [[ -z $gtf ]]; then
usage
fi
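# Collect the pooled fold-change signal tracks produced upstream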
bws=$(ls *pooled.fc_signal.bw)
check_tools || exit 1
compute_matrix "${bws}" "${gtf}" || return 1
plot_profile || return 1
raise "ALL COMPLETE"
}
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]
then
run_main "$@"
if [ $? -gt 0 ]
then
exit 1
fi
fi
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''Generate pooled and pseudoreplicate from data.'''
import argparse
import logging
import os
import subprocess
import shutil
import shlex
import pandas as pd
import numpy as np
import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', '.bedpe']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file to make experiemnts (tsv format).",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
parser.add_argument('-c', '--cutoff',
help="Cutoff ratio used for choosing controls.",
type=float,
default=1.2)
args = parser.parse_args()
return args
def check_replicates(design):
'''Check the number of replicates for the experiment.'''
no_rep = design.shape[0]
return no_rep
def check_controls(design):
'''Check the number of controls for the experiment.'''
no_controls = len(design.control_tag_align.unique())
return no_controls
def get_read_count_ratio(design):
'''Get the ratio of read counts for unique controls.'''
controls = design.control_tag_align.unique()
control_dict = {}
for con in controls:
no_reads = utils.count_lines(con)
control_dict[con] = no_reads
# Build a matrix whose (i, j) entry is count_i / count_j: every column
# starts as the raw counts, then each column is divided by the count of
# the control that names it
control_matrix = {c: control_dict for c in controls}
control_df = pd.DataFrame.from_dict(control_matrix)
control_ratio = control_df.divide(pd.Series(control_dict), axis=1)
return control_ratio
def pool(tag_files, outfile, paired):
'''Pool files together.'''
if paired:
file_extension = '.bedpe.gz'
else:
file_extension = '.bedse.gz'
pool_basename = os.path.basename(
utils.strip_extensions(outfile, STRIP_EXTENSIONS))
pooled_filename = pool_basename + file_extension
# Merge files
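# Decompress and concatenate all inputs, then recompress so the pool is
# a single well-formed gzip stream (-n omits the gzip timestamp)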
out, err = utils.run_pipe([
'gzip -dc %s' % (' '.join(tag_files)),
'gzip -cn'], outfile=pooled_filename)
return pooled_filename
def bedpe_to_tagalign(tag_file, outfile):
'''Convert read pairs to reads into standard tagAlign file.'''
se_tag_filename = outfile + ".tagAlign.gz"
# Convert read pairs to reads into standard tagAlign file
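# Each BEDPE record holds mate 1 in columns 1-3/9 and mate 2 in columns
# 4-6/10; the awk step emits one 6-column tagAlign line per mate with a
# placeholder name (N) and score (1000)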
tag_steps = ["zcat -f %s" % (tag_file)]
tag_steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
tag_steps.extend(['gzip -cn'])
out, err = utils.run_pipe(tag_steps, outfile=se_tag_filename)
return se_tag_filename
def self_psuedoreplication(tag_file, prefix, paired):
'''Make 2 self-pseudoreplicates.'''
# Get total number of reads
no_lines = utils.count_lines(tag_file)
# Number of lines per replicate, rounded up so no read is dropped
lines_per_rep = (no_lines + 1) // 2
# Map each pseudoreplicate index to its output file name
pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.tagAlign.gz'
for r in [0, 1]}
# Shuffle and split file into equal parts
# by using the input to seed shuf we ensure multiple runs with the same
# input will produce the same output
# Produces two files named splits_prefix0n, n=1,2
splits_prefix = 'temp_split'
pseudo_command = 'bash -c "zcat {} | shuf --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {} | wc -c) -nosalt </dev/zero 2>/dev/null) | '
pseudo_command += 'split -d -l {} - {}."'
pseudo_command = pseudo_command.format(
tag_file,
tag_file,
lines_per_rep,
splits_prefix)
logger.info("Running pseudoreplication with %s", pseudo_command)
subprocess.check_call(shlex.split(pseudo_command))
# Convert read pairs to reads into standard tagAlign file
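# split -d names the two halves <prefix>.00 and <prefix>.01; each half
# becomes one self-pseudoreplicate after optional BEDPE-to-SE conversion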
for i, index in enumerate([0, 1]):
string_index = '.0' + str(index)
steps = ['cat %s' % (splits_prefix + string_index)]
if paired:
steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
steps.extend(['gzip -cn'])
out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i])
return pseudoreplicate_dict
def generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_controls):
'''Build a new design that adds pooled, pseudoreplicate and control rows.'''
if no_reps == 1:
logger.info("No other replicate specified "
"so processing as an unreplicated experiment.")
replicated = False
else:
logger.info("Multiple replicates specified "
"so processing as a replicated experiment.")
replicated = True
if no_unique_controls == 1 and replicated:
logger.info("Only a single control was specified "
"so using same control for replicates, pool and psuedoreplicates.")
single_control = True
else:
logger.info("Will merge only unique controls for pooled.")
single_control = False
# Pool the controls for checking
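# With multiple controls, compare their read depths: the ratio matrix
# decides further below whether a pooled control replaces an individual one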
if not single_control:
control_df = get_read_count_ratio(design_df)
control_files = design_df.control_tag_align.unique()
pool_control = pool(control_files, "pool_control", paired)
else:
pool_control = design_df.control_tag_align.unique()[0]
# if paired_end make tagAlign
if paired:
pool_control_tmp = bedpe_to_tagalign(pool_control, "pool_control")
pool_control = pool_control_tmp
# Duplicate rows for pool and pseudoreplicates; update tagAlign paths with single-end data
experiment_id = design_df.at[0, 'experiment_id']
replicate_files = design_df.tag_align.unique()
pool_experiment = pool(replicate_files, experiment_id + "_pooled", paired)
# Make 2 self psuedoreplicates
pseudoreplicates_dict = {}
for rep, tag_file in zip(design_df['replicate'], design_df['tag_align']):
replicate_prefix = experiment_id + '_' + str(rep)
pr_dict = self_psuedoreplication(tag_file, replicate_prefix, paired)
pseudoreplicates_dict[rep] = pr_dict
# Update design to include new self pseudo replicates
pseudoreplicates_df = pd.DataFrame.from_dict(pseudoreplicates_dict)
pool_pseudoreplicates_dict = {}
for index, row in pseudoreplicates_df.iterrows():
replicate_id = index + 1
pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz'
pool_replicate = pool(row, pr_filename, False)
pool_pseudoreplicates_dict[replicate_id] = pool_replicate
design_new_df = design_df
# Update tagAlign with single end data
if paired:
design_new_df['tag_align'] = design_new_df['se_tag_align']
design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
# If paired change to single End
if paired:
pool_experiment_se = bedpe_to_tagalign(pool_experiment, experiment_id + "_pooled")
else:
pool_experiment_se = pool_experiment
# Check controls against cutoff_ratio
# if so replace with pool_control
# unless single control was used
if not single_control:
path_to_pool_control = cwd + '/' + pool_control
if control_df.values.max() > cutoff_ratio:
logger.info("Number of reads in controls differ by " +
" > factor of %f. Using pooled controls." % (cutoff_ratio))
design_new_df['control_tag_align'] = path_to_pool_control
else:
for index, row in design_new_df.iterrows():
exp_no_reads = utils.count_lines(row['tag_align'])
con_no_reads = utils.count_lines(row['control_tag_align'])
if con_no_reads < exp_no_reads:
logger.info("Fewer reads in control than experiment." +
"Using pooled controls for replicate %s."
% row['replicate'])
design_new_df.loc[index, 'control_tag_align'] = \
path_to_pool_control
else:
if paired:
control = row['control_tag_align']
control_basename = os.path.basename(
utils.strip_extensions(control, STRIP_EXTENSIONS))
control_tmp = bedpe_to_tagalign(control, control_basename)
path_to_control = cwd + '/' + control_tmp
design_new_df.loc[index, 'control_tag_align'] = \
path_to_control
else:
if paired:
path_to_pool_control = cwd + '/' + pool_control
else:
path_to_pool_control = pool_control
design_new_df['control_tag_align'] = path_to_pool_control
# Add in pseudo replicates
tmp_metadata = design_new_df.loc[0].copy()
tmp_metadata['control_tag_align'] = path_to_pool_control
for rep, pseudorep_file in pool_pseudoreplicates_dict.items():
tmp_metadata['sample_id'] = experiment_id + '_pr' + str(rep)
tmp_metadata['replicate'] = str(rep) + '_pr'
tmp_metadata['xcor'] = 'Calculate'
path_to_file = cwd + '/' + pseudorep_file
tmp_metadata['tag_align'] = path_to_file
design_new_df = design_new_df.append(tmp_metadata)
# Add in pool experiment
tmp_metadata['sample_id'] = experiment_id + '_pooled'
tmp_metadata['replicate'] = 'pooled'
tmp_metadata['xcor'] = 'Calculate'
path_to_file = cwd + '/' + pool_experiment_se
tmp_metadata['tag_align'] = path_to_file
design_new_df = design_new_df.append(tmp_metadata)
return design_new_df
def main():
args = get_args()
paired = args.paired
design = args.design
cutoff_ratio = args.cutoff
# Create a file handler
handler = logging.FileHandler('experiment_generation.log')
logger.addHandler(handler)
# Read files as dataframes
design_df = pd.read_csv(design, sep='\t')
# Get current directory to build paths
cwd = os.getcwd()
# Check Number of replicates and replicates
no_reps = check_replicates(design_df)
no_unique_controls = check_controls(design_df)
# Generate new design file
design_new_df = generate_design(paired, cutoff_ratio, design_df, cwd, no_reps, no_unique_controls)
# Write out new dataframe
experiment_id = design_df.at[0, 'experiment_id']
design_new_df.to_csv(experiment_id + '_ppr.tsv',
header=True, sep='\t', index=False)
if __name__ == '__main__':
main()
#!/usr/bin/python
# programmer : bbc
# usage: main function to call all the procedures for chip-seq analysis
import sys
import os
import argparse as ap
import logging
import pandas as pd
import glob
import subprocess
from multiprocessing import Pool
import runDeepTools
import runMemechip
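# level=10 is logging.DEBUG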
logging.basicConfig(level=10)
def prepare_argparser():
description = "Make wig file for given bed using bam"
epilog = "For command line options of each command, type %(prog)% COMMAND -h"
argparser = ap.ArgumentParser(description=description, epilog = epilog)
argparser.add_argument("-i","--input",dest = "infile",type=str,required=True, help="input design file")
argparser.add_argument("-g","--genome",dest = "genome",type=str,required=True, help="genome", default="hg19")
argparser.add_argument("--top-peak",dest="toppeak",type=int, default=-1, help = "Only use top peaks for motif call")
#argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"])
#argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header")
return(argparser)
def memechip_wrapper(args):
#print args
runMemechip.run(*args)
def main():
argparser = prepare_argparser()
args = argparser.parse_args()
#dfile = pd.read_csv(args.infile)
#for testing, add testing path to all input files
test_path = "/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/"
designfile = pd.read_csv(args.infile)
designfile['Peaks'] = designfile['Peaks'].apply(lambda x: test_path+x)
designfile['bamReads'] = designfile['bamReads'].apply(lambda x: test_path+x)
designfile['bamControl'] = designfile['bamControl'].apply(lambda x: test_path+x)
designfile.to_csv(args.infile+"_new",index=False)
dfile = pd.read_csv(args.infile+"_new")
#call deeptools
runDeepTools.run(args.infile+"_new", args.genome)
#call diffbind
folder = os.path.dirname(os.path.abspath(__file__))
diffbind_command = "Rscript "+folder+"/runDiffBind.R "+args.infile+"_new"
#logging.debug(diffbind_command)
p = subprocess.Popen(diffbind_command, shell=True)
p.communicate()
#call chipseeker on original peaks and overlapping peaks
chipseeker_command = "Rscript "+folder+"/runChipseeker.R "+",".join(dfile['Peaks'].tolist())+" "+",".join(dfile['SampleID'])
p = subprocess.Popen(chipseeker_command, shell=True)
p.communicate()
overlapping_peaks = glob.glob('*diffbind.bed')
overlapping_peak_names = []
for pn in overlapping_peaks:
overlapping_peak_names.append(pn.split("_diffbind")[0].replace("!","non"))
chipseeker_overlap_command = "Rscript "+folder+"/runChipseeker.R "+",".join(overlapping_peaks)+" "+",".join(overlapping_peak_names)
p = subprocess.Popen(chipseeker_overlap_command, shell=True)
p.communicate()
#MEME-chip on all peaks
meme_arglist = zip(dfile['Peaks'].tolist(),[test_path+"hg19.2bit"]*dfile.shape[0],[str(args.toppeak)]*dfile.shape[0],dfile['SampleID'].tolist())
work_pool = Pool(min(12,dfile.shape[0]))
resultList = work_pool.map(memechip_wrapper, meme_arglist)
work_pool.close()
work_pool.join()
if __name__=="__main__":
main()
args = commandArgs(trailingOnly=TRUE)
library(ChIPseeker)
#Parse the genome path and get genome version
path_elements = unlist(strsplit(args[2],"[/]"))
genome = path_elements[length(path_elements)]
if(genome=="GRCh37")
{
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
}
if(genome=="GRCm38")
{
library(TxDb.Hsapiens.UCSC.mm10.knownGene)
txdb <- TxDb.Hsapiens.UCSC.mm10.knownGene
}
if(genome=="GRCh38")
{
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
}
design<-read.csv(args[1])
files<-as.list(as.character(design$Peaks))
names(files)<-design$SampleID
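# annotatePeak assigns each peak to its nearest gene model in the TxDb;
# the +/-3kb tssRegion window controls what counts as a promoter hit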
peakAnnoList <- lapply(files, annotatePeak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE)
for(index in seq_along(peakAnnoList))
{
filename<-paste(names(files)[index],".chipseeker_annotation.xls",sep="")
write.table(as.data.frame(peakAnnoList[[index]]),filename,sep="\t",quote=F)
#draw individual plot
pie_name <- paste(names(files)[index],".chipseeker_pie.pdf",sep="")
vennpie_name <- paste(names(files)[index],".chipseeker_vennpie.pdf",sep="")
upsetplot_name <- paste(names(files)[index],".chipseeker_upsetplot.pdf",sep="")
pdf(pie_name)
plotAnnoPie(peakAnnoList[[index]])
dev.off()
pdf(vennpie_name)
vennpie(peakAnnoList[[index]])
dev.off()
pdf(upsetplot_name)
upsetplot(peakAnnoList[[index]])
dev.off()
}