Commit 56539919 authored by Venkat Malladi

Merge branch '23-rename_output' into 'master'

Resolve "Use SampleIds/ Experiment Id as file names throughtout pipeline"

Closes #23

See merge request BICF/Astrocyte/chipseq_analysis!20
parents 4a653c8f d9919554
1 merge request: !20 Resolve "Use SampleIds/Experiment Id as file names throughout pipeline"
Pipeline #3192 passed with stage in 9 seconds
Showing changes with 342 additions and 245 deletions
......@@ -97,3 +97,6 @@ ENV/
# mypy
.mypy_cache/
# Mac OS
.DS_Store
before_script:
- module add python/3.6.1-2-anaconda
- pip install --user pytest-pythonpath pytest-cov
- pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1
- module load nextflow/0.31.0
- ln -s /work/BICF/s163035/chipseq/*fastq.gz test_data/
- ln -s /project/shared/bicf_workflow_ref/workflow_testdata/chipseq/*fastq.gz test_data/
stages:
- unit
......@@ -17,6 +17,10 @@ user_configuration:
single_end_mouse:
stage: single
only:
- master
except:
- branches
script:
- nextflow run workflow/main.nf -resume
- pytest -m singleend
......@@ -25,6 +29,10 @@ single_end_mouse:
paired_end_human:
stage: single
only:
- branches
except:
- master
script:
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_ENCSR729LGA_PE.txt" --genome 'GRCh38' --pairedEnd true -resume
- pytest -m pairedend
......@@ -33,6 +41,10 @@ paired_end_human:
single_end_diff:
stage: multiple
only:
- branches
except:
- master
script:
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_SE.txt" --genome 'GRCm38' -resume
- pytest -m singlediff
......@@ -40,6 +52,10 @@ single_end_diff:
expire_in: 2 days
paired_end_diff:
only:
- master
except:
- branches
stage: multiple
script:
- nextflow run workflow/main.nf --designFile "$CI_PROJECT_DIR/test_data/design_diff_PE.txt" --genome 'GRCh38' --pairedEnd true -resume
......
File deleted
......@@ -105,12 +105,12 @@ process trimReads {
if (pairedEnd) {
"""
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -p
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} ${reads[1]} -s $sampleId -p
"""
}
else {
"""
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]}
python3 $baseDir/scripts/trim_reads.py -f ${reads[0]} -s $sampleId
"""
}
......@@ -130,18 +130,18 @@ process alignReads {
output:
set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into mappedReads
file '*.srt.bam.flagstat.qc' into mappedReadsStats
file '*.flagstat.qc' into mappedReadsStats
script:
if (pairedEnd) {
"""
python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -p
python3 $baseDir/scripts/map_reads.py -f ${reads[0]} ${reads[1]} -r ${index}/genome.fa -s $sampleId -p
"""
}
else {
"""
python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa
python3 $baseDir/scripts/map_reads.py -f $reads -r ${index}/genome.fa -s $sampleId
"""
}
......@@ -161,9 +161,9 @@ process filterReads {
set sampleId, file('*.bam'), file('*.bai'), experimentId, biosample, factor, treatment, replicate, controlId into dedupReads
set sampleId, file('*.bam'), experimentId, biosample, factor, treatment, replicate, controlId into convertReads
file '*flagstat.qc' into dedupReadsStats
file '*pbc.qc' into dedupReadsComplexity
file '*dup.qc' into dupReads
file '*.flagstat.qc' into dedupReadsStats
file '*.pbc.qc' into dedupReadsComplexity
file '*.dedup.qc' into dupReads
script:
......@@ -342,6 +342,7 @@ process callPeaksMACS {
output:
set sampleId, file('*.narrowPeak'), file('*.fc_signal.bw'), file('*.pvalue_signal.bw'), experimentId, biosample, factor, treatment, replicate, controlId into experimentPeaks
file '*.xls' into summit
script:
......@@ -368,6 +369,7 @@ peaksDesign = experimentPeaks
process consensusPeaks {
publishDir "$outDir/${task.process}", mode: 'copy'
publishDir "$outDir/design", mode: 'copy', pattern: '*.{csv|tsv}'
input:
......@@ -393,7 +395,7 @@ process consensusPeaks {
// Annotate Peaks
process peakAnnotation {
publishDir "$baseDir/output/${task.process}", mode: 'copy'
publishDir "$outDir/${task.process}", mode: 'copy'
input:
......@@ -414,7 +416,7 @@ process peakAnnotation {
// Motif Search Peaks
process motifSearch {
publishDir "$baseDir/output/${task.process}", mode: 'copy'
publishDir "$outDir/${task.process}", mode: 'copy'
input:
......@@ -423,7 +425,7 @@ process motifSearch {
output:
file "*memechip" into motifSearch
file "sorted-*" into filteredPeaks
file "*narrowPeak" into filteredPeaks
script:
......@@ -439,7 +441,7 @@ uniqueExperimentsList = uniqueExperiments
// Calculate Differential Binding Activity
process diffPeaks {
publishDir "$baseDir/output/${task.process}", mode: 'copy'
publishDir "$outDir/${task.process}", mode: 'copy'
input:
......@@ -456,7 +458,6 @@ process diffPeaks {
when:
noUniqueExperiments > 1
script:
"""
Rscript $baseDir/scripts/diff_peaks.R $designDiffPeaks
......
......@@ -17,7 +17,7 @@ args <- commandArgs(trailingOnly=TRUE)
# Check input args
if (length(args) != 2) {
stop("Usage: annotate_peaks.R [ annotate_design.tsv ] [ genome_assembly ]", call.=FALSE)
stop("Usage: annotate_peaks.R annotate_design.tsv genome_assembly", call.=FALSE)
}
design_file <- args[1]
......
......@@ -6,7 +6,6 @@ import os
import argparse
import shutil
import logging
from multiprocessing import cpu_count
import utils
from xcor import xcor as calculate_xcor
......@@ -100,6 +99,7 @@ def check_tools():
def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes):
'''Call peaks and signal tracks'''
# Extract the fragment length estimate from column 3 of the
# cross-correlation scores file
......@@ -107,7 +107,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
firstline = xcor_fh.readline()
frag_lengths = firstline.split()[2] # third column
fragment_length = frag_lengths.split(',')[0] # grab first value
logger.info("Fraglen %s" % (fragment_length))
logger.info("Fraglen %s", fragment_length)
# Generate narrow peaks and preliminary signal tracks
......@@ -119,18 +119,19 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
logger.info(command)
returncode = utils.block_on(command)
logger.info("MACS2 exited with returncode %d" % (returncode))
logger.info("MACS2 exited with returncode %d", returncode)
assert returncode == 0, "MACS2 non-zero return"
# MACS2 sometimes calls features off the end of chromosomes.
# Remove coordinates outside chromosome sizes
narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
int_narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
narrowpeak_fn = '%s.narrowPeak' % (prefix)
clipped_narrowpeak_fn = 'clipped-%s' % (narrowpeak_fn)
steps = ['slopBed -i %s -g %s -b 0' % (narrowpeak_fn, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]
steps = ['slopBed -i %s -g %s -b 0' % (int_narrowpeak_fn, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]
out, err = utils.run_pipe(steps)
......@@ -161,7 +162,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
logger.info(command)
returncode = utils.block_on(command)
logger.info("MACS2 exited with returncode %d" % (returncode))
logger.info("MACS2 exited with returncode %d", returncode)
assert returncode == 0, "MACS2 non-zero return"
# Remove coordinates outside chromosome sizes (MACS2 bug)
......@@ -169,7 +170,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
fc_bedgraph_sorted_fn = 'sorted-%s' % (fc_bedgraph_fn)
fc_signal_fn = "%s.fc_signal.bw" % (prefix)
steps = ['slopBed -i %s_FE.bdg -g %s -b 0' % (prefix, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)]
'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)]
out, err = utils.run_pipe(steps)
......@@ -185,7 +186,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
logger.info(command)
returncode = utils.block_on(command)
logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
logger.info("bedGraphToBigWig exited with returncode %d", returncode)
assert returncode == 0, "bedGraphToBigWig non-zero return"
# For -log10(p-value) signal tracks
......@@ -199,7 +200,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
sval = str(min(float(chip_reads), float(control_reads)) / 1000000)
logger.info("chip_reads = %s, control_reads = %s, sval = %s" %
(chip_reads, control_reads, sval))
(chip_reads, control_reads, sval))
command = 'macs2 bdgcmp ' + \
'-t %s_treat_pileup.bdg ' % (prefix) + \
......@@ -216,7 +217,7 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
pvalue_bedgraph_sorted_fn = 'sort-%s' % (pvalue_bedgraph_fn)
pvalue_signal_fn = "%s.pvalue_signal.bw" % (prefix)
steps = ['slopBed -i %s_ppois.bdg -g %s -b 0' % (prefix, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)]
'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)]
out, err = utils.run_pipe(steps)
......@@ -232,12 +233,13 @@ def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes)
logger.info(command)
returncode = utils.block_on(command)
logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
logger.info("bedGraphToBigWig exited with returncode %d", returncode)
assert returncode == 0, "bedGraphToBigWig non-zero return"
# Remove temporary files
os.remove(clipped_narrowpeak_fn)
os.remove(rescaled_narrowpeak_fn)
os.remove(int_narrowpeak_fn)
def main():
......
......@@ -72,6 +72,46 @@ def check_design_headers(design, paired):
raise Exception("Missing column headers: %s" % list(missing_headers))
def check_samples(design):
'''Check if design file has the correct sample name mapping.'''
logger.info("Running sample check.")
samples = design.groupby('sample_id') \
.apply(list)
malformated_samples = []
chars = set('-.')
for sample in samples.index.values:
if any(char.isspace() for char in sample) or any((char in chars) for char in sample):
malformated_samples.append(sample)
if len(malformated_samples) > 0:
logger.error('Malformed samples from design file: %s', list(malformated_samples))
raise Exception("Malformed samples from design file: %s" %
list(malformated_samples))
def check_experiments(design):
'''Check if design file has the correct experiment name mapping.'''
logger.info("Running experiment check.")
experiments = design.groupby('experiment_id') \
.apply(list)
malformated_experiments = []
chars = set('-.')
for experiment in experiments.index.values:
if any(char.isspace() for char in experiment) or any((char in chars) for char in experiment):
malformated_experiments.append(experiment)
if len(malformated_experiments) > 0:
logger.error('Malformed experiment from design file: %s', list(malformated_experiments))
raise Exception("Malformed experiment from design file: %s" %
list(malformated_experiments))
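These two checks enforce that sample and experiment IDs contain no whitespace, hyphens, or periods, since the IDs become file-name prefixes throughout the pipeline. A minimal sketch of the rule applied to hypothetical identifiers:

# Sketch of the naming rule (hypothetical IDs for illustration)
chars = set('-.')
for name in ['ENCLB144FDT', 'sample 1', 'exp-2', 'rep.3']:
    bad = any(c.isspace() for c in name) or any(c in chars for c in name)
    print(name, 'malformed' if bad else 'ok')
# ENCLB144FDT passes; the other three would be rejected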
def check_controls(design):
'''Check if design file has the correct control mapping.'''
......@@ -146,8 +186,8 @@ def main():
logger.addHandler(handler)
# Read files as dataframes
design_df = pd.read_csv(args.design, sep='\t')
fastq_df = pd.read_csv(args.fastq, sep='\t', names=['name', 'path'])
design_df = pd.read_csv(design, sep='\t')
fastq_df = pd.read_csv(fastq, sep='\t', names=['name', 'path'])
# Check design file
check_design_headers(design_df, paired)
......
......@@ -91,8 +91,7 @@ def convert_mapped_pe(bam, bam_basename):
out, err = utils.run_pipe([
"bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename),
"gzip -nc"],
outfile=bedpe_filename)
"gzip -nc"], outfile=bedpe_filename)
def main():
......@@ -109,7 +108,7 @@ def main():
# Convert PE or SE to tagAlign
bam_basename = os.path.basename(
utils.strip_extensions(bam, ['.bam']))
utils.strip_extensions(bam, ['.bam', '.dedup']))
tag_filename = bam_basename + '.tagAlign.gz'
convert_mapped(bam, tag_filename)
......@@ -117,7 +116,7 @@ def main():
if paired: # paired-end data
convert_mapped_pe(bam, bam_basename)
else:
bedse_filename = bam_basename + ".bedse.gz"
bedse_filename = bam_basename + ".bedse.gz"
shutil.copy(tag_filename, bedse_filename)
......
......@@ -8,7 +8,7 @@ args <- commandArgs(trailingOnly=TRUE)
# Check input args
if (length(args) != 1) {
stop("Usage: diff_peaks.R [ annotate_design.tsv ] ", call.=FALSE)
stop("Usage: diff_peaks.R annotate_design.tsv ", call.=FALSE)
}
# Build DBA object from design file
......
......@@ -7,8 +7,9 @@ import argparse
import logging
import subprocess
import shutil
import pandas as pd
from multiprocessing import cpu_count
import pandas as pd
EPILOG = '''
For more details:
......@@ -196,11 +197,11 @@ def main():
new_design_df = update_controls(design_df)
for index, row in new_design_df.iterrows():
check_enrichment(
row['sample_id'],
row['control_id'],
row['bam_reads'],
row['control_reads'],
extension)
row['sample_id'],
row['control_id'],
row['bam_reads'],
row['control_reads'],
extension)
if __name__ == '__main__':
......
......@@ -106,7 +106,7 @@ def filter_mapped_pe(bam, bam_basename):
# Will produce name sorted BAM
"samtools sort -n -@ %d -o %s" % (cpu_count(), tmp_filt_bam_filename)])
if err:
logger.error("samtools filter error: %s" % (err))
logger.error("samtools filter error: %s", err)
# Remove orphan reads (pair was removed)
# and read pairs mapping to different chromosomes
......@@ -136,7 +136,7 @@ def filter_mapped_se(bam, bam_basename):
# not primary alignment, reads failing platform
# Remove low MAPQ reads
# Obtain name sorted BAM file
with open(filt_bam_filename, 'w') as fh:
with open(filt_bam_filename, 'w') as temp_file:
samtools_filter_command = (
"samtools view -F 1804 -q 30 -b %s"
% (bam)
......@@ -144,7 +144,7 @@ def filter_mapped_se(bam, bam_basename):
logger.info(samtools_filter_command)
subprocess.check_call(
shlex.split(samtools_filter_command),
stdout=fh)
stdout=temp_file)
return filt_bam_filename
......@@ -153,11 +153,11 @@ def dedup_mapped(bam, bam_basename, paired):
'''Use sambamba and samtools to remove duplicates.'''
# Mark duplicates
dup_file_qc_filename = bam_basename + ".dup.qc"
dup_file_qc_filename = bam_basename + ".dedup.qc"
tmp_dup_mark_filename = bam_basename + ".dupmark.bam"
sambamba_params = "--hash-table-size=17592186044416" + \
" --overflow-list-size=20000000 --io-buffer-size=256"
with open(dup_file_qc_filename, 'w') as fh:
with open(dup_file_qc_filename, 'w') as temp_file:
sambamba_markdup_command = (
"sambamba markdup -t %d %s --tmpdir=%s %s %s"
% (cpu_count(), sambamba_params, os.getcwd(), bam, tmp_dup_mark_filename)
......@@ -165,11 +165,11 @@ def dedup_mapped(bam, bam_basename, paired):
logger.info(sambamba_markdup_command)
subprocess.check_call(
shlex.split(sambamba_markdup_command),
stderr=fh)
stderr=temp_file)
# Remove duplicates
final_bam_prefix = bam_basename + ".filt.nodup"
final_bam_prefix = bam_basename + ".dedup"
final_bam_filename = final_bam_prefix + ".bam"
if paired: # paired-end data
......@@ -179,11 +179,11 @@ def dedup_mapped(bam, bam_basename, paired):
samtools_dedupe_command = \
"samtools view -F 1804 -b %s" % (tmp_dup_mark_filename)
with open(final_bam_filename, 'w') as fh:
with open(final_bam_filename, 'w') as temp_file:
logger.info(samtools_dedupe_command)
subprocess.check_call(
shlex.split(samtools_dedupe_command),
stdout=fh)
stdout=temp_file)
# Index final bam file
sambamba_index_command = \
......@@ -192,12 +192,12 @@ def dedup_mapped(bam, bam_basename, paired):
subprocess.check_output(shlex.split(sambamba_index_command))
# Generate mapping statistics
final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"
with open(final_bam_file_mapstats_filename, 'w') as fh:
mapstats_filename = final_bam_prefix + ".flagstat.qc"
with open(mapstats_filename, 'w') as temp_file:
flagstat_command = "sambamba flagstat -t %d %s" \
% (cpu_count(), final_bam_filename)
logger.info(flagstat_command)
subprocess.check_call(shlex.split(flagstat_command), stdout=fh)
subprocess.check_call(shlex.split(flagstat_command), stdout=temp_file)
os.remove(bam)
return tmp_dup_mark_filename
......@@ -206,7 +206,7 @@ def dedup_mapped(bam, bam_basename, paired):
def compute_complexity(bam, paired, bam_basename):
'''Calculate library complexity.'''
pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc"
pbc_file_qc_filename = bam_basename + ".pbc.qc"
tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename)
# Sort by name
......@@ -223,12 +223,12 @@ def compute_complexity(bam, paired, bam_basename):
# PBC1=OnePair/Distinct [tab]
# PBC2=OnePair/TwoPair
pbc_headers = ['TotalReadPairs',
'DistinctReadPairs',
'OneReadPair',
'TwoReadPairs',
'NRF',
'PBC1',
'PBC2']
'DistinctReadPairs',
'OneReadPair',
'TwoReadPairs',
'NRF',
'PBC1',
'PBC2']
if paired:
steps = [
......@@ -247,11 +247,11 @@ def compute_complexity(bam, paired, bam_basename):
])
out, err = utils.run_pipe(steps, tmp_pbc_file_qc_filename)
if err:
logger.error("PBC file error: %s" % (err))
logger.error("PBC file error: %s", err)
# Add headers
pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None,
names=pbc_headers)
names=pbc_headers)
pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False)
os.remove(bam)
os.remove(bam + '.bai')
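The header comment above compute_complexity defines the library-complexity metrics; a worked sketch with made-up counts:

# Hypothetical counts illustrating the formulas in the header comment
total, distinct, one_pair, two_pair = 10_000_000, 8_000_000, 7_000_000, 500_000
nrf = distinct / total      # NRF  = Distinct/Total   -> 0.8
pbc1 = one_pair / distinct  # PBC1 = OnePair/Distinct -> 0.875
pbc2 = one_pair / two_pair  # PBC2 = OnePair/TwoPair  -> 14.0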
......
......@@ -9,7 +9,6 @@ import shutil
import shlex
import logging
from multiprocessing import cpu_count
import json
import utils
EPILOG = '''
......@@ -33,7 +32,7 @@ STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
......@@ -47,6 +46,10 @@ def get_args():
help="The bwa index of the reference genome.",
required=True)
parser.add_argument('-s', '--sample',
help="The name of the sample.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
......@@ -101,7 +104,7 @@ def generate_sa(fastq, reference):
def align_se(fastq, sai, reference, fastq_basename):
'''Use BWA to align SE data.'''
bam_filename = '%s.srt.bam' % (fastq_basename)
bam_filename = '%s.bam' % (fastq_basename)
steps = [
"bwa samse %s %s %s"
......@@ -112,7 +115,7 @@ def align_se(fastq, sai, reference, fastq_basename):
out, err = utils.run_pipe(steps)
if err:
logger.error("samse/samtools error: %s" % (err))
logger.error("samse/samtools error: %s", err)
return bam_filename
......@@ -122,21 +125,21 @@ def align_pe(fastq, sai, reference, fastq_basename):
sam_filename = "%s.sam" % (fastq_basename)
badcigar_filename = "%s.badReads" % (fastq_basename)
bam_filename = '%s.srt.bam' % (fastq_basename)
bam_filename = '%s.bam' % (fastq_basename)
# Remove read pairs with bad CIGAR strings and sort by position
steps = [
"bwa sampe -P %s %s %s %s %s"
% (reference, sai[0], sai[1],
fastq[0], fastq[1]),
"tee %s" % (sam_filename),
r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
"sort",
"uniq"]
"bwa sampe -P %s %s %s %s %s"
% (reference, sai[0], sai[1],
fastq[0], fastq[1]),
"tee %s" % (sam_filename),
r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
"sort",
"uniq"]
out, err = utils.run_pipe(steps, badcigar_filename)
if err:
logger.error("sampe error: %s" % (err))
logger.error("sampe error: %s", err)
steps = [
"cat %s" % (sam_filename),
......@@ -147,7 +150,7 @@ def align_pe(fastq, sai, reference, fastq_basename):
out, err = utils.run_pipe(steps)
if err:
logger.error("samtools error: %s" % (err))
logger.error("samtools error: %s", err)
return bam_filename
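The awk one-liner above flags read pairs whose CIGAR operation lengths no longer sum to the sequence length once deletions are dropped. A rough Python equivalent of that per-record test (mirroring the awk logic, which only discounts D operations; header lines are skipped upstream):

import re

def has_bad_cigar(sam_fields):
    '''sam_fields: one SAM record split on tabs (CIGAR = col 6, SEQ = col 10).'''
    cigar, seq = sam_fields[5], sam_fields[9]
    if cigar == '*':
        return False
    cigar = re.sub(r'[0-9]+D', '', cigar)  # deletions consume no query bases
    consumed = sum(int(n) for n in re.findall(r'[0-9]+', cigar))
    return consumed != len(seq)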
......@@ -157,6 +160,7 @@ def main():
paired = args.paired
fastq = args.fastq
reference = args.reference
sample = args.sample
# Create a file handler
handler = logging.FileHandler('map.log')
......@@ -167,31 +171,25 @@ def main():
# Run Suffix Array generation
sai = []
for fq in fastq:
sai_filename = generate_sa(fq, reference)
for fastq_file in fastq:
sai_filename = generate_sa(fastq_file, reference)
sai.append(sai_filename)
# Make file basename
fastq_basename = sample
# Run alignment for either PE or SE
if paired: # paired-end data
fastq_r1_basename = os.path.basename(
utils.strip_extensions(fastq[0], STRIP_EXTENSIONS))
fastq_r2_basename = os.path.basename(
utils.strip_extensions(fastq[1], STRIP_EXTENSIONS))
fastq_basename = fastq_r1_basename + fastq_r2_basename
bam_filename = align_pe(fastq, sai, reference, fastq_basename)
else:
fastq_basename = os.path.basename(
utils.strip_extensions(fastq[0], STRIP_EXTENSIONS))
bam_filename = align_se(fastq, sai, reference, fastq_basename)
bam_mapstats_filename = '%s.srt.bam.flagstat.qc' % (fastq_basename)
with open(bam_mapstats_filename, 'w') as fh:
bam_mapstats_filename = '%s.flagstat.qc' % (fastq_basename)
with open(bam_mapstats_filename, 'w') as temp_file:
subprocess.check_call(
shlex.split("samtools flagstat %s" % (bam_filename)),
stdout=fh)
stdout=temp_file)
# Remove sai files
for sai_file in sai:
......
......@@ -3,16 +3,13 @@
'''Call Motifs on called peaks.'''
import os
import sys
import re
from re import sub
import string
import argparse
import logging
import subprocess
import shutil
from multiprocessing import Pool
import pandas as pd
import utils
from multiprocessing import Pool
EPILOG = '''
For more details:
......@@ -27,6 +24,13 @@ logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.narrowPeak', '.replicated']
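utils.strip_extensions itself is not shown in this diff; based on the comment above, a plausible sketch of its right-inward behavior:

def strip_extensions(filename, extensions):
    '''Strip each listed extension from the right end, in list order (sketch).'''
    basename = filename
    for extension in extensions:
        if basename.endswith(extension):
            basename = basename[:-len(extension)]
    return basename

# strip_extensions('ENCSR238SGC.replicated.narrowPeak',
#                  ['.narrowPeak', '.replicated'])  ->  'ENCSR238SGC'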
def get_args():
'''Define arguments.'''
......@@ -51,19 +55,47 @@ def get_args():
# Functions
def check_tools():
'''Checks for required components on user system'''
logger.info('Checking for required libraries and components on this system')
meme_path = shutil.which("meme")
if meme_path:
logger.info('Found meme: %s', meme_path)
else:
logger.error('Missing meme')
raise Exception('Missing meme')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def run_wrapper(args):
motif_search(*args)
motif_search(*args)
def motif_search(filename, genome, experiment, peak):
'''Run motif search on peaks.'''
file_basename = os.path.basename(
utils.strip_extensions(filename, STRIP_EXTENSIONS))
file_basename = os.path.basename(filename)
sorted_fn = 'sorted-%s' % (file_basename)
out_fa = '%s.fa' % (experiment)
out_motif = '%s_memechip' % (experiment)
# Sort Bed file and limit number of peaks
if peak == -1:
peak = utils.count_lines(filename)
peak_no = 'all'
else:
peak_no = peak
sorted_fn = '%s.%s.narrowPeak' % (file_basename, peak)
out, err = utils.run_pipe([
'sort -k %dgr,%dgr %s' % (5, 5, filename),
......@@ -73,9 +105,16 @@ def motif_search(filename, genome, experiment, peak):
out, err = utils.run_pipe([
'bedtools getfasta -fi %s -bed %s -fo %s' % (genome, sorted_fn, out_fa)])
if err:
logger.error("bedtools error: %s", err)
# Call meme-chip
out, err = utils.run_pipe([
'meme-chip -oc %s -meme-minw 5 -meme-maxw 15 -meme-nmotifs 10 %s -norand' % (out_motif, out_fa)])
if err:
logger.error("meme-chip error: %s", err)
def main():
args = get_args()
......@@ -83,14 +122,22 @@ def main():
genome = args.genome
peak = args.peak
# Create a file handler
handler = logging.FileHandler('motif.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Read files
design_df = pd.read_csv(design, sep='\t')
meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0])
work_pool = Pool(min(12,design_df.shape[0]))
return_list = work_pool.map(run_wrapper, meme_arglist)
meme_arglist = zip(design_df['Peaks'].tolist(), [genome]*design_df.shape[0], design_df['Condition'].tolist(), [peak]*design_df.shape[0])
work_pool = Pool(min(12, design_df.shape[0]))
return_list = work_pool.map(run_wrapper, meme_arglist)
work_pool.close()
work_pool.join()
if __name__=="__main__":
main()
if __name__ == '__main__':
main()
......@@ -113,9 +113,9 @@ def overlap(experiment, design):
# with any one of the overlapping peak pairs >= 0.5
steps_true = ['intersectBed -wo -a %s -b %s' % (pool_peaks, true_rep_peaks[0]),
awk_command,
cut_command,
'sort -u']
awk_command,
cut_command,
'sort -u']
iter_true_peaks = iter(true_rep_peaks)
next(iter_true_peaks)
......@@ -123,13 +123,13 @@ def overlap(experiment, design):
if len(true_rep_peaks) > 1:
for true_peak in true_rep_peaks[1:]:
steps_true.extend(['intersectBed -wo -a stdin -b %s' % (true_peak),
awk_command,
cut_command,
'sort -u'])
awk_command,
cut_command,
'sort -u'])
out, err = utils.run_pipe(steps_true, outfile=overlap_tr_fn)
print("%d peaks overlap with both true replicates" %
(utils.count_lines(overlap_tr_fn)))
(utils.count_lines(overlap_tr_fn)))
# Find pooled peaks that overlap PseudoRep1 and PseudoRep2
# where overlap is defined as the fractional overlap
......@@ -146,7 +146,7 @@ def overlap(experiment, design):
out, err = utils.run_pipe(steps_pseudo, outfile=overlap_pr_fn)
print("%d peaks overlap with both pooled pseudoreplicates"
% (utils.count_lines(overlap_pr_fn)))
% (utils.count_lines(overlap_pr_fn)))
# Make union of peak lists
out, err = utils.run_pipe([
......@@ -154,7 +154,7 @@ def overlap(experiment, design):
'sort -u'
], overlapping_peaks_fn)
print("%d peaks overlap with true replicates or with pooled pseudorepliates"
% (utils.count_lines(overlapping_peaks_fn)))
% (utils.count_lines(overlapping_peaks_fn)))
# Make rejected peak list
out, err = utils.run_pipe([
......@@ -187,7 +187,7 @@ def main():
# Make a design file for annotating Peaks
anno_cols = ['Condition', 'Peaks']
design_anno = pd.DataFrame(columns = anno_cols)
design_anno = pd.DataFrame(columns=anno_cols)
# Find consenus overlap peaks for each experiment
for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
......@@ -197,16 +197,16 @@ def main():
# Write out design files
design_diff.columns = ['SampleID',
'bamReads',
'Condition',
'Tissue',
'Factor',
'Treatment',
'Replicate',
'ControlID',
'bamControl',
'Peaks',
'PeakCaller']
'bamReads',
'Condition',
'Tissue',
'Factor',
'Treatment',
'Replicate',
'ControlID',
'bamControl',
'Peaks',
'PeakCaller']
design_diff.to_csv("design_diffPeaks.csv", header=True, sep=',', index=False)
design_anno.to_csv("design_annotatePeaks.tsv", header=True, sep='\t', index=False)
......
......@@ -3,11 +3,14 @@
'''Generate pooled and pseudoreplicate from data.'''
import argparse
import utils
import logging
import os
import subprocess
import shutil
import shlex
import pandas as pd
import numpy as np
import os
import utils
EPILOG = '''
For more details:
......@@ -21,6 +24,12 @@ logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', '.bedpe']
def get_args():
'''Define arguments.'''
......@@ -87,21 +96,23 @@ def pool(tag_files, outfile, paired):
else:
file_extension = '.bedse.gz'
pooled_filename = outfile + file_extension
pool_basename = os.path.basename(
utils.strip_extensions(outfile, STRIP_EXTENSIONS))
pooled_filename = pool_basename + file_extension
# Merge files
out, err = utils.run_pipe([
'gzip -dc %s' % (' '.join(tag_files)),
'gzip -cn'],
outfile=pooled_filename)
'gzip -cn'], outfile=pooled_filename)
return pooled_filename
def bedpe_to_tagalign(tag_file, outfile):
'''Convert read pairs to reads itno standard tagAlign file.'''
'''Convert read pairs to reads into standard tagAlign file.'''
se_tag_filename = outfile + "bedse.tagAlign.gz"
se_tag_filename = outfile + ".tagAlign.gz"
# Convert read pairs to reads into standard tagAlign file
tag_steps = ["zcat -f %s" % (tag_file)]
......@@ -122,7 +133,7 @@ def self_psuedoreplication(tag_file, prefix, paired):
lines_per_rep = (no_lines+1)/2
# Make a dict of pseudoreplicate file names
pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.bedse.tagAlign.gz'
pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.tagAlign.gz'
for r in [0, 1]}
# Shuffle and split file into equal parts
......@@ -132,21 +143,26 @@ def self_psuedoreplication(tag_file, prefix, paired):
splits_prefix = 'temp_split'
out, err = utils.run_pipe([
'gzip -dc %s' % (tag_file),
'shuf --random-source=%s' % (tag_file),
'split -d -l %d - %s' % (lines_per_rep, splits_prefix)])
psuedo_command = 'bash -c "zcat {} | shuf --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {} | wc -c) -nosalt </dev/zero 2>/dev/null) | '
psuedo_command += 'split -d -l {} - {}."'
psuedo_command = psuedo_command.format(
tag_file,
tag_file,
int(lines_per_rep),
splits_prefix)
logger.info("Running psuedo with %s", psuedo_command)
subprocess.check_call(shlex.split(psuedo_command))
# Convert read pairs to reads into standard tagAlign file
for i, index in enumerate([0, 1]):
string_index = '0' + str(index)
string_index = '.0' + str(index)
steps = ['cat %s' % (splits_prefix + string_index)]
if paired:
steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
steps.extend(['gzip -cn'])
out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[i])
os.remove(splits_prefix + string_index)
return pseudoreplicate_dict
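The openssl stream above derives a reproducible byte stream from the file's decompressed size, so shuf produces the same split for the same input on every run. Conceptually (not the actual command) this is equivalent to a seeded shuffle:

import random

def deterministic_split(lines, seed):
    '''Reproducibly shuffle, then halve into two pseudoreplicates (sketch).'''
    rng = random.Random(seed)  # same seed -> same ordering every run
    shuffled = list(lines)
    rng.shuffle(shuffled)
    half = (len(shuffled) + 1) // 2  # mirrors lines_per_rep above
    return shuffled[:half], shuffled[half:]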
......@@ -213,7 +229,7 @@ def main():
# Update tagAlign with single end data
if paired:
design_new_df['tag_align'] = design_new_df['se_tag_align']
design_new_df.drop(labels = 'se_tag_align', axis = 1, inplace = True)
design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
design_new_df['replicate'] = design_new_df['replicate'].astype(str)
design_new_df.at[1, 'sample_id'] = experiment_id + '_pr1'
......@@ -243,8 +259,6 @@ def main():
# Drop index column
design_new_df.drop(labels='index', axis=1, inplace=True)
else:
# Make pool of replicates
replicate_files = design_df.tag_align.unique()
......@@ -269,7 +283,7 @@ def main():
pool_pseudoreplicates_dict = {}
for index, row in pseudoreplicates_df.iterrows():
replicate_id = index + 1
pr_filename = experiment_id + ".pr" + str(replicate_id) + '.bedse.gz'
pr_filename = experiment_id + ".pr" + str(replicate_id) + '.tagAlign.gz'
pool_replicate = pool(row, pr_filename, False)
pool_pseudoreplicates_dict[replicate_id] = pool_replicate
......@@ -277,16 +291,16 @@ def main():
# Update tagAlign with single end data
if paired:
design_new_df['tag_align'] = design_new_df['se_tag_align']
design_new_df.drop(labels = 'se_tag_align', axis = 1, inplace = True)
design_new_df.drop(labels='se_tag_align', axis=1, inplace=True)
# Check controls against cutoff_ratio
# if so replace with pool_control
# unless single control was used
if not single_control:
path_to_pool_control = cwd + '/' + pool_control
if control_df.values.max() > 1.2:
if control_df.values.max() > cutoff_ratio:
logger.info("Number of reads in controls differ by " +
" > factor of %f. Using pooled controls." % (cutoff_ratio))
" > factor of %f. Using pooled controls." % (cutoff_ratio))
design_new_df['control_tag_align'] = path_to_pool_control
else:
for index, row in design_new_df.iterrows():
......@@ -302,14 +316,14 @@ def main():
if paired:
control = row['control_tag_align']
control_basename = os.path.basename(
utils.strip_extensions(control, ['.filt.nodup.bedpe.gz']))
control_tmp = bedpe_to_tagalign(control , "control_basename")
utils.strip_extensions(control, STRIP_EXTENSIONS))
control_tmp = bedpe_to_tagalign(control, control_basename)
path_to_control = cwd + '/' + control_tmp
design_new_df.loc[index, 'control_tag_align'] = \
path_to_control
else:
path_to_pool_control = pool_control
path_to_pool_control = pool_control
# Add in pseudo replicates
tmp_metadata = design_new_df.loc[0].copy()
......@@ -333,7 +347,7 @@ def main():
# Write out new dataframe
design_new_df.to_csv(experiment_id + '_ppr.tsv',
header=True, sep='\t', index=False)
header=True, sep='\t', index=False)
if __name__ == '__main__':
......
#!/usr/bin/python
# programmer : bbc
# usage:
import sys
import argparse as ap
import logging
import subprocess
import pandas as pd
from multiprocessing import Pool
logging.basicConfig(level=10)
def prepare_argparser():
description = "Make wig file for given bed using bam"
epilog = "For command line options of each command, type %(prog)% COMMAND -h"
argparser = ap.ArgumentParser(description=description, epilog=epilog)
argparser.add_argument("-i", "--input", dest="infile", type=str, required=True, help="input BAM file")
argparser.add_argument("-g", "--genome", dest="genome", type=str, required=True, help="genome", default="hg19")
#argparser.add_argument("-b","--bed",dest="bedfile",type=str,required=True, help = "Gene locus in bed format")
#argparser.add_argument("-s","--strandtype",dest="stranded",type=str,default="none", choices=["none","reverse","yes"])
#argparser.add_argument("-n","--name",dest="trackName",type=str,default="UserTrack",help = "track name for bedgraph header")
return(argparser)
def run_qc(files, controls, labels):
mbs_command = "multiBamSummary bins --bamfiles "+' '.join(files)+" -out sample_mbs.npz"
p = subprocess.Popen(mbs_command, shell=True)
#logging.debug(mbs_command)
p.communicate()
pcor_command = "plotCorrelation -in sample_mbs.npz --corMethod spearman --skipZeros --plotTitle \"Spearman Correlation of Read Counts\" --whatToPlot heatmap --colorMap RdYlBu --plotNumbers -o experiment.deeptools.heatmap_spearmanCorr_readCounts_v2.png --labels "+" ".join(labels)
#logging.debug(pcor_command)
p = subprocess.Popen(pcor_command, shell=True)
p.communicate()
# plotCoverage
pcov_command = "plotCoverage -b "+" ".join(files)+" --plotFile experiment.deeptools_coverage.png -n 1000000 --plotTitle \"sample coverage\" --ignoreDuplicates --minMappingQuality 10"
p = subprocess.Popen(pcov_command, shell=True)
p.communicate()
# Draw fingerprint plots
for treat, ctrl, name in zip(files, controls, labels):
fp_command = "plotFingerprint -b "+treat+" "+ctrl+" --labels "+name+" control --plotFile "+name+".deeptools_fingerprints.png"
p = subprocess.Popen(fp_command, shell=True)
p.communicate()
def bam2bw_wrapper(command):
p = subprocess.Popen(command, shell=True)
p.communicate()
def run_signal(files, labels, genome):
# Compute matrix and draw profile and heatmap
gene_bed = genome + "/gene.bed"  # was: "/project/BICF/BICF_Core/bchen4/chipseq_analysis/test/genome/"+genome+"/gene.bed"
bw_commands = []
for f in files:
bw_commands.append("bamCoverage -bs 10 -b "+f+" -o "+f.replace("bam","bw"))
work_pool = Pool(min(len(files), 12))
work_pool.map(bam2bw_wrapper, bw_commands)
work_pool.close()
work_pool.join()
cm_command = "computeMatrix scale-regions -R "+gene_bed+" -a 3000 -b 3000 --regionBodyLength 5000 --skipZeros -S *.bw -o samples.deeptools_generegionscalematrix.gz"
p = subprocess.Popen(cm_command, shell=True)
p.communicate()
hm_command = "plotHeatmap -m samples.deeptools_generegionscalematrix.gz -out samples.deeptools_readsHeatmap.png"
p = subprocess.Popen(hm_command, shell=True)
p.communicate()
def run(dfile,genome):
# Parse design file; data files are assumed to be in the same folder as the design file
dfile = pd.read_csv(dfile)
#QC: multiBamSummary and plotCorrelation
run_qc(dfile['bamReads'], dfile['bamControl'], dfile['SampleID'])
# Signal plots
run_signal(dfile['bamReads'], dfile['SampleID'], genome)
def main():
argparser = prepare_argparser()
args = argparser.parse_args()
run(args.infile, args.genome)
if __name__=="__main__":
main()
......@@ -5,6 +5,7 @@
import subprocess
import argparse
import shutil
import os
import logging
EPILOG = '''
......@@ -32,6 +33,10 @@ def get_args():
nargs='+',
required=True)
parser.add_argument('-s', '--sample',
help="The name of the sample.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
......@@ -61,6 +66,32 @@ def check_tools():
raise Exception('Missing cutadapt')
def rename_reads(fastq, sample, paired):
'''Rename fastq files by sample name.'''
# Get current directory to build paths
cwd = os.getcwd()
renamed_fastq = []
if paired: # paired-end data
# Set file names
renamed_fastq.append(cwd + '/' + sample + '_R1.fastq.gz')
renamed_fastq.append(cwd + '/' + sample + '_R2.fastq.gz')
# Create symbolic links
os.symlink(fastq[0], renamed_fastq[0])
os.symlink(fastq[1], renamed_fastq[1])
else:
# Set file names
renamed_fastq.append(cwd + '/' + sample + '_R1.fastq.gz')
# Create symbolic links
os.symlink(fastq[0], renamed_fastq[0])
return renamed_fastq
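A hypothetical call, showing the sample-named links the function returns:

# rename_reads(['/data/run1_L001_R1.fastq.gz', '/data/run1_L001_R2.fastq.gz'],
#              'ENCLB831RUI', paired=True)
# -> ['<cwd>/ENCLB831RUI_R1.fastq.gz', '<cwd>/ENCLB831RUI_R2.fastq.gz']
# Note: os.symlink raises FileExistsError if a link of that name already exists.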
def trim_reads(fastq, paired):
'''Run trim_galore on 1 or 2 files.'''
......@@ -82,6 +113,7 @@ def trim_reads(fastq, paired):
def main():
args = get_args()
fastq = args.fastq
sample = args.sample
paired = args.paired
# Create a file handler
......@@ -91,8 +123,11 @@ def main():
# Check if tools are present
check_tools()
# Rename fastq files by sample
fastq_rename = rename_reads(fastq, sample, paired)
# Run trim_reads
trim_reads(fastq, paired)
trim_reads(fastq_rename, paired)
if __name__ == '__main__':
......
......@@ -16,7 +16,6 @@ logger.propagate = True
def run_pipe(steps, outfile=None):
# TODO: capture stderr
from subprocess import Popen, PIPE
p = None
p_next = None
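The rest of run_pipe is truncated in this hunk; its call sites imply a chain of Popen processes where each step's stdout feeds the next step's stdin, with an optional final outfile. A minimal sketch of that pattern (not the repo's exact implementation; stderr is not captured, matching the TODO above):

import shlex
from subprocess import Popen, PIPE

def run_pipe_sketch(steps, outfile=None):
    prev_stdout = None
    procs = []
    for i, step in enumerate(steps):
        last = (i == len(steps) - 1)
        # Last step may write to a file; otherwise pipe into the next step.
        stdout = open(outfile, 'w') if (last and outfile) else PIPE
        # NB: shlex.split is a simplification; shell-heavy steps (quoted awk
        # programs) need more careful handling than shown here.
        proc = Popen(shlex.split(step), stdin=prev_stdout, stdout=stdout)
        procs.append(proc)
        prev_stdout = proc.stdout
    return procs[-1].communicate()  # (out, err); err is None, stderr not piped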
......@@ -98,13 +97,13 @@ def rescale_scores(filename, scores_col, new_min=10, new_max=1000):
'head -n 1 %s' % (sorted_fn),
'cut -f %s' % (scores_col)])
max_score = float(out.strip())
logger.info("rescale_scores: max_score = %s" % (max_score))
logger.info("rescale_scores: max_score = %s", max_score)
out, err = run_pipe([
'tail -n 1 %s' % (sorted_fn),
'cut -f %s' % (scores_col)])
min_score = float(out.strip())
logger.info("rescale_scores: min_score = %s" % (min_score))
logger.info("rescale_scores: min_score = %s", min_score)
a = min_score
b = max_score
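The remainder of rescale_scores is cut off by this hunk; with a = min_score and b = max_score, the standard min-max mapping into [new_min, new_max] would be:

# Presumed continuation (linear rescale of each score):
# rescaled = (score - a) * (new_max - new_min) / (b - a) + new_min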
......
......@@ -22,6 +22,13 @@ logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.tagAlign', '.bedse', '.bedpe']
def get_args():
'''Define arguments.'''
......@@ -60,20 +67,20 @@ def check_tools():
def xcor(tag, paired):
'''Use spp to calculate cross-correlation stats.'''
tag_basename = os.path.basename(utils.strip_extensions(tag, ['.gz']))
tag_basename = os.path.basename(utils.strip_extensions(tag, STRIP_EXTENSIONS))
uncompressed_tag_filename = tag_basename
# Subsample tagAlign file
NREADS = 15000000
number_reads = 15000000
subsampled_tag_filename = \
tag_basename + ".%d.tagAlign.gz" % (NREADS/1000000)
tag_basename + ".%d.tagAlign.gz" % (number_reads/1000000)
steps = [
'zcat %s' % (tag),
'grep -v "chrM"',
'shuf -n %d --random-source=%s' % (NREADS, tag)]
'shuf -n %d --random-source=%s' % (number_reads, tag)]
if paired:
steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
......@@ -83,8 +90,8 @@ def xcor(tag, paired):
out, err = utils.run_pipe(steps, outfile=subsampled_tag_filename)
# Calculate Cross-correlation QC scores
cc_scores_filename = subsampled_tag_filename + ".cc.qc"
cc_plot_filename = subsampled_tag_filename + ".cc.plot.pdf"
cc_scores_filename = tag_basename + ".cc.qc"
cc_plot_filename = tag_basename + ".cc.plot.pdf"
# CC_SCORE FILE format
# Filename <tab>
......@@ -109,6 +116,7 @@ def xcor(tag, paired):
return cc_scores_filename
def main():
args = get_args()
paired = args.paired
......@@ -122,7 +130,7 @@ def main():
check_tools()
# Calculate Cross-correlation
xcor_filename = xcor(tag, paired)
xcor(tag, paired)
if __name__ == '__main__':
......
......@@ -11,18 +11,33 @@ test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
@pytest.mark.singleend
def test_annotate_peaks_singleend():
def test_pie_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC.chipseeker_pie.pdf'))
@pytest.mark.singleend
def test_upsetplot_singleend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR238SGC.chipseeker_upsetplot.pdf'))
@pytest.mark.singleend
def test_annotation_singleend():
annotation_file = test_output_path + 'ENCSR238SGC.chipseeker_annotation.tsv'
assert os.path.exists(annotation_file)
assert utils.count_lines(annotation_file) == 152840
assert utils.count_lines(annotation_file) == 152255
@pytest.mark.pairedend
def test_annotate_peaks_pairedend():
def test_pie_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.chipseeker_pie.pdf'))
@pytest.mark.pairedend
def test_upsetplot_pairedend():
assert os.path.exists(os.path.join(test_output_path, 'ENCSR729LGA.chipseeker_upsetplot.pdf'))
@pytest.mark.pairedend
def test_annotation_pairedend():
annotation_file = test_output_path + 'ENCSR729LGA.chipseeker_annotation.tsv'
assert os.path.exists(annotation_file)
assert utils.count_lines(annotation_file) == 25391
assert utils.count_lines(annotation_file) == 25494