# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
@@ -18,6 +18,7 @@ def get_args():
return args
# set the tag names - take a look at SAM spec to pick an appropriate one
args = get_args()
infile = pysam.AlignmentFile(args.sam, "r")
out = pysam.AlignmentFile(args.out, "wb", template=infile)
@@ -25,5 +26,6 @@ for read in infile.fetch():
read.set_tag('RX', read.qname.split(":")[-1])
out.write(read)
infile.close()
out.close()
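The hunk above copies each read's UMI, the final colon-delimited field of the read name, into the SAM-spec RX tag. As a quick sanity check the tags can be read back with pysam; the following is a minimal sketch for illustration only, not part of this changeset, and output.bam is a placeholder path.
#!/usr/bin/env python3
# Sketch only: spot-check the RX (UMI) tags written by add_umi_sam.py.
import pysam

def check_umi_tags(bam_path, n=5):
    '''Print the first n read names with their RX (UMI) tags.'''
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        for i, read in enumerate(bam.fetch(until_eof=True)):
            if i >= n:
                break
            print(read.query_name, read.get_tag("RX"))

if __name__ == '__main__':
    check_umi_tags("output.bam")  # placeholder path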
#!/bin/bash
#indexbams.sh
usage() {
echo "-h --Help documentation for markdups.sh"
echo "Example: bash indexbams.sh"
echo "-r --Reference Genome: GRCh38 or GRCm38"
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :r:b:p:h opt
do
case $opt in
h) usage;;
r) index_path=$OPTARG;;
p) pair_id=$OPTARG;;
b) bam=$OPTARG;;
esac
done
shift $(($OPTIND -1))
# Default to a single thread when not running under SLURM
if [[ -z $SLURM_CPUS_ON_NODE ]]
then
SLURM_CPUS_ON_NODE=1
fi
baseDir="`dirname \"$0\"`"
source /etc/profile.d/modules.sh
module load igvtools/2.3.71 samtools/1.6
samtools index -@ $SLURM_CPUS_ON_NODE $bam
igvtools count -z 5 $bam ${pair_id}.tdf ${index_path}/igv/human.genome
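Where the module system is unavailable, the same indexing call can be driven from Python through pysam's bundled samtools; an illustrative sketch under that assumption, not part of the repository, with sample.bam as a placeholder path.
# Sketch only: BAM indexing via pysam, mirroring `samtools index -@ N bam`.
import pysam

def index_bam(bam_path, threads=1):
    '''Create bam_path.bai with pysam's samtools wrapper.'''
    pysam.index("-@", str(threads), bam_path)

if __name__ == '__main__':
    index_bam("sample.bam", threads=4)  # placeholder path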
@@ -44,17 +44,15 @@ fi
if [[ $nuctype == 'dna' ]]; then
module load bedtools/2.26.0 picard/2.10.3
samtools view -@ $SLURM_CPUS_ON_NODE -b -L ${bed} -o ${pair_id}.ontarget.bam ${sbam}
samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.ontarget.bam
samtools flagstat ${pair_id}.ontarget.bam > ${pair_id}.ontarget.flagstat.txt
java -Xmx64g -jar $PICARD/picard.jar CollectInsertSizeMetrics INPUT=${sbam} HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${index_path}/genome.fa OUTPUT=${pair_id}.hist.txt
samtools view -b -q 1 ${pair_id}.ontarget.bam | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt
java -Xmx64g -jar $PICARD/picard.jar CollectAlignmentSummaryMetrics R=${index_path}/genome.fa I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt
java -Xmx64g -jar $PICARD/picard.jar EstimateLibraryComplexity I=${sbam} OUTPUT=${pair_id}.libcomplex.txt
samtools view -@ $SLURM_CPUS_ON_NODE -b -q 1 ${sbam} | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt
bedtools coverage -sorted -g ${index_path}/genomefile.txt -a ${bed} -b ${sbam} -hist > ${pair_id}.covhist.txt
perl $baseDir/calculate_depthcov.pl ${pair_id}.covhist.txt
grep ^all ${pair_id}.covhist.txt > ${pair_id}.genomecov.txt
fi
samtools view -@ $SLURM_CPUS_ON_NODE ${sbam} | awk '{sum+=$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt
#!/usr/bin/env python3
'''Convert tagAlign files for further processing.'''
import os
import argparse
import shutil
import subprocess
import shlex
import logging
from multiprocessing import cpu_count
from python_utils import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-b', '--bam',
help="The bam file to convert.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required components on user system'''
logger.info('Checking for required libraries and components on this system')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
def convert_mapped(bam, tag_filename):
'''Use bedtools to convert to tagAlign.'''
out, err = utils.run_pipe([
"bamToBed -i %s" % (bam),
r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
"gzip -nc"], outfile=tag_filename)
def convert_mapped_pe(bam, bam_basename):
'''Use bedtools to convert to tagAlign PE data.'''
bedpe_filename = bam_basename + ".bedpe.gz"
# Name sort bam to make BEDPE
nmsrt_bam_filename = bam_basename + ".nmsrt.bam"
samtools_sort_command = \
"samtools sort -n -@%d -o %s %s" \
% (cpu_count(), nmsrt_bam_filename, bam)
logger.info(samtools_sort_command)
subprocess.check_output(shlex.split(samtools_sort_command))
out, err = utils.run_pipe([
"bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename),
"gzip -nc"], outfile=bedpe_filename)
def main():
args = get_args()
paired = args.paired
bam = args.bam
# Create a file handler
handler = logging.FileHandler('convert_reads.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Convert PE or SE to tagAlign
bam_basename = os.path.basename(
utils.strip_extensions(bam, ['.bam']))
tag_filename = bam_basename + '.tagAlign.gz'
convert_mapped(bam, tag_filename)
if paired: # paired-end data
convert_mapped_pe(bam, bam_basename)
else:
bedse_filename = bam_basename + ".bedse.gz"
shutil.copy(tag_filename, bedse_filename)
if __name__ == '__main__':
main()
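Several of the Python scripts here shell out through python_utils.utils.run_pipe, which is not included in this changeset. From its call sites, it chains the given commands with pipes, optionally redirects the final stage into outfile, and returns the last process's (stdout, stderr). A minimal sketch of that assumed contract follows; the real helper may differ.
# Sketch only: assumed behavior of python_utils.utils.run_pipe.
import shlex
import subprocess

def run_pipe(steps, outfile=None):
    '''Chain commands with pipes; return (stdout, stderr) of the last stage.'''
    procs = []
    prev_stdout = None
    for i, step in enumerate(steps):
        if outfile and i == len(steps) - 1:
            stdout = open(outfile, 'wb')  # redirect the final stage to a file
        else:
            stdout = subprocess.PIPE
        proc = subprocess.Popen(shlex.split(step), stdin=prev_stdout,
                                stdout=stdout, stderr=subprocess.PIPE)
        if prev_stdout is not None:
            prev_stdout.close()  # let upstream stages receive SIGPIPE
        prev_stdout = proc.stdout
        procs.append(proc)
    out, err = procs[-1].communicate()
    return out, err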
@@ -41,23 +41,28 @@ module load bwakit/0.7.15 bwa/intel/0.7.15 samtools/1.6 picard/2.10.3
baseDir="`dirname \"$0\"`"
diff $fq1 $fq2 > difffile
if [ -s difffile ]
then
bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} ${fq2} > out.sam
else
bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} > out.sam
fi
if [[ $umi == 'umi' ]] && [[ $index_path == '/project/shared/bicf_workflow_ref/GRCh38' ]]
then
k8 /cm/shared/apps/bwakit/0.7.15/bwa-postalt.js -p tmphla ${index_path}/genome.fa.alt out.sam | python ${baseDir}/add_umi_sam.py -s - -o output.unsort.bam
elif [[ $index_path == '/project/shared/bicf_workflow_ref/GRCh38' ]]
then
k8 /cm/shared/apps/bwakit/0.7.15/bwa-postalt.js -p tmphla ${index_path}/genome.fa.alt out.sam| samtools view -1 - > output.unsort.bam
elif [[ $umi == 'umi' ]]
then
python ${baseDir}/add_umi_sam.py -s out.sam -o output.unsort.bam
else
samtools view -1 -o output.unsort.bam out.sam
fi
which samtools
samtools sort -n --threads $SLURM_CPUS_ON_NODE -o output.dups.bam output.unsort.bam
java -Djava.io.tmpdir=./ -Xmx4g -jar $PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.dups.bam O=${pair_id}.bam
samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam
@@ -5,7 +5,6 @@ use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
my %opt = ();
my $results = GetOptions (\%opt,'fusion|f=s','prefix|p=s','help|h');
my %entrez;
open ENT, "</project/shared/bicf_workflow_ref/gene_info.human.txt" or die $!;
my $headline = <ENT>;
@@ -36,7 +35,7 @@ print OUT join("\t","FusionName","LeftGene","RightGene","LeftBreakpoint",
print OUTIR join("\t","Hugo_Symbol","Entrez_Gene_Id","Center","Tumor_Sample_Barcode",
"Fusion","DNA_support","RNA_support","Method","Frame"),"\n";
my $sname = $opt{prefix};
open FUSION, "<$opt{fusion}" or die $!;
my $header = <FUSION>;
@@ -62,15 +61,16 @@ while (my $line = <FUSION>) {
$hash{SumRNAReads} += $hash{JunctionReadCount}+$hash{SpanningFragCount};
my $fname = join("--",$hash{LeftGene},$hash{RightGene});
my $fname2 = join("--",sort {$a cmp $b} $hash{LeftGene},$hash{RightGene});
my $ename = join("--",$entrez{$hash{LeftGene}},$entrez{$hash{RightGene}});
my ($dna_support,$rna_support)=("no","no");
my ($dna_support,$rna_support)=("no") x 2;
if ($known{$fname2} && ($hash{SumRNAReads} >= 3)|| ($hash{SumRNAReads} >= 5)) {
$rna_support = "yes";
print OUT join("\t",$fname,$hash{LeftGene},$hash{RightGene},
$hash{LeftBreakpoint},$hash{RightBreakpoint},$hash{LeftStrand},
$hash{RightStrand},$hash{SumRNAReads},0),"\n";
print OUTIR join("\t",$fname,$ename,"UTSW NGS Clinical Sequencing Lab",$sname,$fname." fusion",
0,$rna_support,"STAR 2.5.2b","N/A"),"\n";
print OUTIR join("\t",$hash{LeftGene},$entrez{$hash{LeftGene}},"UTSW",$sname,$fname." fusion",
$dna_support,$rna_support,"STAR Fusion","N/A"),"\n";
print OUTIR join("\t",$hash{RightGene},$entrez{$hash{RightGene}},"UTSW",$sname,$fname." fusion",
$dna_support,$rna_support,"STAR Fusion","N/A"),"\n";
}
}
#!/bin/bash
#hlatyping.sh
usage() {
echo "-h Help documentation for dnaseqalign.sh"
echo "-r --Reference Genome: GRCh38 or GRCm38"
echo "-x --FastQ R1"
echo "-y --FastQ R2"
echo "-p --Prefix for output file name"
echo "Example: bash hisat_genotype.sh -p prefix"
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :r:x:y:p:uh opt
do
case $opt in
r) index_path=$OPTARG;;
x) fq1=$OPTARG;;
y) fq2=$OPTARG;;
u) umi='umi';;
p) pair_id=$OPTARG;;
h) usage;;
esac
done
shift $(($OPTIND -1))
# Check for mandatory options
if [[ -z $pair_id ]] || [[ -z $index_path ]]; then
usage
fi
if [[ -z $SLURM_CPUS_ON_NODE ]]
then
SLURM_CPUS_ON_NODE=1
fi
source /etc/profile.d/modules.sh
module load hisat-genotype/1.0.1
diff $fq1 $fq2 > difffile
if [ -s difffile ]
then
hisatgenotype_extract_reads.py -p 16 --database-list hla --base /project/shared/bicf_workflow_ref/hisat_genotype_hla/genotype_genome -1 $fq1 -2 $fq2 --out-dir hisatgeno_out
else
hisatgenotype_extract_reads.py -p 16 --database-list hla --base /project/shared/bicf_workflow_ref/hisat_genotype_hla/genotype_genome -U $fq1 --out-dir hisatgeno_out
fi
hisatgenotype_locus_samples.py -p 16 --region-list hla.A,hla.B,hla.C,hla.DQA1,hla.DQB1,hla.DRB1,hla.DPB1 --assembly --read-dir hisatgeno_out --out-dir ${pair_id}.hisat_hla > ${pair_id}.hisat_hla.txt
tar cf ${pair_id}.hisat_hla.tar ${pair_id}.hisat_hla
gzip ${pair_id}.hisat_hla.tar
#!/usr/bin/env python3
'''Remove duplicates and filter unmapped reads.'''
import os
import subprocess
import argparse
import shutil
import shlex
import logging
from multiprocessing import cpu_count
import pandas as pd
from python_utils import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.bam', '.srt']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-b', '--bam',
help="The bam file to run filtering and qc on.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required components on user system'''
logger.info('Checking for required libraries and components on this system')
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
sambamba_path = shutil.which("sambamba")
if sambamba_path:
logger.info('Found sambamba: %s', sambamba_path)
else:
logger.error('Missing sambamba')
raise Exception('Missing sambamba')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def filter_mapped_pe(bam, bam_basename):
'''Use samtools to filter unmapped reads for PE data.'''
filt_bam_prefix = bam_basename + ".filt.srt"
filt_bam_filename = filt_bam_prefix + ".bam"
tmp_filt_bam_prefix = "tmp.%s" % (filt_bam_prefix)
tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"
# Remove unmapped, mate unmapped
# not primary alignment, reads failing platform
# Remove low MAPQ reads
# Only keep properly paired reads
# Obtain name sorted BAM file
out, err = utils.run_pipe([
# filter: -F 1804 FLAG bits to exclude; -f 2 FLAG bits to require;
# -q 30 exclude MAPQ < 30; -u uncompressed output
# exclude FLAG 1804: unmapped, next segment unmapped, secondary
# alignments, not passing platform q, PCR or optical duplicates
# require FLAG 2: properly aligned
"samtools view -F 1804 -f 2 -q 30 -u %s" % (bam),
# sort: -n sort by name; - take input from stdin;
# out to specified filename
# Will produce name sorted BAM
"samtools sort -n -@ %d -o %s" % (cpu_count(), tmp_filt_bam_filename)])
if err:
logger.error("samtools filter error: %s" % (err))
# Remove orphan reads (pair was removed)
# and read pairs mapping to different chromosomes
# Obtain position sorted BAM
out, err = utils.run_pipe([
# fill in mate coordinates, ISIZE and mate-related flags
# fixmate requires name-sorted alignment; -r removes secondary and
# unmapped (redundant here because already done above?)
# - send output to stdout
"samtools fixmate -r %s -" % (tmp_filt_bam_filename),
# repeat filtering after mate repair
"samtools view -F 1804 -f 2 -u -",
# produce the coordinate-sorted BAM
"samtools sort -@ %d -o %s" % (cpu_count(), filt_bam_filename)])
os.remove(tmp_filt_bam_filename)
return filt_bam_filename
def filter_mapped_se(bam, bam_basename):
'''Use samtools to filter unmapped reads for SE data.'''
filt_bam_prefix = bam_basename + ".filt.srt"
filt_bam_filename = filt_bam_prefix + ".bam"
# Remove unmapped, mate unmapped
# not primary alignment, reads failing platform
# Remove low MAPQ reads
# Obtain name sorted BAM file
with open(filt_bam_filename, 'w') as out_bam:
samtools_filter_command = (
"samtools view -F 1804 -q 30 -b %s"
% (bam)
)
logger.info(samtools_filter_command)
subprocess.check_call(
shlex.split(samtools_filter_command),
stdout=out_bam)
return filt_bam_filename
def dedup_mapped(bam, bam_basename, paired):
'''Use sambamba and samtools to remove duplicates.'''
# Markduplicates
dup_file_qc_filename = bam_basename + ".dup.qc"
tmp_dup_mark_filename = bam_basename + ".dupmark.bam"
sambamba_params = "--hash-table-size=17592186044416" + \
" --overflow-list-size=20000000 --io-buffer-size=256"
with open(dup_file_qc_filename, 'w') as dup_qc:
sambamba_markdup_command = (
"sambamba markdup -t %d %s --tmpdir=%s %s %s"
% (cpu_count(), sambamba_params, os.getcwd(), bam, tmp_dup_mark_filename)
)
logger.info(sambamba_markdup_command)
subprocess.check_call(
shlex.split(sambamba_markdup_command),
stderr=dup_qc)
# Remove duplicates
final_bam_prefix = bam_basename + ".filt.nodup"
final_bam_filename = final_bam_prefix + ".bam"
if paired: # paired-end data
samtools_dedupe_command = \
"samtools view -F 1804 -f 2 -b %s" % (tmp_dup_mark_filename)
else:
samtools_dedupe_command = \
"samtools view -F 1804 -b %s" % (tmp_dup_mark_filename)
with open(final_bam_filename, 'w') as out_bam:
logger.info(samtools_dedupe_command)
subprocess.check_call(
shlex.split(samtools_dedupe_command),
stdout=out_bam)
# Index final bam file
samtools_index_command = \
"samtools index -@ %d %s" % (cpu_count(), final_bam_filename)
logger.info(samtools_index_command)
subprocess.check_output(shlex.split(samtools_index_command))
# Generate mapping statistics
final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"
with open(final_bam_file_mapstats_filename, 'w') as out_stats:
flagstat_command = "sambamba flagstat -t %d %s" \
% (cpu_count(), final_bam_filename)
logger.info(flagstat_command)
subprocess.check_call(shlex.split(flagstat_command), stdout=out_stats)
os.remove(bam)
return tmp_dup_mark_filename
def compute_complexity(bam, paired, bam_basename):
'''Calculate library complexity.'''
pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc"
tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename)
# Sort by name
# convert to bedPE and obtain fragment coordinates
# sort by position and strand
# Obtain unique count statistics
# PBC File output
# TotalReadPairs [tab]
# DistinctReadPairs [tab]
# OneReadPair [tab]
# TwoReadPairs [tab]
# NRF=Distinct/Total [tab]
# PBC1=OnePair/Distinct [tab]
# PBC2=OnePair/TwoPair
pbc_headers = [
'TotalReadPairs',
'DistinctReadPairs',
'OneReadPair',
'TwoReadPairs',
'NRF',
'PBC1',
'PBC2']
if paired:
steps = [
"samtools sort -@%d -n %s" % (cpu_count(), bam),
"bamToBed -bedpe -i stdin",
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
else:
steps = [
"bamToBed -i %s" % (bam),
r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
steps.extend([
"grep -v 'chrM'",
"sort",
"uniq -c",
r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'"""
])
out, err = utils.run_pipe(steps, tmp_pbc_file_qc_filename)
if err:
logger.error("PBC file error: %s", err)
# Add headers
pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None,
names=pbc_headers)
pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False)
os.remove(bam)
os.remove(bam + '.bai')
os.remove(tmp_pbc_file_qc_filename)
def main():
args = get_args()
paired = args.paired
bam = args.bam
# Create a file handler
handler = logging.FileHandler('map_qc.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Run filtering for either PE or SE
bam_basename = os.path.basename(
utils.strip_extensions(bam, STRIP_EXTENSIONS))
if paired: # paired-end data
filter_bam_filename = filter_mapped_pe(bam, bam_basename)
else:
filter_bam_filename = filter_mapped_se(bam, bam_basename)
# Remove duplicates
markdup_bam_filename = dedup_mapped(filter_bam_filename, bam_basename, paired)
# Compute library complexity
compute_complexity(markdup_bam_filename, paired, bam_basename)
if __name__ == '__main__':
main()
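The final awk stage in compute_complexity reduces the per-location read counts to the ENCODE library-complexity metrics listed in pbc_headers. The same arithmetic in plain Python, purely to document what the one-liner computes:
# Sketch only: the PBC/NRF arithmetic performed by the awk stage above.
def pbc_metrics(location_counts):
    '''location_counts: reads observed at each distinct genomic location.'''
    mt = sum(location_counts)                       # TotalReadPairs
    m0 = len(location_counts)                       # DistinctReadPairs
    m1 = sum(1 for c in location_counts if c == 1)  # OneReadPair
    m2 = sum(1 for c in location_counts if c == 2)  # TwoReadPairs
    return {'TotalReadPairs': mt, 'DistinctReadPairs': m0,
            'OneReadPair': m1, 'TwoReadPairs': m2,
            'NRF': m0 / mt,    # non-redundant fraction
            'PBC1': m1 / m0,
            'PBC2': m1 / m2}   # like the awk, assumes m2 > 0

# e.g. pbc_metrics([1, 1, 2, 3]) -> NRF = 4/7, PBC1 = 0.5, PBC2 = 2.0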
#!/usr/bin/env python3
'''Align reads to reference genome.'''
import os
import subprocess
import argparse
import shutil
import shlex
import logging
from multiprocessing import cpu_count
from python_utils import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
# the order of this list is important.
# strip_extensions strips from the right inward, so
# the expected right-most extensions should appear first (like .gz)
# Modified from J. Seth Strattan
STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '_trimmed']
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-f', '--fastq',
help="The fastq file to run triming on.",
nargs='+',
required=True)
parser.add_argument('-r', '--reference',
help="The bwa index of the reference genome.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required components on user system'''
logger.info('Checking for required libraries and components on this system')
bwa_path = shutil.which("bwa")
if bwa_path:
logger.info('Found bwa: %s', bwa_path)
else:
logger.error('Missing bwa')
raise Exception('Missing bwa')
samtools_path = shutil.which("samtools")
if samtools_path:
logger.info('Found samtools: %s', samtools_path)
else:
logger.error('Missing samtools')
raise Exception('Missing samtools')
def generate_sa(fastq, reference):
'''Use BWA to generate Suffix Arrays.'''
fastq_basename = os.path.basename(utils.strip_extensions(fastq, STRIP_EXTENSIONS))
bwa_aln_params = '-q 5 -l 32 -k 2'
sai = '%s.sai' % (fastq_basename)
with open(sai, 'w') as sai_file:
bwa_command = "bwa aln %s -t %d %s %s" \
% (bwa_aln_params, cpu_count(),
reference, fastq)
logger.info("Running bwa with %s", bwa_command)
subprocess.check_call(shlex.split(bwa_command), stdout=sai_file)
return sai
def align_se(fastq, sai, reference, fastq_basename):
'''Use BWA to align SE data.'''
bam_filename = '%s.srt.bam' % (fastq_basename)
steps = [
"bwa samse %s %s %s"
% (reference, sai[0], fastq[0]),
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
% (cpu_count(), bam_filename)]
out, err = utils.run_pipe(steps)
if err:
logger.error("samse/samtools error: %s", err)
return bam_filename
def align_pe(fastq, sai, reference, fastq_basename):
'''Use BWA to align PE data.'''
sam_filename = "%s.sam" % (fastq_basename)
badcigar_filename = "%s.badReads" % (fastq_basename)
bam_filename = '%s.srt.bam' % (fastq_basename)
# Remove read pairs with bad CIGAR strings and sort by position
steps = [
"bwa sampe -P %s %s %s %s %s"
% (reference, sai[0], sai[1],
fastq[0], fastq[1]),
"tee %s" % (sam_filename),
r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
"sort",
"uniq"]
out, err = utils.run_pipe(steps, badcigar_filename)
if err:
logger.error("sampe error: %s", err)
steps = [
"cat %s" % (sam_filename),
"grep -v -F -f %s" % (badcigar_filename),
"samtools view -@%d -Su -" % (cpu_count()),
"samtools sort -@%d -o %s"
% (cpu_count(), bam_filename)]
out, err = utils.run_pipe(steps)
if err:
logger.error("samtools error: %s", err)
return bam_filename
def main():
args = get_args()
paired = args.paired
fastq = args.fastq
reference = args.reference
# Create a file handler
handler = logging.FileHandler('map.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Run Suffix Array generation
sai = []
for fq in fastq:
sai_filename = generate_sa(fq, reference)
sai.append(sai_filename)
# Run alignment for either PE or SE
if paired: # paired-end data
fastq_r1_basename = os.path.basename(
utils.strip_extensions(fastq[0], STRIP_EXTENSIONS))
fastq_r2_basename = os.path.basename(
utils.strip_extensions(fastq[1], STRIP_EXTENSIONS))
fastq_basename = fastq_r1_basename + fastq_r2_basename
bam_filename = align_pe(fastq, sai, reference, fastq_basename)
else:
fastq_basename = os.path.basename(
utils.strip_extensions(fastq[0], STRIP_EXTENSIONS))
bam_filename = align_se(fastq, sai, reference, fastq_basename)
bam_mapstats_filename = '%s.srt.bam.flagstat.qc' % (fastq_basename)
with open(bam_mapstats_filename, 'w') as out_file:
subprocess.check_call(
shlex.split("samtools flagstat %s" % (bam_filename)),
stdout=out_file)
# Remove sai files
for sai_file in sai:
os.remove(sai_file)
if __name__ == '__main__':
main()
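In align_pe, the awk stage flags reads whose CIGAR operation lengths, with deletions stripped, do not sum to the sequence length; the offending names land in the badReads file and are filtered out with grep -v -F -f. A Python rendering of the same test, for illustration only:
# Sketch only: the bad-CIGAR test applied by the awk stage in align_pe.
import re

def has_bad_cigar(cigar, seq):
    '''True when CIGAR op lengths (deletions excluded) != read length.'''
    if cigar == '*':
        return False  # the awk skips records without a CIGAR
    trimmed = re.sub(r'[0-9]+D', '', cigar)  # drop deletions, like gsub in awk
    span = sum(int(n) for n in re.findall(r'([0-9]+)[A-Z]', trimmed))
    return span != len(seq)

# e.g. has_bad_cigar('50M', 'A' * 50) -> False
#      has_bad_cigar('50M', 'A' * 49) -> True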
@@ -41,7 +41,7 @@ then
SLURM_CPUS_ON_NODE=1
fi
diff $fq1 $fq2 > difffile
if [ $algo == 'star' ]
then
@@ -59,9 +59,9 @@ else
module load hisat2/2.1.0-intel
if [ -s difffile ]
then
hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --no-unal --dta -x ${index_path}/hisat_index/genome -1 $fq1 -2 $fq2 -S out.sam --summary-file ${pair_id}.alignerout.txt
else
hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --no-unal --dta -x ${index_path}/hisat_index/genome -U $fq1 -S out.sam --summary-file ${pair_id}.alignerout.txt
fi
if [[ $umi == 1 ]]
then
@@ -33,11 +33,16 @@ fi
index_path=${refgeno}/CTAT_lib/
baseDir="`dirname \"$0\"`"
source /etc/profile.d/modules.sh
module add python/2.7.x-anaconda star/2.5.2b bedtools/2.26.0
STAR-Fusion --genome_lib_dir ${index_path} --left_fq ${fq1} --right_fq ${fq2} --output_dir star_fusion &> star_fusion.err
mv star_fusion/star-fusion.fusion_candidates.final.abridged ${pair_id}.starfusion.txt
if [[ $filter == 1 ]]
then
cut -f 6,8 ${pair_id}.starfusion.txt |grep -v Breakpoint |perl -pe 's/\t/\n/g' |awk -F ':' '{print $1"\t"$2-1"\t"$2}' > temp.bed
bedtools intersect -wao -a temp.bed -b /project/shared/bicf_workflow_ref/GRCh38/cytoBand.txt |cut -f 1,2,7 > cytoband_pos.txt
#cut -f 6,8 ${pair_id}.starfusion.txt |grep -v Breakpoint|perl -pe 's/:/\t/g' |awk '{print $1"\t"$2"\t"$4"\t"$5"\tAVG"}' > coords.txt
#java -Xmx1G -jar /project/shared/bicf_workflow_ref/seqprg/oncofuse-1.1.1/Oncofuse.jar -a hg38 coords.txt coord AVG oncofuse.out
perl $baseDir/filter_genefusions.pl -p ${pair_id} -f ${pair_id}.starfusion.txt
fi
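The cut/awk pipeline above turns the two breakpoint columns (chrom:pos:strand strings) into zero-based, single-base BED intervals before intersecting them with the cytoband track. The same transformation in Python, shown only to document the format:
# Sketch only: 'chrom:pos[:strand]' breakpoint -> 0-based single-base interval,
# as produced by the awk -F ':' step above.
def breakpoint_to_bed(breakpoint):
    '''Map e.g. "chr7:55249071:+" to ("chr7", 55249070, 55249071).'''
    fields = breakpoint.split(':')
    chrom, pos = fields[0], int(fields[1])
    return (chrom, pos - 1, pos)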
#!/usr/bin/env python3
'''Generate peaks from data.'''
import os
import argparse
import shutil
import logging
from multiprocessing import cpu_count
from python_utils import utils
from quality_metrics.xcor import xcor as calculate_xcor
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-t', '--tag',
help="The tagAlign file to perform peak calling on.",
required=True)
parser.add_argument('-x', '--xcor',
help="The cross-correlation file (if already calculated).",
required=True)
parser.add_argument('-c', '--con',
help="The control tagAling file used for peak calling.",
required=True)
parser.add_argument('-s', '--sample',
help="The sample id to name the peak files.",
required=True)
parser.add_argument('-g', '--genome',
help="The genome size of reference genome.",
required=True)
parser.add_argument('-z', '--size',
help="The file with chromosome sizes of reference genome.",
required=True)
parser.add_argument('-p', '--paired',
help="True/False if paired-end or single end.",
default=False,
action='store_true')
args = parser.parse_args()
return args
# Functions
def check_tools():
'''Checks for required components on user system'''
logger.info('Checking for required libraries and components on this system')
r_path = shutil.which("R")
if r_path:
logger.info('Found R: %s', r_path)
else:
logger.error('Missing R')
raise Exception('Missing R')
macs_path = shutil.which("macs2")
if macs_path:
logger.info('Found MACS2: %s', macs_path)
else:
logger.error('Missing MACS2')
raise Exception('Missing MACS2')
bg_bw_path = shutil.which("bedGraphToBigWig")
if bg_bw_path:
logger.info('Found bedGraphToBigWig: %s', bg_bw_path)
else:
logger.error('Missing bedGraphToBigWig')
raise Exception('Missing bedGraphToBigWig')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def call_peaks_macs(experiment, xcor, control, prefix, genome_size, chrom_sizes):
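'''Call peaks and generate signal tracks using MACS2.'''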
# Extract the fragment length estimate from column 3 of the
# cross-correlation scores file
with open(xcor, 'r') as xcor_fh:
firstline = xcor_fh.readline()
frag_lengths = firstline.split()[2] # third column
fragment_length = frag_lengths.split(',')[0] # grab first value
logger.info("Fraglen %s" % (fragment_length))
# Generate narrow peaks and preliminary signal tracks
command = 'macs2 callpeak ' + \
'-t %s -c %s ' % (experiment, control) + \
'-f BED -n %s ' % (prefix) + \
'-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' % (genome_size, fragment_length)
logger.info(command)
returncode = utils.block_on(command)
logger.info("MACS2 exited with returncode %d" % (returncode))
assert returncode == 0, "MACS2 non-zero return"
# MACS2 sometimes calls features off the end of chromosomes.
# Remove coordinates outside chromosome sizes
narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
clipped_narrowpeak_fn = 'clipped-%s' % (narrowpeak_fn)
steps = ['slopBed -i %s -g %s -b 0' % (narrowpeak_fn, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]
out, err = utils.run_pipe(steps)
# Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
# (score must be <1000)
rescaled_narrowpeak_fn = utils.rescale_scores(
clipped_narrowpeak_fn, scores_col=5)
# Sort by Col8 in descending order and replace long peak names in Column 4
# with Peak_<peakRank>
steps = [
'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'"""
]
out, err = utils.run_pipe(steps, '%s' % (narrowpeak_fn))
# For Fold enrichment signal tracks
# This file is a tab delimited file with 2 columns Col1 (chromosome name),
# Col2 (chromosome size in bp).
command = 'macs2 bdgcmp ' + \
'-t %s_treat_pileup.bdg ' % (prefix) + \
'-c %s_control_lambda.bdg ' % (prefix) + \
'-o %s_FE.bdg ' % (prefix) + \
'-m FE'
logger.info(command)
returncode = utils.block_on(command)
logger.info("MACS2 exited with returncode %d" % (returncode))
assert returncode == 0, "MACS2 non-zero return"
# Remove coordinates outside chromosome sizes (MACS2 bug)
fc_bedgraph_fn = '%s.fc.signal.bedgraph' % (prefix)
fc_bedgraph_sorted_fn = 'sorted-%s' % (fc_bedgraph_fn)
fc_signal_fn = "%s.fc_signal.bw" % (prefix)
steps = ['slopBed -i %s_FE.bdg -g %s -b 0' % (prefix, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)]
out, err = utils.run_pipe(steps)
# Sort file
out, err = utils.run_pipe([
'bedSort %s %s' % (fc_bedgraph_fn, fc_bedgraph_sorted_fn)])
# Convert bedgraph to bigwig
command = 'bedGraphToBigWig ' + \
'%s ' % (fc_bedgraph_sorted_fn) + \
'%s ' % (chrom_sizes) + \
'%s' % (fc_signal_fn)
logger.info(command)
returncode = utils.block_on(command)
logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
assert returncode == 0, "bedGraphToBigWig non-zero return"
# For -log10(p-value) signal tracks
# Compute sval =
# min(no. of reads in ChIP, no. of reads in control) / 1,000,000
out, err = utils.run_pipe(['gzip -dc %s' % (experiment), 'wc -l'])
chip_reads = out.strip()
out, err = utils.run_pipe(['gzip -dc %s' % (control), 'wc -l'])
control_reads = out.strip()
sval = str(min(float(chip_reads), float(control_reads)) / 1000000)
logger.info("chip_reads = %s, control_reads = %s, sval = %s" %
(chip_reads, control_reads, sval))
command = 'macs2 bdgcmp ' + \
'-t %s_treat_pileup.bdg ' % (prefix) + \
'-c %s_control_lambda.bdg ' % (prefix) + \
'-o %s_ppois.bdg ' % (prefix) + \
'-m ppois -S %s' % (sval)
logger.info(command)
returncode = utils.block_on(command)
assert returncode == 0, "MACS2 non-zero return"
# Remove coordinates outside chromosome sizes (MACS2 bug)
pvalue_bedgraph_fn = '%s.pval.signal.bedgraph' % (prefix)
pvalue_bedgraph_sorted_fn = 'sort-%s' % (pvalue_bedgraph_fn)
pvalue_signal_fn = "%s.pvalue_signal.bw" % (prefix)
steps = ['slopBed -i %s_ppois.bdg -g %s -b 0' % (prefix, chrom_sizes),
'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)]
out, err = utils.run_pipe(steps)
# Sort file
out, err = utils.run_pipe([
'bedSort %s %s' % (pvalue_bedgraph_fn, pvalue_bedgraph_sorted_fn)])
# Convert bedgraph to bigwig
command = 'bedGraphToBigWig ' + \
'%s ' % (pvalue_bedgraph_sorted_fn) + \
'%s ' % (chrom_sizes) + \
'%s' % (pvalue_signal_fn)
logger.info(command)
returncode = utils.block_on(command)
logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
assert returncode == 0, "bedGraphToBigWig non-zero return"
# Remove temporary files
os.remove(clipped_narrowpeak_fn)
os.remove(rescaled_narrowpeak_fn)
def main():
args = get_args()
tag = args.tag
xcor = args.xcor
con = args.con
sample = args.sample
genome_size = args.genome
chrom_size = args.size
paired = args.paired
# Create a file handler
handler = logging.FileHandler('call_peaks.log')
logger.addHandler(handler)
# Check if tools are present
check_tools()
# Calculate Cross-correlation if not already calculated
if xcor == 'Calculate':
xcor_file = calculate_xcor(tag, paired)
else:
xcor_file = xcor
# Call Peaks using MACS2
call_peaks_macs(tag, xcor_file, con, sample, genome_size, chrom_size)
if __name__ == '__main__':
main()
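utils.rescale_scores is referenced above but not included in this changeset; per the comment in call_peaks_macs, it rescales column 5 into the 10-1000 range that the narrowPeak.as format requires. A sketch under that assumption only; the actual helper may differ in rounding or output naming:
# Sketch only: assumed behavior of utils.rescale_scores - linearly map the
# 1-based column scores_col into [10, 1000] and write a new file.
def rescale_scores(filename, scores_col=5, new_min=10, new_max=1000):
    '''Return the name of a copy of filename with rescaled scores.'''
    with open(filename) as handle:
        rows = [line.rstrip('\n').split('\t') for line in handle]
    scores = [float(row[scores_col - 1]) for row in rows]
    lo, hi = min(scores), max(scores)
    span = (hi - lo) or 1.0  # guard against constant scores
    out_fn = 'rescaled-%s' % (filename)
    with open(out_fn, 'w') as out:
        for row, score in zip(rows, scores):
            scaled = new_min + (score - lo) * (new_max - new_min) / span
            row[scores_col - 1] = str(int(round(scaled)))
            out.write('\t'.join(row) + '\n')
    return out_fn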
#!/usr/bin/env python3
'''Generate naive overlap peak files and design file for downstream processing.'''
import os
import argparse
import logging
import shutil
import pandas as pd
import sys
from python_utils import utils
EPILOG = '''
For more details:
%(prog)s --help
'''
# SETTINGS
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.INFO)
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__, epilog=EPILOG,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-d', '--design',
help="The design file of peaks (tsv format).",
required=True)
parser.add_argument('-f', '--files',
help="The design file of with bam files (tsv format).",
required=True)
args = parser.parse_args()
return args
def check_tools():
'''Checks for required components on user system.'''
logger.info('Checking for required libraries and components on this system')
bedtools_path = shutil.which("bedtools")
if bedtools_path:
logger.info('Found bedtools: %s', bedtools_path)
else:
logger.error('Missing bedtools')
raise Exception('Missing bedtools')
def update_design(design):
'''Update design file for diffBind and remove controls.'''
logger.info("Running control file update.")
file_dict = design[['sample_id', 'bam_reads']] \
.set_index('sample_id').T.to_dict()
design['control_bam_reads'] = design['control_id'] \
.apply(lambda x: file_dict[x]['bam_reads'])
logger.info("Removing rows that are there own control.")
design = design[design['control_id'] != design['sample_id']]
logger.info("Removing columns that are there own control.")
design = design.drop('bam_index', axis=1)
logger.info("Adding peaks column.")
design = design.assign(peak='', peak_caller='bed')
return design
def overlap(experiment, design):
'''Calculate the overlap of peaks'''
logger.info("Determining consenus peaks for experiment %s.", experiment)
# Output File names
peak_type = 'narrowPeak'
overlapping_peaks_fn = '%s.replicated.%s' % (experiment, peak_type)
rejected_peaks_fn = '%s.rejected.%s' % (experiment, peak_type)
# Intermediate File names
overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
overlap_pr_fn = 'replicated_pr.%s' % (peak_type)
# Assign Pooled and Pseudoreplicate peaks
pool_peaks = design.loc[design.replicate == 'pooled', 'peaks'].values[0]
pr1_peaks = design.loc[design.replicate == '1_pr', 'peaks'].values[0]
pr2_peaks = design.loc[design.replicate == '2_pr', 'peaks'].values[0]
# Remove non true replicate rows
not_replicates = ['1_pr', '2_pr', 'pooled']
design_true_reps = design[~design['replicate'].isin(not_replicates)]
true_rep_peaks = design_true_reps.peaks.unique()
# Find overlaps
awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
cut_command = 'cut -f 1-10'
# Find pooled peaks that overlap Rep1 and Rep2
# where overlap is defined as the fractional overlap
# with any one of the overlapping peak pairs >= 0.5
steps_true = ['intersectBed -wo -a %s -b %s' % (pool_peaks, true_rep_peaks[0]),
awk_command,
cut_command,
'sort -u']
if len(true_rep_peaks) > 1:
for true_peak in true_rep_peaks[1:]:
steps_true.extend(['intersectBed -wo -a stdin -b %s' % (true_peak),
awk_command,
cut_command,
'sort -u'])
out, err = utils.run_pipe(steps_true, outfile=overlap_tr_fn)
print("%d peaks overlap with both true replicates" %
(utils.count_lines(overlap_tr_fn)))
# Find pooled peaks that overlap PseudoRep1 and PseudoRep2
# where overlap is defined as the fractional overlap
# with any one of the overlapping peak pairs >= 0.5
steps_pseudo = ['intersectBed -wo -a %s -b %s' % (pool_peaks, pr1_peaks),
awk_command,
cut_command,
'sort -u',
'intersectBed -wo -a stdin -b %s' % (pr2_peaks),
awk_command,
cut_command,
'sort -u']
out, err = utils.run_pipe(steps_pseudo, outfile=overlap_pr_fn)
print("%d peaks overlap with both pooled pseudoreplicates"
% (utils.count_lines(overlap_pr_fn)))
# Make union of peak lists
out, err = utils.run_pipe([
'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
'sort -u'
], overlapping_peaks_fn)
print("%d peaks overlap with true replicates or with pooled pseudorepliates"
% (utils.count_lines(overlapping_peaks_fn)))
# Make rejected peak list
out, err = utils.run_pipe([
'intersectBed -wa -v -a %s -b %s' % (pool_peaks, overlapping_peaks_fn)
], rejected_peaks_fn)
print("%d peaks were rejected" % (utils.count_lines(rejected_peaks_fn)))
# Remove temporary files
os.remove(overlap_tr_fn)
os.remove(overlap_pr_fn)
return overlapping_peaks_fn
def main():
args = get_args()
design = args.design
files = args.files
# Create a file handler
handler = logging.FileHandler('consensus_peaks.log')
logger.addHandler(handler)
# Read files as dataframes
design_peaks_df = pd.read_csv(design, sep='\t')
design_files_df = pd.read_csv(files, sep='\t')
# Make a design file for differential binding
design_diff = update_design(design_files_df)
# Find consensus overlap peaks for each experiment
for experiment, df_experiment in design_peaks_df.groupby('experiment_id'):
replicated_peak = overlap(experiment, df_experiment)
design_diff.loc[design_diff.experiment_id == experiment, "peak"] = replicated_peak
# Write out file
design_diff.columns = ['SampleID',