Skip to content
Snippets Groups Projects
Commit 06eb1972 authored by Brandi Cantarel's avatar Brandi Cantarel
Browse files

somatic union updates

parent b9d65518
No related merge requests found
#!/bin/bash
#indexbams.sh
usage() {
echo "-h --Help documentation for markdups.sh"
echo "Example: bash indexbams.sh"
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :h opt
do
case $opt in
h) usage;;
esac
done
shift $(($OPTIND -1))
# Check for mandatory options
if [[ -z $SLURM_CPUS_ON_NODE ]]
then
SLURM_CPUS_ON_NODE=1
fi
baseDir="`dirname \"$0\"`"
module load samtools/1.6
for i in *.bam; do
samtools index -@ $SLURM_CPUS_ON_NODE ${i}
done
#!/usr/bin/perl -w
#parse_cnvkit_table.pl
my @keep = ("AKT2","ATM","AURKA","BAP1","BCL2L1","BCL6","BIRC3","BRCA2","CCND1","CCNE1","CD79B","CDK8","CDKN2A","CDKN2B","CEBPA","EGFR","ERBB2","FGF10","FGF14","FGF19","FGF3","FGF4","FLT1","FLT3","FOXP1","GATA6","GNA13","GNAS","IKZF1","IL7R","IRS2","KDM4C","KLHL6","KMT2A","KRAS","LYN","MCL1","MITF","MYC","NOTCH2","PIK3CA","PRKAR1A","PRKDC","PTEN","RB1","RICTOR","RUNX1T1","SDHA","TFDP1","TP53","ZNF217");
my %keep = map {$_ => 1} @keep;
my @cvncalls = @ARGV;
foreach my $file (@ARGV) {
open IN, "<$file" or die $!;
my $out = $file;
$out =~ s/call.cns/cnvcalls.txt/;
open OUT, ">$out" or die $!;
print OUT join("\t","CHROM","START","END","LEN","CN","DEPTH",
"Probes","Weight","GeneIDs"),"\n";
my $header = <IN>;
while (my $line = <IN>) {
chomp($line);
my ($chr,$start,$end,$geneids,$log2,$cn,$depth,
$probes,$weight) = split(/\t/,$line);
my %genes;
my @ids = split(/;|,/,$geneids);
foreach my $gid (@ids) {
my ($key,$value) = split(/=/,$gid);
if ($key eq 'ensembl_gn' || $key eq 'identifier') {
$genes{$value} = 1 if $keep{$value};
}
}
my $newgeneids = join(";", keys %genes);
my $len = sprintf("%.1f",($end-$start)/1000000);
next if ($cn == 2) || scalar(keys %genes) < 1;
print OUT join("\t",$chr,$start,$end,$len,$cn,$depth,
$probes,$weight,$newgeneids),"\n";
}
close IN;
close OUT;
}
#!/usr/bin/perl -w
#svanno.pl
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
my %opt = ();
my $results = GetOptions (\%opt,'input|i=s','refdb|r=s','help|h');
my $vcf = $opt{input};
my $outfile = $vcf;
$outfile =~ s/vcf/txt/g;
open OUT, ">$outfile" or die $!;
my $gffile = $vcf;
$gffile =~ s/vcf/genefusion.txt/g;
open GF, ">$gffile" or die $!;
my %lines;
my %eventid;
my $ct = 0;
open IN, "<$vcf" or die $!;
while (my $line = <IN>) {
chomp($line);
if ($line =~ m/^#CHROM/) {
my @header = split(/\t/,$line);
($chrom, $pos,$id,$ref,$alt,$score,
$filter,$info,$format,@subjacc) = split(/\t/, $line);
}
next if $line =~ m/#/;
my ($chrom, $pos,$id,$ref,$alt,$score,
$filter,$annot,$format,@gts) = split(/\t/, $line);
$ct ++;
my %hash = ();
foreach $a (split(/;/,$annot)) {
my ($key,$val) = split(/=/,$a);
$hash{$key} = $val unless ($hash{$key});
}
if ($id eq 'N') {
$id = 'NB'.sprintf("%06s",$ct);
}
my $evid = (split(/_/,$id))[0];
$hash{'END'} = $pos+1 unless $hash{'END'};
my ($allele,$effect,$impact,$gene,$geneid,$feature,
$featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna,
$cds_pos,$cds_len,$aapos,$aalen,$distance,$err);
next unless $hash{ANN};
F1:foreach $trx (split(/,/,$hash{ANN})) {
($allele,$effect,$impact,$gene,$geneid,$feature,
$featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna,
$cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$trx);
next unless ($impact && $impact =~ m/HIGH|MODERATE/);
next if ($effect eq 'sequence_feature');
last F1;
}
next unless ($impact && $impact =~ m/HIGH|MODERATE/);
next unless ($gene);
next if ($done{$chrom}{$pos});
$done{$chrom}{$pos} = 1
@deschead = split(":",$format);
F1:foreach $sample (@subjacc) {
my $allele_info = shift @gts;
@ainfo = split(/:/, $allele_info);
my %gtinfo = ();
foreach $k (0..$#deschead) {
$gtinfo{$deschead[$k]} = $ainfo[$k];
}
unless ($gtinfo{SU}) {
$gtinfo{SU} = 0;
$gtinfo{SU} = $gtinfo{RV}+$gtinfo{DV} if ($gtinfo{RV} && $gtinfo{DV});
}
next if ($gtinfo{SU} < 10);
if (lc($alt) =~ m/chr(\w+):(\d+)/i) {
$chr2 = $1;
$end = $2;
if ($chr2 eq $chrom) {
print OUT join("\t",$sample,$gene,$chrom,$pos,$end,$id,$hash{SVTYPE},$effect,
$featureid,$codon,$aa,$rank,$gtinfo{SU}),"\n";
}elsif ($id =~ m/_\d+/ && $effect =~ m/gene_fusion/) {
$gene =~ s/\&/--/;
my ($left_gene,$right_gene) = split(/--/,$gene);
next unless ($right_gene);
my $left_breakpoint = join(":",$chrom,$pos);
my $right_breakpoint = join(":",'chr'.$chr2,$end);
my ($leftstrand,$rightstrand) = split(//,$hash{STRANDS});
print GF join("\t",$sample,$gene,$left_gene,$right_gene, $left_breakpoint,
$right_breakpoint,$leftstrand,$rightstrand,'',$gtinfo{SU}),"\n";
}else {
print OUT join("\t",$sample,$gene,$chrom,$pos,$end,$ id,$hash{SVTYPE},$effect,
$featureid,$codon,$aa,$rank,$gtinfo{SU}),"\n";
}
}else {
if ($hash{CHR2} && $hash{CHR2} eq $chrom) {
$end = $hash{END};
print OUT join("\t",$sample,$gene,$chrom,$pos,$end,$id,$hash{SVTYPE},$effect,
$featureid,$codon,$aa,$rank,$gtinfo{SU}),"\n";
}elsif ($hash{CHR2} && $hash{CHR2} ne $chrom) {
$gene =~ s/\&/--/;
my ($left_gene,$right_gene) = split(/--/,$gene);
my $left_breakpoint = join(":",$chrom,$pos);
my $right_breakpoint = join(":",$hash{CHR2},$hash{END});
print GF join("\t",$sample,$gene,$left_gene,$right_gene, $left_breakpoint,$right_breakpoint,'','','',$gtinfo{SU}),"\n";
}unless ($hash{CHR2}) {
$hash{END} = $pos + 1 unless ($hash{END});
print OUT join("\t",$sample,$gene,$chrom,$pos,$hash{END},$id,$hash{SVTYPE},$effect,
$featureid,$codon,$aa,$rank,$gtinfo{SU}),"\n";
}
}
}
}
close IN;
!/bin/bash
#!/bin/bash
#svcalling.sh
usage() {
......@@ -6,16 +6,19 @@ usage() {
echo "-r --Path to Reference Genome with the file genome.fa"
echo "-p --Prefix for output file name"
echo "-b --Bam File"
echo "-n --Reference Bam File"
echo "Example: bash svcalling.sh -p prefix -r /path/GRCh38 -a gatk"
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :r:a:b:p:h opt
while getopts :r:a:b:n:k:p:h opt
do
case $opt in
r) index_path=$OPTARG;;
p) pair_id=$OPTARG;;
b) sbam=$OPTARG;;
k) keepid=$OPTARG;;
n) normal=$OPTARG;;
h) usage;;
esac
done
......@@ -43,26 +46,55 @@ else
usage
fi
baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
module load speedseq/20160506 novoBreak/v1.1.3 delly2/v0.7.7-multi samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14
#RUN DELLY
mkdir temp
delly2 call -t BND -o delly_translocations.bcf -q 30 -g ${reffa} ${sbam}
delly2 call -t DUP -o delly_duplications.bcf -q 30 -g ${reffa} ${sbam}
delly2 call -t INV -o delly_inversions.bcf -q 30 -g ${reffa} ${sbam}
delly2 call -t DEL -o delly_deletion.bcf -q 30 -g ${reffa} ${sbam}
delly2 call -t INS -o delly_insertion.bcf -q 30 -g ${reffa} ${sbam}
delly2 filter -t BND -o delly_tra.bcf -f germline delly_translocations.bcf
delly2 filter -t DUP -o delly_dup.bcf -f germline delly_translocations.bcf
delly2 filter -t INV -o delly_inv.bcf -f germline delly_translocations.bcf
delly2 filter -t DEL -o delly_del.bcf -f germline delly_translocations.bcf
delly2 filter -t INS -o delly_ins.bcf -f germline delly_translocations.bcf
if [[ -n ${normal} ]]
then
run_novoBreak.sh /cm/shared/apps/novoBreak/novoBreak_distribution_v1.1.3rc ${reffa} ${sbam} ${normal} $SLURM_CPUS_ON_NODE
perl $baseDir/vcf2bed.sv.pl novoBreak.pass.flt.vcf |sort -T temp -V -k 1,1 -k 2,2n > novobreak.bed
mv novoBreak.pass.flt.vcf ${pair_id}.novobreak.vcf
bgzip ${pair_id}.novobreak.vcf
fi
if [[ -n ${normal} ]]
then
#RUN DELLY
delly2 call -t BND -o delly_translocations.bcf -q 30 -g ${reffa} ${sbam} ${normal}
delly2 call -t DUP -o delly_duplications.bcf -q 30 -g ${reffa} ${sbam} ${normal}
delly2 call -t INV -o delly_inversions.bcf -q 30 -g ${reffa} ${sbam} ${normal}
delly2 call -t DEL -o delly_deletion.bcf -q 30 -g ${reffa} ${sbam} ${normal}
delly2 call -t INS -o delly_insertion.bcf -q 30 -g ${reffa} ${sbam} ${normal}
delly2 filter -t BND -o delly_tra.bcf -f somatic -s samples.tsv delly_translocations.bcf
delly2 filter -t DUP -o delly_dup.bcf -f somatic -s samples.tsv delly_duplications.bcf
delly2 filter -t INV -o delly_inv.bcf -f somatic -s samples.tsv delly_inversions.bcf
delly2 filter -t DEL -o delly_del.bcf -f somatic -s samples.tsv delly_deletion.bcf
delly2 filter -t INS -o delly_ins.bcf -f somatic -s samples.tsv delly_insertion.bcf
else
#RUN DELLY
delly2 call -t BND -o delly_translocations.bcf -q 30 -g ${reffa} ${sbam}
delly2 call -t DUP -o delly_duplications.bcf -q 30 -g ${reffa} ${sbam}
delly2 call -t INV -o delly_inversions.bcf -q 30 -g ${reffa} ${sbam}
delly2 call -t DEL -o delly_deletion.bcf -q 30 -g ${reffa} ${sbam}
delly2 call -t INS -o delly_insertion.bcf -q 30 -g ${reffa} ${sbam}
delly2 filter -t BND -o delly_tra.bcf -f germline delly_translocations.bcf
delly2 filter -t DUP -o delly_dup.bcf -f germline delly_duplications.bcf
delly2 filter -t INV -o delly_inv.bcf -f germline delly_inversions.bcf
delly2 filter -t DEL -o delly_del.bcf -f germline delly_deletion.bcf
delly2 filter -t INS -o delly_ins.bcf -f germline delly_insertion.bcf
fi
#MERGE DELLY AND MAKE BED
bcftools concat -a -O v delly_dup.bcf delly_inv.bcf delly_tra.bcf delly_del.bcf delly_ins.bcf | vcf-sort > ${pair_id}.delly.vcf
perl $baseDir/vcf2bed.sv.pl ${pair_id}.delly.vcf > delly.bed
bgzip ${pair_id}.delly.vcf
bcftools concat -a -O v delly_dup.bcf delly_inv.bcf delly_tra.bcf delly_del.bcf delly_ins.bcf| vcf-sort -t temp > delly.vcf
perl $baseDir/vcf2bed.sv.pl delly.vcf > delly.bed
bgzip delly.vcf
if [[ -n ${normal} ]]
then
tabix delly.vcf.gz
bcftools view -O z -o delly.vcf.gz -s ${keepid} ${pair_id}.delly.vcf.gz
else
mv delly.vcf.gz ${pair_id}.delly.vcf.gz
fi
tabix ${pair_id}.delly.vcf.gz
#MAKE FILES FOR LUMPY
......@@ -70,19 +102,38 @@ samtools sort -@ $SLURM_CPUS_ON_NODE -n -o namesort.bam ${sbam}
samtools view -h namesort.bam | samblaster -M -a --excludeDups --addMateTags --maxSplitCount 2 --minNonOverlap 20 -d discordants.sam -s splitters.sam > temp.sam
gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" splitters.sam | samtools view -S -b - | samtools sort -o splitters.bam -
gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" discordants.sam | samtools view -S -b - | samtools sort -o discordants.bam -
#RUN LUMPY
speedseq sv -t $SLURM_CPUS_ON_NODE -o ${pair_id}.sssv -R ${reffa} -B ${sbam} -D discordants.bam -S splitters.bam -x ${index_path}/exclude_alt.bed
if [[ -n ${normal} ]]
then
samtools sort -@ $SLURM_CPUS_ON_NODE -n -o namesort.bam ${normal}
samtools view -h namesort.bam | samblaster -M -a --excludeDups --addMateTags --maxSplitCount 2 --minNonOverlap 20 -d discordants.sam -s splitters.sam > temp.sam
gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" splitters.sam | samtools view -S -b - | samtools sort -o normal.splitters.bam -
gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" discordants.sam | samtools view -S -b - | samtools sort -o normal.discordants.bam -
speedseq sv -t $SLURM_CPUS_ON_NODE -o sssv -R ${reffa} -B ${normal},${sbam} -D normal.discordants.bam,discordants.bam -S normal.splitters.bam,splitters.bam -x ${index_path}/exclude_alt.bed
bcftools view -O z -o sssv.vcf.gz -s ${keepid} ${pair_id}.sssv.sv.vcf.gz
else
speedseq sv -t $SLURM_CPUS_ON_NODE -o ${pair_id}.sssv -R ${reffa} -B ${sbam} -D discordants.bam -S splitters.bam -x ${index_path}/exclude_alt.bed
fi
java -jar $SNPEFF_HOME/SnpSift.jar filter "GEN[0].SU > 2" ${pair_id}.sssv.sv.vcf.gz > lumpy.vcf
perl $baseDir/vcf2bed.sv.pl lumpy.vcf > lumpy.bed
#COMPARE DELLY & LUMPY
bedtools intersect -v -a lumpy.bed -b delly.bed > lumpy_only.bed
bedtools intersect -header -b lumpy_only.bed -a lumpy.vcf |bgzip > lumpy_only.vcf.gz
vcf-concat ${pair_id}.delly.vcf.gz lumpy_only.vcf.gz |vcf-sort -t temp > ${pair_id}.sv.vcf
perl $baseDir/vcf2bed.sv.pl ${pair_id}.sv.vcf |sort -V -k 1,1 -k 2,2n | grep -v 'alt' |grep -v 'random' |uniq > svs.bed
bedtools intersect -header -wb -a svs.bed -b ${index_path}/gencode.exons.bed > exonoverlap_sv.txt
bedtools intersect -v -header -wb -a svs.bed -b ${index_path}/gencode.exons.bed | bedtools intersect -header -wb -a stdin -b ${index_path}/gencode.genes.chr.bed > geneoverlap_sv.txt
perl $baseDir/annot_sv.pl -r ${index_path} -i ${pair_id}.sv.vcf
bgzip ${pair_id}.sv.vcf
if [[ -n ${normal} ]]
then
bedtools multiinter -cluster -header -names novobreak delly lumpy -i novobreak.bed delly.bed lumpy.bed > sv.intersect.bed
grep novobreak sv.intersect.bed |cut -f 1,2,3 |sort -V -k 1,1 -k 2,2n |grep -v start | bedtools intersect -header -b stdin -a ${tid}_${nid}.novobreak.vcf.gz | perl -p -e 's/SPIKEIN/${tid}/' |bgzip > svt1.vcf.gz
else
bedtools multiinter -cluster -header -names delly lumpy -i delly.bed lumpy.bed > sv.intersect.bed
fi
grep delly sv.intersect.bed |cut -f 1,2,3 |sort -V -k 1,1 -k 2,2n |grep -v 'start' |grep -v 'novobreak' | bedtools intersect -header -b stdin -a ${pair_id}.delly.vcf.gz |bgzip > svt2.vcf.gz
grep lumpy sv.intersect.bed |cut -f 1,2,3 |sort -V -k 1,1 -k 2,2n |grep -v 'start' |grep -v 'delly' |grep -v 'novobreak' | bedtools intersect -header -b stdin -a ${pair_id}.sssv.sv.vcf.gz |bgzip > svt3.vcf.gz
vcf-concat svt*.vcf.gz | vcf-sort -t temp > ${pair_id}.sv.vcf
java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.sv.vcf > ${pair_id}.sv.annot.vcf
perl $baseDir/svannot.pl -i ${pair_id}.sv.annot.vcf
bgzip ${pair_id}.sv.annot.vcf
tabix ${pair_id}.sv.annot.vcf.gz
fi
......@@ -98,6 +98,9 @@ foreach $vcf (@vcffiles) {
close VCF;
}
my @callers = ('ssvar','platypus','sam','gatk','hotspot');
if (grep(/mutect/,@vcffiles)) {
@callers = ('sssom','pmutect','shimmer','stelka','varscan','virmid');
}
F1:foreach $chr (sort {$a cmp $b} keys %lines) {
F2:foreach $pos (sort {$a <=> $b} keys %{$lines{$chr}}) {
my $callset = join(",",keys %{$lines{$chr}{$pos}});
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment