From 06eb1972563856fa3dbcabba1390fb08866d476d Mon Sep 17 00:00:00 2001 From: Brandi Cantarel <brandi.cantarel@utsouthwestern.edu> Date: Mon, 4 Dec 2017 10:08:00 -0600 Subject: [PATCH] somatic union updates --- alignment/indexbams.sh | 30 +++++++++ variants/parse_cnvkit_table.pl | 38 ++++++++++++ variants/svannot.pl | 110 +++++++++++++++++++++++++++++++++ variants/svcalling.sh | 105 +++++++++++++++++++++++-------- variants/unionvcf.pl | 3 + 5 files changed, 259 insertions(+), 27 deletions(-) create mode 100644 alignment/indexbams.sh create mode 100644 variants/parse_cnvkit_table.pl create mode 100644 variants/svannot.pl diff --git a/alignment/indexbams.sh b/alignment/indexbams.sh new file mode 100644 index 0000000..f69697a --- /dev/null +++ b/alignment/indexbams.sh @@ -0,0 +1,30 @@ +#!/bin/bash +#indexbams.sh + +usage() { + echo "-h --Help documentation for markdups.sh" + echo "Example: bash indexbams.sh" + exit 1 +} +OPTIND=1 # Reset OPTIND +while getopts :h opt +do + case $opt in + h) usage;; + esac +done + +shift $(($OPTIND -1)) + +# Check for mandatory options + +if [[ -z $SLURM_CPUS_ON_NODE ]] +then + SLURM_CPUS_ON_NODE=1 +fi +baseDir="`dirname \"$0\"`" + +module load samtools/1.6 +for i in *.bam; do + samtools index -@ $SLURM_CPUS_ON_NODE ${i} +done diff --git a/variants/parse_cnvkit_table.pl b/variants/parse_cnvkit_table.pl new file mode 100644 index 0000000..9867b06 --- /dev/null +++ b/variants/parse_cnvkit_table.pl @@ -0,0 +1,38 @@ +#!/usr/bin/perl -w +#parse_cnvkit_table.pl + +my @keep = ("AKT2","ATM","AURKA","BAP1","BCL2L1","BCL6","BIRC3","BRCA2","CCND1","CCNE1","CD79B","CDK8","CDKN2A","CDKN2B","CEBPA","EGFR","ERBB2","FGF10","FGF14","FGF19","FGF3","FGF4","FLT1","FLT3","FOXP1","GATA6","GNA13","GNAS","IKZF1","IL7R","IRS2","KDM4C","KLHL6","KMT2A","KRAS","LYN","MCL1","MITF","MYC","NOTCH2","PIK3CA","PRKAR1A","PRKDC","PTEN","RB1","RICTOR","RUNX1T1","SDHA","TFDP1","TP53","ZNF217"); + +my %keep = map {$_ => 1} @keep; + + +my @cvncalls = @ARGV; +foreach my $file (@ARGV) { + open IN, "<$file" or die $!; + my $out = $file; + $out =~ s/call.cns/cnvcalls.txt/; + open OUT, ">$out" or die $!; + print OUT join("\t","CHROM","START","END","LEN","CN","DEPTH", + "Probes","Weight","GeneIDs"),"\n"; + my $header = <IN>; + while (my $line = <IN>) { + chomp($line); + my ($chr,$start,$end,$geneids,$log2,$cn,$depth, + $probes,$weight) = split(/\t/,$line); + my %genes; + my @ids = split(/;|,/,$geneids); + foreach my $gid (@ids) { + my ($key,$value) = split(/=/,$gid); + if ($key eq 'ensembl_gn' || $key eq 'identifier') { + $genes{$value} = 1 if $keep{$value}; + } + } + my $newgeneids = join(";", keys %genes); + my $len = sprintf("%.1f",($end-$start)/1000000); + next if ($cn == 2) || scalar(keys %genes) < 1; + print OUT join("\t",$chr,$start,$end,$len,$cn,$depth, + $probes,$weight,$newgeneids),"\n"; + } + close IN; + close OUT; +} diff --git a/variants/svannot.pl b/variants/svannot.pl new file mode 100644 index 0000000..29972af --- /dev/null +++ b/variants/svannot.pl @@ -0,0 +1,110 @@ +#!/usr/bin/perl -w +#svanno.pl + +use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); + +my %opt = (); +my $results = GetOptions (\%opt,'input|i=s','refdb|r=s','help|h'); + +my $vcf = $opt{input}; +my $outfile = $vcf; +$outfile =~ s/vcf/txt/g; +open OUT, ">$outfile" or die $!; +my $gffile = $vcf; +$gffile =~ s/vcf/genefusion.txt/g; +open GF, ">$gffile" or die $!; + +my %lines; +my %eventid; +my $ct = 0; +open IN, "<$vcf" or die $!; +while (my $line = <IN>) { + chomp($line); + if ($line =~ m/^#CHROM/) { + my @header = split(/\t/,$line); + ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$info,$format,@subjacc) = split(/\t/, $line); + } + next if $line =~ m/#/; + my ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$annot,$format,@gts) = split(/\t/, $line); + $ct ++; + my %hash = (); + foreach $a (split(/;/,$annot)) { + my ($key,$val) = split(/=/,$a); + $hash{$key} = $val unless ($hash{$key}); + } + if ($id eq 'N') { + $id = 'NB'.sprintf("%06s",$ct); + } + my $evid = (split(/_/,$id))[0]; + $hash{'END'} = $pos+1 unless $hash{'END'}; + my ($allele,$effect,$impact,$gene,$geneid,$feature, + $featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna, + $cds_pos,$cds_len,$aapos,$aalen,$distance,$err); + next unless $hash{ANN}; + F1:foreach $trx (split(/,/,$hash{ANN})) { + ($allele,$effect,$impact,$gene,$geneid,$feature, + $featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna, + $cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$trx); + next unless ($impact && $impact =~ m/HIGH|MODERATE/); + next if ($effect eq 'sequence_feature'); + last F1; + } + next unless ($impact && $impact =~ m/HIGH|MODERATE/); + next unless ($gene); + next if ($done{$chrom}{$pos}); + $done{$chrom}{$pos} = 1 + @deschead = split(":",$format); + F1:foreach $sample (@subjacc) { + my $allele_info = shift @gts; + @ainfo = split(/:/, $allele_info); + my %gtinfo = (); + foreach $k (0..$#deschead) { + $gtinfo{$deschead[$k]} = $ainfo[$k]; + } + unless ($gtinfo{SU}) { + $gtinfo{SU} = 0; + $gtinfo{SU} = $gtinfo{RV}+$gtinfo{DV} if ($gtinfo{RV} && $gtinfo{DV}); + } + next if ($gtinfo{SU} < 10); + if (lc($alt) =~ m/chr(\w+):(\d+)/i) { + $chr2 = $1; + $end = $2; + if ($chr2 eq $chrom) { + print OUT join("\t",$sample,$gene,$chrom,$pos,$end,$id,$hash{SVTYPE},$effect, + $featureid,$codon,$aa,$rank,$gtinfo{SU}),"\n"; + }elsif ($id =~ m/_\d+/ && $effect =~ m/gene_fusion/) { + $gene =~ s/\&/--/; + my ($left_gene,$right_gene) = split(/--/,$gene); + next unless ($right_gene); + my $left_breakpoint = join(":",$chrom,$pos); + my $right_breakpoint = join(":",'chr'.$chr2,$end); + my ($leftstrand,$rightstrand) = split(//,$hash{STRANDS}); + print GF join("\t",$sample,$gene,$left_gene,$right_gene, $left_breakpoint, + $right_breakpoint,$leftstrand,$rightstrand,'',$gtinfo{SU}),"\n"; + }else { + print OUT join("\t",$sample,$gene,$chrom,$pos,$end,$ id,$hash{SVTYPE},$effect, + $featureid,$codon,$aa,$rank,$gtinfo{SU}),"\n"; + } + }else { + if ($hash{CHR2} && $hash{CHR2} eq $chrom) { + $end = $hash{END}; + print OUT join("\t",$sample,$gene,$chrom,$pos,$end,$id,$hash{SVTYPE},$effect, + $featureid,$codon,$aa,$rank,$gtinfo{SU}),"\n"; + }elsif ($hash{CHR2} && $hash{CHR2} ne $chrom) { + $gene =~ s/\&/--/; + my ($left_gene,$right_gene) = split(/--/,$gene); + my $left_breakpoint = join(":",$chrom,$pos); + my $right_breakpoint = join(":",$hash{CHR2},$hash{END}); + print GF join("\t",$sample,$gene,$left_gene,$right_gene, $left_breakpoint,$right_breakpoint,'','','',$gtinfo{SU}),"\n"; + }unless ($hash{CHR2}) { + $hash{END} = $pos + 1 unless ($hash{END}); + print OUT join("\t",$sample,$gene,$chrom,$pos,$hash{END},$id,$hash{SVTYPE},$effect, + $featureid,$codon,$aa,$rank,$gtinfo{SU}),"\n"; + } + } + } +} + +close IN; diff --git a/variants/svcalling.sh b/variants/svcalling.sh index 50c9590..78029b1 100644 --- a/variants/svcalling.sh +++ b/variants/svcalling.sh @@ -1,4 +1,4 @@ -!/bin/bash +#!/bin/bash #svcalling.sh usage() { @@ -6,16 +6,19 @@ usage() { echo "-r --Path to Reference Genome with the file genome.fa" echo "-p --Prefix for output file name" echo "-b --Bam File" + echo "-n --Reference Bam File" echo "Example: bash svcalling.sh -p prefix -r /path/GRCh38 -a gatk" exit 1 } OPTIND=1 # Reset OPTIND -while getopts :r:a:b:p:h opt +while getopts :r:a:b:n:k:p:h opt do case $opt in r) index_path=$OPTARG;; p) pair_id=$OPTARG;; b) sbam=$OPTARG;; + k) keepid=$OPTARG;; + n) normal=$OPTARG;; h) usage;; esac done @@ -43,26 +46,55 @@ else usage fi -baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" module load speedseq/20160506 novoBreak/v1.1.3 delly2/v0.7.7-multi samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14 -#RUN DELLY mkdir temp -delly2 call -t BND -o delly_translocations.bcf -q 30 -g ${reffa} ${sbam} -delly2 call -t DUP -o delly_duplications.bcf -q 30 -g ${reffa} ${sbam} -delly2 call -t INV -o delly_inversions.bcf -q 30 -g ${reffa} ${sbam} -delly2 call -t DEL -o delly_deletion.bcf -q 30 -g ${reffa} ${sbam} -delly2 call -t INS -o delly_insertion.bcf -q 30 -g ${reffa} ${sbam} -delly2 filter -t BND -o delly_tra.bcf -f germline delly_translocations.bcf -delly2 filter -t DUP -o delly_dup.bcf -f germline delly_translocations.bcf -delly2 filter -t INV -o delly_inv.bcf -f germline delly_translocations.bcf -delly2 filter -t DEL -o delly_del.bcf -f germline delly_translocations.bcf -delly2 filter -t INS -o delly_ins.bcf -f germline delly_translocations.bcf +if [[ -n ${normal} ]] +then + run_novoBreak.sh /cm/shared/apps/novoBreak/novoBreak_distribution_v1.1.3rc ${reffa} ${sbam} ${normal} $SLURM_CPUS_ON_NODE + perl $baseDir/vcf2bed.sv.pl novoBreak.pass.flt.vcf |sort -T temp -V -k 1,1 -k 2,2n > novobreak.bed + mv novoBreak.pass.flt.vcf ${pair_id}.novobreak.vcf + bgzip ${pair_id}.novobreak.vcf +fi +if [[ -n ${normal} ]] +then + #RUN DELLY + delly2 call -t BND -o delly_translocations.bcf -q 30 -g ${reffa} ${sbam} ${normal} + delly2 call -t DUP -o delly_duplications.bcf -q 30 -g ${reffa} ${sbam} ${normal} + delly2 call -t INV -o delly_inversions.bcf -q 30 -g ${reffa} ${sbam} ${normal} + delly2 call -t DEL -o delly_deletion.bcf -q 30 -g ${reffa} ${sbam} ${normal} + delly2 call -t INS -o delly_insertion.bcf -q 30 -g ${reffa} ${sbam} ${normal} + delly2 filter -t BND -o delly_tra.bcf -f somatic -s samples.tsv delly_translocations.bcf + delly2 filter -t DUP -o delly_dup.bcf -f somatic -s samples.tsv delly_duplications.bcf + delly2 filter -t INV -o delly_inv.bcf -f somatic -s samples.tsv delly_inversions.bcf + delly2 filter -t DEL -o delly_del.bcf -f somatic -s samples.tsv delly_deletion.bcf + delly2 filter -t INS -o delly_ins.bcf -f somatic -s samples.tsv delly_insertion.bcf +else +#RUN DELLY + delly2 call -t BND -o delly_translocations.bcf -q 30 -g ${reffa} ${sbam} + delly2 call -t DUP -o delly_duplications.bcf -q 30 -g ${reffa} ${sbam} + delly2 call -t INV -o delly_inversions.bcf -q 30 -g ${reffa} ${sbam} + delly2 call -t DEL -o delly_deletion.bcf -q 30 -g ${reffa} ${sbam} + delly2 call -t INS -o delly_insertion.bcf -q 30 -g ${reffa} ${sbam} + delly2 filter -t BND -o delly_tra.bcf -f germline delly_translocations.bcf + delly2 filter -t DUP -o delly_dup.bcf -f germline delly_duplications.bcf + delly2 filter -t INV -o delly_inv.bcf -f germline delly_inversions.bcf + delly2 filter -t DEL -o delly_del.bcf -f germline delly_deletion.bcf + delly2 filter -t INS -o delly_ins.bcf -f germline delly_insertion.bcf +fi #MERGE DELLY AND MAKE BED -bcftools concat -a -O v delly_dup.bcf delly_inv.bcf delly_tra.bcf delly_del.bcf delly_ins.bcf | vcf-sort > ${pair_id}.delly.vcf -perl $baseDir/vcf2bed.sv.pl ${pair_id}.delly.vcf > delly.bed -bgzip ${pair_id}.delly.vcf + +bcftools concat -a -O v delly_dup.bcf delly_inv.bcf delly_tra.bcf delly_del.bcf delly_ins.bcf| vcf-sort -t temp > delly.vcf +perl $baseDir/vcf2bed.sv.pl delly.vcf > delly.bed +bgzip delly.vcf +if [[ -n ${normal} ]] +then + tabix delly.vcf.gz + bcftools view -O z -o delly.vcf.gz -s ${keepid} ${pair_id}.delly.vcf.gz +else + mv delly.vcf.gz ${pair_id}.delly.vcf.gz +fi tabix ${pair_id}.delly.vcf.gz #MAKE FILES FOR LUMPY @@ -70,19 +102,38 @@ samtools sort -@ $SLURM_CPUS_ON_NODE -n -o namesort.bam ${sbam} samtools view -h namesort.bam | samblaster -M -a --excludeDups --addMateTags --maxSplitCount 2 --minNonOverlap 20 -d discordants.sam -s splitters.sam > temp.sam gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" splitters.sam | samtools view -S -b - | samtools sort -o splitters.bam - gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" discordants.sam | samtools view -S -b - | samtools sort -o discordants.bam - - #RUN LUMPY -speedseq sv -t $SLURM_CPUS_ON_NODE -o ${pair_id}.sssv -R ${reffa} -B ${sbam} -D discordants.bam -S splitters.bam -x ${index_path}/exclude_alt.bed +if [[ -n ${normal} ]] +then + samtools sort -@ $SLURM_CPUS_ON_NODE -n -o namesort.bam ${normal} + samtools view -h namesort.bam | samblaster -M -a --excludeDups --addMateTags --maxSplitCount 2 --minNonOverlap 20 -d discordants.sam -s splitters.sam > temp.sam + gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" splitters.sam | samtools view -S -b - | samtools sort -o normal.splitters.bam - + gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" discordants.sam | samtools view -S -b - | samtools sort -o normal.discordants.bam - + speedseq sv -t $SLURM_CPUS_ON_NODE -o sssv -R ${reffa} -B ${normal},${sbam} -D normal.discordants.bam,discordants.bam -S normal.splitters.bam,splitters.bam -x ${index_path}/exclude_alt.bed + bcftools view -O z -o sssv.vcf.gz -s ${keepid} ${pair_id}.sssv.sv.vcf.gz +else + speedseq sv -t $SLURM_CPUS_ON_NODE -o ${pair_id}.sssv -R ${reffa} -B ${sbam} -D discordants.bam -S splitters.bam -x ${index_path}/exclude_alt.bed +fi + java -jar $SNPEFF_HOME/SnpSift.jar filter "GEN[0].SU > 2" ${pair_id}.sssv.sv.vcf.gz > lumpy.vcf perl $baseDir/vcf2bed.sv.pl lumpy.vcf > lumpy.bed #COMPARE DELLY & LUMPY -bedtools intersect -v -a lumpy.bed -b delly.bed > lumpy_only.bed -bedtools intersect -header -b lumpy_only.bed -a lumpy.vcf |bgzip > lumpy_only.vcf.gz -vcf-concat ${pair_id}.delly.vcf.gz lumpy_only.vcf.gz |vcf-sort -t temp > ${pair_id}.sv.vcf -perl $baseDir/vcf2bed.sv.pl ${pair_id}.sv.vcf |sort -V -k 1,1 -k 2,2n | grep -v 'alt' |grep -v 'random' |uniq > svs.bed -bedtools intersect -header -wb -a svs.bed -b ${index_path}/gencode.exons.bed > exonoverlap_sv.txt -bedtools intersect -v -header -wb -a svs.bed -b ${index_path}/gencode.exons.bed | bedtools intersect -header -wb -a stdin -b ${index_path}/gencode.genes.chr.bed > geneoverlap_sv.txt -perl $baseDir/annot_sv.pl -r ${index_path} -i ${pair_id}.sv.vcf -bgzip ${pair_id}.sv.vcf +if [[ -n ${normal} ]] +then + bedtools multiinter -cluster -header -names novobreak delly lumpy -i novobreak.bed delly.bed lumpy.bed > sv.intersect.bed + grep novobreak sv.intersect.bed |cut -f 1,2,3 |sort -V -k 1,1 -k 2,2n |grep -v start | bedtools intersect -header -b stdin -a ${tid}_${nid}.novobreak.vcf.gz | perl -p -e 's/SPIKEIN/${tid}/' |bgzip > svt1.vcf.gz +else + bedtools multiinter -cluster -header -names delly lumpy -i delly.bed lumpy.bed > sv.intersect.bed +fi +grep delly sv.intersect.bed |cut -f 1,2,3 |sort -V -k 1,1 -k 2,2n |grep -v 'start' |grep -v 'novobreak' | bedtools intersect -header -b stdin -a ${pair_id}.delly.vcf.gz |bgzip > svt2.vcf.gz +grep lumpy sv.intersect.bed |cut -f 1,2,3 |sort -V -k 1,1 -k 2,2n |grep -v 'start' |grep -v 'delly' |grep -v 'novobreak' | bedtools intersect -header -b stdin -a ${pair_id}.sssv.sv.vcf.gz |bgzip > svt3.vcf.gz + +vcf-concat svt*.vcf.gz | vcf-sort -t temp > ${pair_id}.sv.vcf +java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.sv.vcf > ${pair_id}.sv.annot.vcf +perl $baseDir/svannot.pl -i ${pair_id}.sv.annot.vcf +bgzip ${pair_id}.sv.annot.vcf +tabix ${pair_id}.sv.annot.vcf.gz + +fi diff --git a/variants/unionvcf.pl b/variants/unionvcf.pl index 277855f..699a7bb 100755 --- a/variants/unionvcf.pl +++ b/variants/unionvcf.pl @@ -98,6 +98,9 @@ foreach $vcf (@vcffiles) { close VCF; } my @callers = ('ssvar','platypus','sam','gatk','hotspot'); +if (grep(/mutect/,@vcffiles)) { + @callers = ('sssom','pmutect','shimmer','stelka','varscan','virmid'); +} F1:foreach $chr (sort {$a cmp $b} keys %lines) { F2:foreach $pos (sort {$a <=> $b} keys %{$lines{$chr}}) { my $callset = join(",",keys %{$lines{$chr}{$pos}}); -- GitLab