diff --git a/variants/filter_svaba.pl b/variants/filter_svaba.pl index 474046b113fd218d2eea7dacc2f2c9e7f8876fe0..0068be0f51812daf5f94cf10e683351f28122736 100755 --- a/variants/filter_svaba.pl +++ b/variants/filter_svaba.pl @@ -4,7 +4,7 @@ #module load vcftools/0.1.14 samtools/1.6 bedtools/2.26.0 use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); my %opt = (); -my $results = GetOptions (\%opt,'in|i=s','pid|p=s','tumor|t=s'); +my $results = GetOptions (\%opt,'in|i=s','pid|p=s','tumor|t=s','sv|s=s'); open VCFOUT, ">$opt{pid}\.svaba.vcf" or die $!; open DELFUS, ">$opt{pid}\.svaba.potentialfusion.txt" or die $!; @@ -109,9 +109,12 @@ W1:while (my $line = <IN>) { my $newannot = join(";",@nannot); if ($hash{SVTYPE} eq 'INS' || ($hash{SVTYPE} eq 'DEL' && $keepforvcf !~ m/&/)) { + if ($filter =~ m/LOWMAPQ|LowQual/i) { + $filter = 'FailedQC'.$filter; + } print VCFOUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$newannot, $format,@gts),"\n"; - }elsif ($hash{SPAN} && $hash{SPAN} > 9999) { + }elsif ($hash{SVTYPE} eq 'DEL' && $hash{SPAN} && $hash{SPAN} > 9999) { my ($allele,$effect,$impact,$gene,$geneid,$feature, $featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna, $cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$keeptrx); @@ -119,3 +122,128 @@ W1:while (my $line = <IN>) { } } close IN; + +my %svpairs; + +open IN, "gunzip -c $opt{sv} |" or die $!; + +W1:while (my $line = <IN>) { + chomp($line); + if ($line =~ m/^#/) { + if ($line =~ m/^#CHROM/) { + my @header = split(/\t/,$line); + ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$info,$format,@gtheader) = split(/\t/, $line); + unless ($opt{tumor}) { + if (grep(/T_DNA/,@gtheader)) { + my @tsamps = grep(/T_DNA/,@gtheader); + $opt{tumor} = $tsamps[0]; + }else { + $opt{tumor} = $gtheader[0]; + } + } + } + next; + } + my ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$annot,$format,@gts) = split(/\t/, $line); + next if ($ref =~ m/\./ || $alt =~ m/\./ || $alt=~ m/,X/); + my %hash = (); + foreach $a (split(/;/,$annot)) { + my ($key,$val) = split(/=/,$a); + $hash{$key} = $val unless ($hash{$key}); + } + if ($alt =~ m/CHR(\w+):(\d+)/i) { + $hash{CHR2} = 'chr'.$1; + $hash{END} = $2; + } + next unless ($hash{ANN}); + next unless ($hash{ANN} =~ m/HIGH|MODERATE|LOW/); + my %gtinfo = (); + my @deschead = split(/:/,$format); + F1:foreach my $k (0..$#gtheader) { + my $subjid = $gtheader[$k]; + my $allele_info = $gts[$k]; + my @ainfo = split(/:/, $allele_info); + my @mutallfreq = (); + foreach my $k (0..$#ainfo) { + $gtinfo{$subjid}{$deschead[$k]} = $ainfo[$k]; + } + $gtinfo{$subjid}{DP} = (split(/,/,$gtinfo{$subjid}{DP}))[0] if ($gtinfo{$subjid}{DP}); + next F1 unless ($gtinfo{$subjid}{DP} && $gtinfo{$subjid}{DP} ne '.' && $gtinfo{$subjid}{DP} >= 1); + my @altct = split(/,/,$gtinfo{$subjid}{AD}); + my $refct = shift @altct; + @altct2 = split(/,/,$gtinfo{$subjid}{AO}); + if (scalar(@altct) ne scalar(@altct2)) { + warn "Inconsistent Allele counts @ $chrom,$pos,$alt,$gtinfo{$subjid}{AD},$gtinfo{$subjid}{AO}\n"; + } + my $total = $refct; + foreach my $act (@altct) { + next if ($act eq '.'); + $total += $act; + push @mutallfreq, sprintf("%.4f",$act/$gtinfo{$subjid}{DP}); + } + $gtinfo{$subjid}{MAF} = \@mutallfreq; + } + next unless ($gtinfo{$opt{tumor}}{DP} && $gtinfo{$opt{tumor}}{DP} ne '.' && $gtinfo{$opt{tumor}}{DP} >= 20); + unless ($gtinfo{$opt{tumor}}{AO} =~ m/\d+/ && $gtinfo{$opt{tumor}}{AD} =~ m/,/) { + warn "Missing Alt:$line\n"; + } + @tumormaf = @{$gtinfo{$opt{tumor}}{MAF}}; + @tumoraltct = split(/,/,$gtinfo{$opt{tumor}}{AO}); + next if ($tumoraltct[0] eq '.'); + $hash{AF} = join(",",@tumormaf); + next if ($tumoraltct[0] < 20); + next if ($tumormaf[0] < 0.01); + my $keepforvcf = 0; + my $keeptrx; + F1:foreach $trx (split(/,/,$hash{ANN})) { + my ($allele,$effect,$impact,$gene,$geneid,$feature, + $featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna, + $cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$trx); + next unless ($impact =~ m/HIGH|MODERATE/ || $effect =~ /splice/i); + next if($effect eq 'sequence_feature'); + $keeptrx = $trx; + $keepforvcf = $gene; + last F1; + } + next unless $keepforvcf; + my @nannot; + foreach $info (sort {$a cmp $b} keys %hash) { + if (defined $hash{$info}) { + push @nannot, $info."=".$hash{$info}; + }else { + push @nannot, $info; + } + } + my $newannot = join(";",@nannot); + my ($svid,$svpair) = split(/:/,$id); + my $newline = join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$newannot,$format,@gts); + my ($allele,$effect,$impact,$gene,$geneid,$feature, + $featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna, + $cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$keeptrx); + my $fusionline = join("\t",$chrom,$pos,$hash{CHR2},$hash{END},$effect,$gene,$biotype,$filter,$format,@gts); + $svpairs{$svid}{$svpair} = {vcfline=>$line,fusionline=>$fusionline, + gene=>$keepforvcf,alt=>$alt,span=>$hash{SPAN}}; +} +close IN; + + +foreach my $id (keys %svpairs) { + my $alt1 = $svpairs{$id}{1}{alt}; + my $alt2 = $svpairs{$id}{2}{alt}; + my $svtype; + if ($alt1 =~ m/^\w\[/ && $alt2 =~ m/^\]/) { + $svtype = 'DEL'; + }elsif ($alt2 =~ m/^\w\[/ && $alt1 =~ m/^\]/) { + $svtype = 'INS'; + } + if ($svtype eq 'INS' || ($svtype eq 'DEL' && $svpairs{$id}{1}{gene} !~ m/&/)) { + if ($filter =~ m/LOWMAPQ|LowQual/i) { + $filter = 'FailedQC'.$filter; + } + print VCFOUT $svpairs{$id}{1}{vcfline},"\n" + }elsif ($svtype eq 'DEL' && $svpairs{$id}{1}{span} && $svpairs{$id}{1}{span} > 9999) { + print DELFUS $svpairs{$id}{1}{fusionline},"\n"; + } +} diff --git a/variants/svcalling.sh b/variants/svcalling.sh index 043a86c1c6259acdc507b00f0bd88eddf74ea6c8..e3f682cc252237e0aa2d1999c162aad407430771 100755 --- a/variants/svcalling.sh +++ b/variants/svcalling.sh @@ -125,7 +125,7 @@ then zgrep '#CHROM' ${pair_id}.svaba.sv.vcf.gz > ${pair_id}.svaba.genefusion.txt zcat ${pair_id}.svaba.sv.vcf.gz | $SNPEFF_HOME/scripts/vcfEffOnePerLine.pl |java -jar $SNPEFF_HOME/SnpSift.jar extractFields - CHROM POS ALT ID ANN[*].EFFECT ANN[*].GENE ANN[*].BIOTYPE FILTER FORMAT GEN[*] |grep -E 'gene_fusion|feature_fusion' | sort -u >> ${pair_id}.sgf.txt mv ${pair_id}.svaba.vcf.gz ${pair_id}.svaba.ori.vcf.gz - perl $baseDir/filter_svaba.pl -t $tid -p $pair_id -i ${pair_id}.svaba.ori.vcf.gz -v + perl $baseDir/filter_svaba.pl -t $tid -p $pair_id -i ${pair_id}.svaba.ori.vcf.gz -s ${pair_id}.svaba.sv.vcf.gz bgzip ${pair_id}.svaba.vcf cat ${pair_id}.svaba.potentialfusion.txt ${pair_id}.sgf.txt | sort -u >> ${pair_id}.svaba.genefusion.txt fi