diff --git a/variants/filter_delly.pl b/variants/filter_delly.pl new file mode 100755 index 0000000000000000000000000000000000000000..4d6cffc8668139cd42f27204794eac9e7b2c9239 --- /dev/null +++ b/variants/filter_delly.pl @@ -0,0 +1,111 @@ +#!/usr/bin/perl -w +#integrate_datasets.pl + +#module load vcftools/0.1.14 samtools/1.6 bedtools/2.26.0 +use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); +my %opt = (); +my $results = GetOptions (\%opt,'in|i=s','pid|p=s','tumor|t=s'); + +open VCFOUT, ">$opt{pid}\.delly.vcf" or die $!; +open DELFUS, ">$opt{pid}\.delly.potentialfusion.txt" or die $!; + +open IN, "gunzip -c $opt{in} |" or die $!; + +W1:while (my $line = <IN>) { + chomp($line); + if ($line =~ m/^#/) { + if ($line =~ m/^#CHROM/) { + print OUT "##INFO=<ID=AF,Number=A,Type=Integer,Description=\"Alternate allele observation frequency\">\n"; + my @header = split(/\t/,$line); + ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$info,$format,@gtheader) = split(/\t/, $line); + unless ($opt{tumor}) { + if (grep(/T_DNA/,@gtheader)) { + my @tsamps = grep(/T_DNA/,@gtheader); + $opt{tumor} = $tsamps[0]; + }else { + $opt{tumor} = $gtheader[0]; + } + } + } + print VCFOUT $line,"\n"; + } + my ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$annot,$format,@gts) = split(/\t/, $line); + next if ($ref =~ m/\./ || $alt =~ m/\./ || $alt=~ m/,X/); + my %hash = (); + foreach $a (split(/;/,$annot)) { + my ($key,$val) = split(/=/,$a); + $hash{$key} = $val unless ($hash{$key}); + } + next unless ($hash{ANN}); + next unless ($hash{ANN} =~ m/HIGH|MODERATE|LOW/); + my %gtinfo = (); + my @deschead = split(/:/,$format); + F1:foreach my $k (0..$#gtheader) { + my $subjid = $gtheader[$k]; + my $allele_info = $gts[$k]; + my @ainfo = split(/:/, $allele_info); + my @mutallfreq = (); + foreach my $k (0..$#ainfo) { + $gtinfo{$subjid}{$deschead[$k]} = $ainfo[$k]; + } + $gtinfo{$subjid}{DP} = (split(/,/,$gtinfo{$subjid}{DP}))[0] if ($gtinfo{$subjid}{DP}); + next F1 unless ($gtinfo{$subjid}{DP} && $gtinfo{$subjid}{DP} ne '.' && $gtinfo{$subjid}{DP} >= 1); + my @altct = split(/,/,$gtinfo{$subjid}{AD}); + my $refct = shift @altct; + @altct2 = split(/,/,$gtinfo{$subjid}{AO}); + if (scalar(@altct) ne scalar(@altct2)) { + warn "Inconsistent Allele counts @ $chrom,$pos,$alt,$gtinfo{$subjid}{AD},$gtinfo{$subjid}{AO}\n"; + } + my $total = $refct; + foreach my $act (@altct) { + next if ($act eq '.'); + $total += $act; + push @mutallfreq, sprintf("%.4f",$act/$gtinfo{$subjid}{DP}); + } + $gtinfo{$subjid}{MAF} = \@mutallfreq; + } + next unless ($gtinfo{$opt{tumor}}{DP} && $gtinfo{$opt{tumor}}{DP} ne '.' && $gtinfo{$opt{tumor}}{DP} >= 20); + unless ($gtinfo{$opt{tumor}}{AO} =~ m/\d+/ && $gtinfo{$opt{tumor}}{AD} =~ m/,/) { + warn "Missing Alt:$line\n"; + } + @tumormaf = @{$gtinfo{$opt{tumor}}{MAF}}; + @tumoraltct = split(/,/,$gtinfo{$opt{tumor}}{AO}); + next if ($tumoraltct[0] eq '.'); + $hash{AF} = join(",",@tumormaf); + next if ($tumoraltct[0] < 20); + next if ($tumormaf[0] < 0.01); + my $keepforvcf = 0; + my $keeptrx; + F1:foreach $trx (split(/,/,$hash{ANN})) { + my ($allele,$effect,$impact,$gene,$geneid,$feature, + $featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna, + $cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$trx); + next unless ($impact =~ m/HIGH|MODERATE/ || $effect =~ /splice/i); + next if($effect eq 'sequence_feature'); + $keeptrx = $trx; + $keepforvcf = $gene; + last F1; + } + next unless $keepforvcf; + my @nannot; + foreach $info (sort {$a cmp $b} keys %hash) { + if (defined $hash{$info}) { + push @nannot, $info."=".$hash{$info}; + }else { + push @nannot, $info; + } + } + my $newannot = join(";",@nannot); + if ($hash{SVTYPE} eq 'INS' || ($hash{SVTYPE} eq 'DEL' && $keepforvcf !~ m/&/)) { + print VCFOUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$newannot, + $format,@gts),"\n"; + }elsif ($hash{END} && $hash{END} - $pos > 9999) { + my ($allele,$effect,$impact,$gene,$geneid,$feature, + $featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna, + $cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$keeptrx); + print DELFUS join("\t",$chrom,$pos,$hash{CHR2},$hash{END},$effect,$gene,$biotype,$filter,$format,@gts),"\n"; + } +} +close IN; diff --git a/variants/filter_svaba.pl b/variants/filter_svaba.pl new file mode 100755 index 0000000000000000000000000000000000000000..f34ce1cf25d9dd4a6b204c5b01923e6e8a0249c1 --- /dev/null +++ b/variants/filter_svaba.pl @@ -0,0 +1,120 @@ +#!/usr/bin/perl -w +#integrate_datasets.pl + +#module load vcftools/0.1.14 samtools/1.6 bedtools/2.26.0 +use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); +my %opt = (); +my $results = GetOptions (\%opt,'in|i=s','pid|p=s','tumor|t=s'); + +open VCFOUT, ">$opt{pid}\.svaba.vcf" or die $!; +open DELFUS, ">$opt{pid}\.svaba.potentialfusion.txt" or die $!; + +open IN, "gunzip -c $opt{in} |" or die $!; + +W1:while (my $line = <IN>) { + chomp($line); + if ($line =~ m/^#/) { + if ($line =~ m/^#CHROM/) { + print OUT "##INFO=<ID=AF,Number=A,Type=Integer,Description=\"Alternate allele observation frequency\">\n"; + my @header = split(/\t/,$line); + ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$info,$format,@gtheader) = split(/\t/, $line); + unless ($opt{tumor}) { + if (grep(/T_DNA/,@gtheader)) { + my @tsamps = grep(/T_DNA/,@gtheader); + $opt{tumor} = $tsamps[0]; + }else { + $opt{tumor} = $gtheader[0]; + } + } + } + print VCFOUT $line,"\n"; + } + my ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$annot,$format,@gts) = split(/\t/, $line); + next if ($ref =~ m/\./ || $alt =~ m/\./ || $alt=~ m/,X/); + my %hash = (); + foreach $a (split(/;/,$annot)) { + my ($key,$val) = split(/=/,$a); + $hash{$key} = $val unless ($hash{$key}); + } + if (length($alt) > length($ref)) { + $hash{SVTYPE} = "INS"; + $hash{END} = $pos; + }elsif (length($alt) < length($ref)) { + my $diff = substr($ref, length($alt)); + $hash{SVTYPE} = "DEL"; + $hash{END} = $pos + length($diff); + } + next unless ($hash{ANN}); + next unless ($hash{ANN} =~ m/HIGH|MODERATE|LOW/); + my %gtinfo = (); + my @deschead = split(/:/,$format); + F1:foreach my $k (0..$#gtheader) { + my $subjid = $gtheader[$k]; + my $allele_info = $gts[$k]; + my @ainfo = split(/:/, $allele_info); + my @mutallfreq = (); + foreach my $k (0..$#ainfo) { + $gtinfo{$subjid}{$deschead[$k]} = $ainfo[$k]; + } + $gtinfo{$subjid}{DP} = (split(/,/,$gtinfo{$subjid}{DP}))[0] if ($gtinfo{$subjid}{DP}); + next F1 unless ($gtinfo{$subjid}{DP} && $gtinfo{$subjid}{DP} ne '.' && $gtinfo{$subjid}{DP} >= 1); + my @altct = split(/,/,$gtinfo{$subjid}{AD}); + my $refct = shift @altct; + @altct2 = split(/,/,$gtinfo{$subjid}{AO}); + if (scalar(@altct) ne scalar(@altct2)) { + warn "Inconsistent Allele counts @ $chrom,$pos,$alt,$gtinfo{$subjid}{AD},$gtinfo{$subjid}{AO}\n"; + } + my $total = $refct; + foreach my $act (@altct) { + next if ($act eq '.'); + $total += $act; + push @mutallfreq, sprintf("%.4f",$act/$gtinfo{$subjid}{DP}); + } + $gtinfo{$subjid}{MAF} = \@mutallfreq; + } + next unless ($gtinfo{$opt{tumor}}{DP} && $gtinfo{$opt{tumor}}{DP} ne '.' && $gtinfo{$opt{tumor}}{DP} >= 20); + unless ($gtinfo{$opt{tumor}}{AO} =~ m/\d+/ && $gtinfo{$opt{tumor}}{AD} =~ m/,/) { + warn "Missing Alt:$line\n"; + } + @tumormaf = @{$gtinfo{$opt{tumor}}{MAF}}; + @tumoraltct = split(/,/,$gtinfo{$opt{tumor}}{AO}); + next if ($tumoraltct[0] eq '.'); + $hash{AF} = join(",",@tumormaf); + next if ($tumoraltct[0] < 20); + next if ($tumormaf[0] < 0.01); + my $keepforvcf = 0; + my $keeptrx; + F1:foreach $trx (split(/,/,$hash{ANN})) { + my ($allele,$effect,$impact,$gene,$geneid,$feature, + $featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna, + $cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$trx); + next unless ($impact =~ m/HIGH|MODERATE/ || $effect =~ /splice/i); + next if($effect eq 'sequence_feature'); + $keeptrx = $trx; + $keepforvcf = $gene; + last F1; + } + next unless $keepforvcf; + my @nannot; + foreach $info (sort {$a cmp $b} keys %hash) { + if (defined $hash{$info}) { + push @nannot, $info."=".$hash{$info}; + }else { + push @nannot, $info; + } + } + + my $newannot = join(";",@nannot); + if ($hash{SVTYPE} eq 'INS' || ($hash{SVTYPE} eq 'DEL' && $keepforvcf !~ m/&/)) { + print VCFOUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$newannot, + $format,@gts),"\n"; + }elsif ($hash{SPAN} && $hash{SPAN} > 9999) { + my ($allele,$effect,$impact,$gene,$geneid,$feature, + $featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna, + $cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$keeptrx); + print DELFUS join("\t",$chrom,$pos,$hash{CHR2},$hash{END},$effect,$gene,$biotype,$filter,$format,@gts),"\n"; + } +} +close IN; diff --git a/variants/svcalling.sh b/variants/svcalling.sh index 7ed1734f1c7a8002376332319e55f67a454234cd..7ec4c1c1529aef2a38a63f604e76673fb25c5973 100755 --- a/variants/svcalling.sh +++ b/variants/svcalling.sh @@ -77,6 +77,7 @@ then delly2 call -t INS -o ${pair_id}.delly_insertion.bcf -q 30 -g ${reffa} ${sbam} ${normal} #delly2 filter -o ${pair_id}.delly_tra.bcf -f somatic -s samples.tsv ${pair_id}.delly_translocations.bcf else + $tid= #RUN DELLY delly2 call -t BND -o ${pair_id}.delly_translocations.bcf -q 30 -g ${reffa} ${sbam} delly2 call -t DUP -o ${pair_id}.delly_duplications.bcf -q 30 -g ${reffa} ${sbam} @@ -94,6 +95,10 @@ then then zgrep '#CHROM' ${pair_id}.delly.vcf.gz > ${pair_id}.delly.genefusion.txt zcat ${pair_id}.delly.vcf.gz | $SNPEFF_HOME/scripts/vcfEffOnePerLine.pl |java -jar $SNPEFF_HOME/SnpSift.jar extractFields - CHROM POS CHR2 END ANN[*].EFFECT ANN[*].GENE ANN[*].BIOTYPE FILTER FORMAT GEN[*] |grep -E 'gene_fusion|feature_fusion' | sort -u >> ${pair_id}.delly.genefusion.txt + mv ${pair_id}.delly.vcf.gz ${pair_id}.delly.ori.vcf.gz + bash $baseDir/filter_delly.pl -t $tid -p $pair_id -i ${pair_id}.delly.ori.vcf.gz + bgzip ${pair_id}.delly.vcf + cat ${pair_id}.potentialfusion.txt >> ${pair_id}.delly.genefusion.txt fi elif [[ $method == 'svaba' ]] then @@ -116,6 +121,10 @@ then then zgrep '#CHROM' ${pair_id}.svaba.sv.vcf.gz > ${pair_id}.svaba.genefusion.txt zcat ${pair_id}.svaba.sv.vcf.gz | $SNPEFF_HOME/scripts/vcfEffOnePerLine.pl |java -jar $SNPEFF_HOME/SnpSift.jar extractFields - CHROM POS ALT ID ANN[*].EFFECT ANN[*].GENE ANN[*].BIOTYPE FILTER FORMAT GEN[*] |grep -E 'gene_fusion|feature_fusion' | sort -u >> ${pair_id}.svaba.genefusion.txt + mv ${pair_id}.svaba.vcf.gz ${pair_id}.svaba.ori.vcf.gz + bash $baseDir/filter_svaba.pl -t $tid -p $pair_id -i ${pair_id}.svaba.ori.vcf.gz + bgzip ${pair_id}.svaba.vcf + cat ${pair_id}.potentialfusion.txt >> ${pair_id}.svaba.genefusion.txt fi elif [[ $method == 'lumpy' ]] then