Merge branch 'master' of git.biohpc.swmed.edu:BICF/Astrocyte/process_scripts

8d9c724f · Brandi Cantarel · 2d64c494 · 66010ebb · 8d9c724f · 8d9c724f
Commit 8d9c724f authored 5 years ago by Brandi Cantarel
--- a/variants/filter_delly.pl
+++ b/variants/filter_delly.pl
+#!/usr/bin/perl -w
+#integrate_datasets.pl
+#module load vcftools/0.1.14 samtools/1.6 bedtools/2.26.0 
+use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+my %opt = ();
+my $results = GetOptions (\%opt,'in|i=s','pid|p=s','tumor|t=s');
+open VCFOUT, ">$opt{pid}\.delly.vcf" or die $!;
+open DELFUS, ">$opt{pid}\.delly.potentialfusion.txt" or die $!;
+open IN, "gunzip -c $opt{in} |" or die $!;
+W1:while (my $line = <IN>) {
+  chomp($line);
+  if ($line =~ m/^#/) {
+    if ($line =~ m/^#CHROM/) {
+      print OUT "##INFO=<ID=AF,Number=A,Type=Integer,Description=\"Alternate allele observation frequency\">\n";
+      my @header = split(/\t/,$line);
+      ($chrom, $pos,$id,$ref,$alt,$score,
+       $filter,$info,$format,@gtheader) = split(/\t/, $line);
+      unless ($opt{tumor}) {
+	if (grep(/T_DNA/,@gtheader)) {
+	  my @tsamps = grep(/T_DNA/,@gtheader);
+	  $opt{tumor} = $tsamps[0];
+	}else {
+	  $opt{tumor} = $gtheader[0];
+	}
+      }
+    }
+    print VCFOUT $line,"\n";
+  }
+  my ($chrom, $pos,$id,$ref,$alt,$score,
+      $filter,$annot,$format,@gts) = split(/\t/, $line);
+  next if ($ref =~ m/\./ || $alt =~ m/\./ || $alt=~ m/,X/);
+  my %hash = ();
+  foreach $a (split(/;/,$annot)) {
+    my ($key,$val) = split(/=/,$a);
+    $hash{$key} = $val unless ($hash{$key});
+  }
+  next unless ($hash{ANN});
+  next unless ($hash{ANN} =~ m/HIGH|MODERATE|LOW/);
+  my %gtinfo = ();
+  my @deschead = split(/:/,$format);
+ F1:foreach my $k (0..$#gtheader) {
+    my $subjid = $gtheader[$k];
+    my $allele_info = $gts[$k];
+    my @ainfo = split(/:/, $allele_info);
+    my @mutallfreq = ();
+    foreach my $k (0..$#ainfo) {
+      $gtinfo{$subjid}{$deschead[$k]} = $ainfo[$k];
+    }
+    $gtinfo{$subjid}{DP} = (split(/,/,$gtinfo{$subjid}{DP}))[0] if ($gtinfo{$subjid}{DP});
+    next F1 unless ($gtinfo{$subjid}{DP} && $gtinfo{$subjid}{DP} ne '.' && $gtinfo{$subjid}{DP} >= 1);
+    my @altct = split(/,/,$gtinfo{$subjid}{AD});
+    my $refct = shift @altct;
+    @altct2 = split(/,/,$gtinfo{$subjid}{AO});
+    if (scalar(@altct) ne scalar(@altct2)) {
+      warn "Inconsistent Allele counts @ $chrom,$pos,$alt,$gtinfo{$subjid}{AD},$gtinfo{$subjid}{AO}\n";
+    }
+    my $total = $refct;
+    foreach  my $act (@altct) {
+      next if ($act eq '.');
+      $total += $act;
+      push @mutallfreq, sprintf("%.4f",$act/$gtinfo{$subjid}{DP});
+    }
+    $gtinfo{$subjid}{MAF} = \@mutallfreq;
+  }
+  next unless ($gtinfo{$opt{tumor}}{DP} && $gtinfo{$opt{tumor}}{DP} ne '.' && $gtinfo{$opt{tumor}}{DP} >= 20);
+  unless ($gtinfo{$opt{tumor}}{AO} =~ m/\d+/ && $gtinfo{$opt{tumor}}{AD} =~ m/,/) {
+    warn "Missing Alt:$line\n";
+  }
+  @tumormaf = @{$gtinfo{$opt{tumor}}{MAF}};
+  @tumoraltct = split(/,/,$gtinfo{$opt{tumor}}{AO});
+  next if ($tumoraltct[0] eq '.');
+  $hash{AF} = join(",",@tumormaf);
+  next if ($tumoraltct[0] < 20);
+  next if ($tumormaf[0] < 0.01);
+  my $keepforvcf = 0;
+  my $keeptrx;
+ F1:foreach $trx (split(/,/,$hash{ANN})) {
+    my ($allele,$effect,$impact,$gene,$geneid,$feature,
+	$featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna,
+	$cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$trx);
+    next unless ($impact =~ m/HIGH|MODERATE/ || $effect =~ /splice/i);
+    next if($effect eq 'sequence_feature');
+    $keeptrx = $trx;
+    $keepforvcf = $gene;
+    last F1;
+  }
+  next unless $keepforvcf;
+  my @nannot;
+  foreach $info (sort {$a cmp $b} keys %hash) {
+    if (defined $hash{$info}) {
+      push @nannot, $info."=".$hash{$info};
+    }else {
+      push @nannot, $info;
+    }
+  }
+  my $newannot = join(";",@nannot);
+  if ($hash{SVTYPE} eq 'INS' || ($hash{SVTYPE} eq 'DEL' && $keepforvcf !~ m/&/)) {
+    print VCFOUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$newannot,
+		      $format,@gts),"\n";
+  }elsif ($hash{END} && $hash{END} - $pos > 9999) {
+    my ($allele,$effect,$impact,$gene,$geneid,$feature,
+	$featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna,
+	$cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$keeptrx);
+    print DELFUS join("\t",$chrom,$pos,$hash{CHR2},$hash{END},$effect,$gene,$biotype,$filter,$format,@gts),"\n";
+  }
+}
+close IN;
--- a/variants/filter_svaba.pl
+++ b/variants/filter_svaba.pl
+#!/usr/bin/perl -w
+#integrate_datasets.pl
+#module load vcftools/0.1.14 samtools/1.6 bedtools/2.26.0 
+use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+my %opt = ();
+my $results = GetOptions (\%opt,'in|i=s','pid|p=s','tumor|t=s');
+open VCFOUT, ">$opt{pid}\.svaba.vcf" or die $!;
+open DELFUS, ">$opt{pid}\.svaba.potentialfusion.txt" or die $!;
+open IN, "gunzip -c $opt{in} |" or die $!;
+W1:while (my $line = <IN>) {
+  chomp($line);
+  if ($line =~ m/^#/) {
+    if ($line =~ m/^#CHROM/) {
+      print OUT "##INFO=<ID=AF,Number=A,Type=Integer,Description=\"Alternate allele observation frequency\">\n";
+      my @header = split(/\t/,$line);
+      ($chrom, $pos,$id,$ref,$alt,$score,
+       $filter,$info,$format,@gtheader) = split(/\t/, $line);
+      unless ($opt{tumor}) {
+	if (grep(/T_DNA/,@gtheader)) {
+	  my @tsamps = grep(/T_DNA/,@gtheader);
+	  $opt{tumor} = $tsamps[0];
+	}else {
+	  $opt{tumor} = $gtheader[0];
+	}
+      }
+    }
+    print VCFOUT $line,"\n";
+  }
+  my ($chrom, $pos,$id,$ref,$alt,$score,
+      $filter,$annot,$format,@gts) = split(/\t/, $line);
+  next if ($ref =~ m/\./ || $alt =~ m/\./ || $alt=~ m/,X/);
+  my %hash = ();
+  foreach $a (split(/;/,$annot)) {
+    my ($key,$val) = split(/=/,$a);
+    $hash{$key} = $val unless ($hash{$key});
+  }
+  if (length($alt) > length($ref)) {
+      $hash{SVTYPE} = "INS";
+      $hash{END} = $pos;
+  }elsif (length($alt) < length($ref)) {
+      my $diff = substr($ref, length($alt));
+      $hash{SVTYPE} = "DEL";
+      $hash{END} = $pos + length($diff);
+  }
+  next unless ($hash{ANN});
+  next unless ($hash{ANN} =~ m/HIGH|MODERATE|LOW/);
+  my %gtinfo = ();
+  my @deschead = split(/:/,$format);
+ F1:foreach my $k (0..$#gtheader) {
+    my $subjid = $gtheader[$k];
+    my $allele_info = $gts[$k];
+    my @ainfo = split(/:/, $allele_info);
+    my @mutallfreq = ();
+    foreach my $k (0..$#ainfo) {
+      $gtinfo{$subjid}{$deschead[$k]} = $ainfo[$k];
+    }
+    $gtinfo{$subjid}{DP} = (split(/,/,$gtinfo{$subjid}{DP}))[0] if ($gtinfo{$subjid}{DP});
+    next F1 unless ($gtinfo{$subjid}{DP} && $gtinfo{$subjid}{DP} ne '.' && $gtinfo{$subjid}{DP} >= 1);
+    my @altct = split(/,/,$gtinfo{$subjid}{AD});
+    my $refct = shift @altct;
+    @altct2 = split(/,/,$gtinfo{$subjid}{AO});
+    if (scalar(@altct) ne scalar(@altct2)) {
+      warn "Inconsistent Allele counts @ $chrom,$pos,$alt,$gtinfo{$subjid}{AD},$gtinfo{$subjid}{AO}\n";
+    }
+    my $total = $refct;
+    foreach  my $act (@altct) {
+      next if ($act eq '.');
+      $total += $act;
+      push @mutallfreq, sprintf("%.4f",$act/$gtinfo{$subjid}{DP});
+    }
+    $gtinfo{$subjid}{MAF} = \@mutallfreq;
+  }
+  next unless ($gtinfo{$opt{tumor}}{DP} && $gtinfo{$opt{tumor}}{DP} ne '.' && $gtinfo{$opt{tumor}}{DP} >= 20);
+  unless ($gtinfo{$opt{tumor}}{AO} =~ m/\d+/ && $gtinfo{$opt{tumor}}{AD} =~ m/,/) {
+    warn "Missing Alt:$line\n";
+  }
+  @tumormaf = @{$gtinfo{$opt{tumor}}{MAF}};
+  @tumoraltct = split(/,/,$gtinfo{$opt{tumor}}{AO});
+  next if ($tumoraltct[0] eq '.');
+  $hash{AF} = join(",",@tumormaf);
+  next if ($tumoraltct[0] < 20);
+  next if ($tumormaf[0] < 0.01);
+  my $keepforvcf = 0;
+  my $keeptrx;
+ F1:foreach $trx (split(/,/,$hash{ANN})) {
+    my ($allele,$effect,$impact,$gene,$geneid,$feature,
+	$featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna,
+	$cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$trx);
+    next unless ($impact =~ m/HIGH|MODERATE/ || $effect =~ /splice/i);
+    next if($effect eq 'sequence_feature');
+    $keeptrx = $trx;
+    $keepforvcf = $gene;
+    last F1;
+  }
+  next unless $keepforvcf;
+  my @nannot;
+  foreach $info (sort {$a cmp $b} keys %hash) {
+    if (defined $hash{$info}) {
+      push @nannot, $info."=".$hash{$info};
+    }else {
+      push @nannot, $info;
+    }
+  }
+  my $newannot = join(";",@nannot);
+  if ($hash{SVTYPE} eq 'INS' || ($hash{SVTYPE} eq 'DEL' && $keepforvcf !~ m/&/)) {
+    print VCFOUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$newannot,
+		      $format,@gts),"\n";
+  }elsif ($hash{SPAN} && $hash{SPAN} > 9999) {
+    my ($allele,$effect,$impact,$gene,$geneid,$feature,
+	$featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna,
+	$cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$keeptrx);
+    print DELFUS join("\t",$chrom,$pos,$hash{CHR2},$hash{END},$effect,$gene,$biotype,$filter,$format,@gts),"\n";
+  }
+}
+close IN;
--- a/variants/svcalling.sh
+++ b/variants/svcalling.sh
@@ -77,6 +77,7 @@ then
 	delly2 call -t INS -o ${pair_id}.delly_insertion.bcf -q 30 -g ${reffa} ${sbam} ${normal}
 	#delly2 filter -o ${pair_id}.delly_tra.bcf -f somatic -s samples.tsv ${pair_id}.delly_translocations.bcf
    else
+	$tid=
 	#RUN DELLY
 	delly2 call -t BND -o ${pair_id}.delly_translocations.bcf -q 30 -g ${reffa} ${sbam}
 	delly2 call -t DUP -o ${pair_id}.delly_duplications.bcf -q 30 -g ${reffa} ${sbam}
@@ -94,6 +95,10 @@ then
    then
 	zgrep '#CHROM' ${pair_id}.delly.vcf.gz > ${pair_id}.delly.genefusion.txt
 	zcat ${pair_id}.delly.vcf.gz | $SNPEFF_HOME/scripts/vcfEffOnePerLine.pl |java -jar $SNPEFF_HOME/SnpSift.jar extractFields - CHROM POS CHR2 END ANN[*].EFFECT ANN[*].GENE ANN[*].BIOTYPE FILTER FORMAT GEN[*] |grep -E 'gene_fusion|feature_fusion' | sort -u >> ${pair_id}.delly.genefusion.txt
+	mv ${pair_id}.delly.vcf.gz ${pair_id}.delly.ori.vcf.gz
+	bash $baseDir/filter_delly.pl -t $tid -p $pair_id -i ${pair_id}.delly.ori.vcf.gz
+	bgzip ${pair_id}.delly.vcf
+	cat ${pair_id}.potentialfusion.txt >> ${pair_id}.delly.genefusion.txt
       fi
 elif [[ $method == 'svaba' ]]
 then
@@ -116,6 +121,10 @@ then
    then
 	zgrep '#CHROM' ${pair_id}.svaba.sv.vcf.gz > ${pair_id}.svaba.genefusion.txt
 	zcat ${pair_id}.svaba.sv.vcf.gz | $SNPEFF_HOME/scripts/vcfEffOnePerLine.pl |java -jar $SNPEFF_HOME/SnpSift.jar extractFields - CHROM POS ALT ID ANN[*].EFFECT ANN[*].GENE ANN[*].BIOTYPE FILTER FORMAT GEN[*] |grep -E 'gene_fusion|feature_fusion' | sort -u  >> ${pair_id}.svaba.genefusion.txt
+	mv ${pair_id}.svaba.vcf.gz ${pair_id}.svaba.ori.vcf.gz
+	bash $baseDir/filter_svaba.pl -t $tid -p $pair_id -i ${pair_id}.svaba.ori.vcf.gz
+	bgzip ${pair_id}.svaba.vcf
+	cat ${pair_id}.potentialfusion.txt >> ${pair_id}.svaba.genefusion.txt
       fi
 elif [[ $method == 'lumpy' ]]
 then