From a575d98a31758891353dba3bff0ca0dfafe5e3fa Mon Sep 17 00:00:00 2001
From: Brandi Cantarel <brandi.cantarel@utsouthwestern.edu>
Date: Mon, 21 May 2018 16:17:44 -0500
Subject: [PATCH] update AF callers

---
 obselete/combine_variants.union.sh | 66 ------------------------------
 variants/union.sh                  |  8 ++--
 variants/unionvcf.pl               | 46 ++++++++++++++++++---
 3 files changed, 46 insertions(+), 74 deletions(-)
 delete mode 100644 obselete/combine_variants.union.sh

diff --git a/obselete/combine_variants.union.sh b/obselete/combine_variants.union.sh
deleted file mode 100644
index ea44757..0000000
--- a/obselete/combine_variants.union.sh
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/bin/bash
-#union.sh
-
-usage() {
-  echo "-h Help documentation for gatkrunner.sh"
-  echo "-r  --Reference Genome: GRCh38 or GRCm38"
-  echo "-p  --Prefix for output file name"
-  echo "Example: bash hisat.sh -p prefix -r /path/GRCh38"
-  exit 1
-}
-OPTIND=1 # Reset OPTIND
-while getopts :r:p:h opt
-do
-    case $opt in
-        r) index_path=$OPTARG;;
-        p) pair_id=$OPTARG;;
-        h) usage;;
-    esac
-done
-function join_by { local IFS="$1"; shift; echo "$*"; }
-shift $(($OPTIND -1))
-baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-module load gatk/3.7 python/2.7.x-anaconda bedtools/2.26.0 snpeff/4.3q samtools/1.6 vcftools/0.1.14
-
-HS=*.hotspot.vcf.gz
-list1=`ls *vcf.gz|grep -v hotspot`
-list2=`ls *vcf.gz|grep -v hotspot`
-varlist=''
-calllist=''
-for i in *.vcf.gz; do
-    EXT="${i#*.}"
-    CALL="${EXT%%.*}"
-    calllist="$calllist $CALL"
-    tabix $i
-    if [[ $i == $HS ]]
-    then
-	bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a $i -b stdin |bgzip > hotspot.nooverlap.vcf.gz
-	tabix hotspot.nooverlap.vcf.gz
-	list2="$list2 hotspot.nooverlap.vcf.gz"
-	varlist="$varlist --variant:$CALL hotspot.nooverlap.vcf.gz"
-    else 
-	varlist="$varlist --variant:$CALL $i"
-    fi
-done 
-
-bedtools multiinter -i $list2 -names $calllist | cut -f 1,2,3,5 | bedtools sort -i stdin | bedtools merge -c 4 -o distinct >  ${pair_id}_integrate.bed
-
-priority='ssvar'
-if [[ *.platypus.vcf.gz ]]
-then
-    priority="$priority,platypus"
-fi
-priority="$priority,sam,gatk"
-if [[ -f *.hotspot.vcf.gz ]]
-then
-    priority="$priority,hotspot"
-fi
-
-java -Xmx32g -jar $GATK_JAR -R ${index_path}/genome.fa -T CombineVariants --filteredrecordsmergetype KEEP_UNCONDITIONAL $varlist -genotypeMergeOptions PRIORITIZE -priority $priority -o ${pair_id}.int.vcf
-
-perl $baseDir/uniform_integrated_vcf.pl ${pair_id}.int.vcf
-bgzip ${pair_id}_integrate.bed
-tabix ${pair_id}_integrate.bed.gz
-bgzip ${pair_id}.uniform.vcf
-tabix ${pair_id}.uniform.vcf.gz
-bcftools annotate -a ${pair_id}_integrate.bed.gz --columns CHROM,FROM,TO,CallSet -h ${index_path}/CallSet.header ${pair_id}.uniform.vcf.gz | bgzip > ${pair_id}.union.vcf.gz
diff --git a/variants/union.sh b/variants/union.sh
index 09bae15..204b550 100644
--- a/variants/union.sh
+++ b/variants/union.sh
@@ -22,7 +22,7 @@ shift $(($OPTIND -1))
 baseDir="`dirname \"$0\"`"
 
 source /etc/profile.d/modules.sh
-module load bedtools/2.26.0 samtools/1.6
+module load bedtools/2.26.0 samtools/1.6 bcftools/1.6 snpeff/4.3q
 
 HS=*.hotspot.vcf.gz
 list1=`ls *vcf.gz|grep -v hotspot`
@@ -32,11 +32,13 @@ calllist=''
 for i in *.vcf.gz; do
     if [[ $i == $HS ]]
     then
-	bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a $i -b stdin |bgzip > nooverlap.hotspot.vcf.gz
+	bcftools norm -m - -O z -o hotspot.norm.vcf.gz $i
+	java -jar /cm/shared/apps/snpeff/4.3q/SnpSift.jar filter "(GEN[*].AD[1] > 3)" hotspot.norm.vcf.gz |bgzip > hotspot.lowfilt.vcf.gz
+	bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a hotspot.lowfilt.vcf.gz -b stdin |bgzip > nooverlap.hotspot.vcf.gz
 	list2="$list2 nooverlap.hotspot.vcf.gz"
     fi
 done 
-#echo "$baseDir/unionvcf.pl ${index_path}/union.header.vcf $list2"
+
 perl $baseDir/unionvcf.pl ${index_path}/union.header.vcf $list2
 perl $baseDir/vcfsorter.pl ${index_path}/genome.dict int.vcf |bgzip > ${pair_id}.union.vcf.gz
 
diff --git a/variants/unionvcf.pl b/variants/unionvcf.pl
index 04b5572..6390dfa 100755
--- a/variants/unionvcf.pl
+++ b/variants/unionvcf.pl
@@ -47,6 +47,7 @@ foreach $vcf (@vcffiles) {
     my @deschead = split(/:/,$format);
     my $newformat = 'GT:DP:AD:AO:RO';
     my %newgts;
+    my %afinfo;
     my $missingGT = 0;
   FG:foreach my $i (0..$#gts) {
       my $allele_info = $gts[$i];
@@ -94,17 +95,30 @@ foreach $vcf (@vcffiles) {
 	  $gtdata{DP} += $_;
 	}
       }
-      if ($gtdata{DP} && $gtdata{DP} < 3) {
+      my $mafs = '.';
+      my @maf = ();
+      if ($gtdata{DP} && $gtdata{DP} ne '.' && $gtdata{AO} && $gtdata{AO} ne '.') {
+	foreach $areadct (split(/,/,$gtdata{AO})) {
+	  push @maf, sprintf("%.2f",$areadct/$gtdata{DP});
+	}
+	$mafs = join(",",@maf);
+      }
+      if (exists $gtdata{DP} && $gtdata{DP} < 20) {
+	$missingGT ++;
+      }elsif (exists $gtdata{AO} && $gtdata{AO} < 3) {
 	$missingGT ++;
       }
+      $afinfo{$sid} = join(":",$gtdata{DP},$mafs);
       $newgts{$sid} =  join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO});
     }
     next if ($missingGT == scalar(@gts));
+    my @gtdesc;
     my @newgts;
     foreach $id (@sampleorder) {
+      push @gtdesc, join(":",$id,$afinfo{$id});
       push @newgts, $newgts{$id};
     }
-    $lines{$chrom}{$pos}{$caller} = join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n";
+    $lines{$chrom}{$pos}{$caller} = [$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,\@newgts,\@gtdesc];
   }
   close VCF;
 }
@@ -114,12 +128,34 @@ if (grep(/mutect/,@vcffiles)) {
 }
 F1:foreach $chr (sort {$a cmp $b} keys %lines) {
  F2:foreach $pos (sort {$a <=> $b} keys %{$lines{$chr}}) {
-    my $callset = join(",",keys %{$lines{$chr}{$pos}});
+    my @callset;
+    my %csets;
+  F3:foreach $caller (sort {$a cmp $b} keys %{$lines{$chr}{$pos}}) {
+      my ($chrom, $pos,$id,$ref,$alt,$score,$filter,$annot,
+	  $format,$gtsref,$gtdescref) = @{$lines{$chr}{$pos}{$caller}};
+      @gtdesc = @{$gtdescref};
+      foreach $gtd (@gtdesc) {
+	my ($id,$dp,$maf) = split(/:/,$gtd);
+	push @{$csets{$id}}, [$caller,$dp,$maf];
+      }
+      push @callset, join("/",$caller,$alt,@gtdesc);
+    }
+    my $consistent = 1;
+    foreach $id (keys %csets) {
+      my @calls = @{$csets{$id}};
+      my @calls = sort {$a[2] <=> $b[2]} @calls;
+      $consistent = 0 if ($calls[0][2] < 0.25 && $calls[-1][2] - $calls[0][2] > 0.10 && $calls[-1][2]/($calls[0][2]+0.001) > 3);
+    }
   F3:foreach $caller (@callers) {
       if ($lines{$chr}{$pos}{$caller}) {
 	my ($chrom, $pos,$id,$ref,$alt,$score,$filter,$annot,
-	    $format,@gts) = split(/\t/,$lines{$chr}{$pos}{$caller});
-	$annot = $annot.";CallSet=".$callset;
+	    $format,$gtsref,$gtdescref) = @{$lines{$chr}{$pos}{$caller}};
+	@gts = @{$gtsref};
+	@gtdesc = @{$gtdescref};
+	$annot = $annot.";CallSet=".join("|",@callset);
+	unless ($consistent) {
+	  $annot = $annot.";CallSetInconsistent=1";
+	}
 	print OUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,
 		       $filter,$annot,$format,@gts),"\n";
 	last F3;
-- 
GitLab