From d58e95778683686ce981f6d1c0e923fe7faeda3a Mon Sep 17 00:00:00 2001
From: Brandi Cantarel <brandi.cantarel@utsouthwestern.edu>
Date: Mon, 6 Nov 2017 08:01:54 -0600
Subject: [PATCH] adding union code

---
 variants/union.sh     | 31 ++------------
 variants/unionvcf.pl  | 95 +++++++++++++++++++++++++++++++++++++++++++
 variants/vcfsorter.pl | 90 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 189 insertions(+), 27 deletions(-)
 create mode 100755 variants/unionvcf.pl
 create mode 100755 variants/vcfsorter.pl

diff --git a/variants/union.sh b/variants/union.sh
index ea44757..0b82944 100644
--- a/variants/union.sh
+++ b/variants/union.sh
@@ -5,7 +5,7 @@ usage() {
   echo "-h Help documentation for gatkrunner.sh"
   echo "-r  --Reference Genome: GRCh38 or GRCm38"
   echo "-p  --Prefix for output file name"
-  echo "Example: bash hisat.sh -p prefix -r /path/GRCh38"
+  echo "Example: bash union.sh -p prefix -r /path/GRCh38"
   exit 1
 }
 OPTIND=1 # Reset OPTIND
@@ -20,7 +20,7 @@ done
 function join_by { local IFS="$1"; shift; echo "$*"; }
 shift $(($OPTIND -1))
 baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
-module load gatk/3.7 python/2.7.x-anaconda bedtools/2.26.0 snpeff/4.3q samtools/1.6 vcftools/0.1.14
+module load bedtools/2.26.0 samtools/1.6
 
 HS=*.hotspot.vcf.gz
 list1=`ls *vcf.gz|grep -v hotspot`
@@ -31,36 +31,13 @@ for i in *.vcf.gz; do
     EXT="${i#*.}"
     CALL="${EXT%%.*}"
     calllist="$calllist $CALL"
-    tabix $i
     if [[ $i == $HS ]]
     then
 	bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a $i -b stdin |bgzip > hotspot.nooverlap.vcf.gz
-	tabix hotspot.nooverlap.vcf.gz
 	list2="$list2 hotspot.nooverlap.vcf.gz"
-	varlist="$varlist --variant:$CALL hotspot.nooverlap.vcf.gz"
-    else 
-	varlist="$varlist --variant:$CALL $i"
     fi
 done 
 
-bedtools multiinter -i $list2 -names $calllist | cut -f 1,2,3,5 | bedtools sort -i stdin | bedtools merge -c 4 -o distinct >  ${pair_id}_integrate.bed
+perl $baseDir/unionvcf.pl $list2
+perl $baseDir/vcfsorter.pl ${index_path}/genome.dict int.vcf |bgzip > ${pair_id}.union.vcf.gz
 
-priority='ssvar'
-if [[ *.platypus.vcf.gz ]]
-then
-    priority="$priority,platypus"
-fi
-priority="$priority,sam,gatk"
-if [[ -f *.hotspot.vcf.gz ]]
-then
-    priority="$priority,hotspot"
-fi
-
-java -Xmx32g -jar $GATK_JAR -R ${index_path}/genome.fa -T CombineVariants --filteredrecordsmergetype KEEP_UNCONDITIONAL $varlist -genotypeMergeOptions PRIORITIZE -priority $priority -o ${pair_id}.int.vcf
-
-perl $baseDir/uniform_integrated_vcf.pl ${pair_id}.int.vcf
-bgzip ${pair_id}_integrate.bed
-tabix ${pair_id}_integrate.bed.gz
-bgzip ${pair_id}.uniform.vcf
-tabix ${pair_id}.uniform.vcf.gz
-bcftools annotate -a ${pair_id}_integrate.bed.gz --columns CHROM,FROM,TO,CallSet -h ${index_path}/CallSet.header ${pair_id}.uniform.vcf.gz | bgzip > ${pair_id}.union.vcf.gz
diff --git a/variants/unionvcf.pl b/variants/unionvcf.pl
new file mode 100755
index 0000000..f04e195
--- /dev/null
+++ b/variants/unionvcf.pl
@@ -0,0 +1,95 @@
+#!/usr/bin/perl 
+#migrate_db.pl
+
+my $headerfile = shift @ARGV;
+my @vcffiles = @ARGV;
+my $outfile = $vcf;
+$outfile =~ s/vcf.gz/uniform.vcf/;
+open HEADER, "<$headerfile" or die $!;
+open OUT, ">union.vcf" or die $!;
+while (my $line = <HEADER>) {
+    print OUT $line;
+}
+close HEADER;
+my %headerlines;
+foreach $vcf (@vcffiles) {
+    $caller = (split(/\./,$vcf))[1];
+    open VCF, "gunzip -c $vcf|" or die $!;
+    while (my $line = <VCF>) {
+	chomp($line);
+	if ($line =~ m/#/) {
+	    $headerlines{$line} = 1;
+	    next;
+	}
+	my ($chrom, $pos,$id,$ref,$alt,$score,
+	    $filter,$annot,$format,@gts) = split(/\t/, $line);
+	my %hash = ();
+	foreach $a (split(/;/,$annot)) {
+	    my ($key,$val) = split(/=/,$a);
+	    $hash{$key} = $val;
+	}
+	my @deschead = split(/:/,$format);
+	my $newformat = 'GT:DP:AD:AO:RO';
+	my @newgts = ();
+	my $missingGT = 0;
+      FG:foreach my $allele_info (@gts) {
+	  my @gtinfo = split(/:/,$allele_info);
+	  my %gtdata;
+	  if ($allele_info eq '.') {
+	      push @newgts, '.:.:.:.:.';
+	      $missingGT ++;
+	      next FG;
+	  }
+	  foreach my $i (0..$#deschead) {
+	      $gtdata{$deschead[$i]} = $gtinfo[$i];
+	  }
+	  if ($gtdata{DP} == 0 || $gtdata{GT} eq './.') {
+	      push @newgts, '.:.:.:.:.';
+	      $missingGT ++;
+	      next FG;
+	  }
+	  if ($gtdata{AD}){
+	      ($gtdata{RO},@alts) = split(/,/,$gtdata{AD});
+	      $gtdata{AO} = join(",",@alts);
+	      $gtdata{DP} = $gtdata{RO};
+	      foreach (@alts) {
+		  $gtdata{DP} += $_;
+	      }
+	  } elsif (exists $gtdata{NR} && exists $gtdata{NV}) {
+	      $gtdata{DP} = $gtdata{NR}; 	
+	      $gtdata{AO} = $gtdata{NV};
+	      $gtdata{RO} = $gtdata{DP} - $gtdata{AO};
+	  } elsif (exists $gtdata{AO} && exists $gtdata{RO}) {
+	      $gtdata{AD} = join(',',$gtdata{RO},$gtdata{AO});
+	      $gtdata{DP} = $gtdata{RO};
+	      foreach (split(',',$gtdata{AO})) {
+		  $gtdata{DP} += $_;
+	      }
+	  }
+	  if ($gtdata{DP} && $gtdata{DP} < 3) {
+	      $missingGT ++;
+	  }
+	  push @newgts, join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO});
+      }
+	next if ($missingGT == scalar(@gts));
+	$lines{$chrom}{$pos}{$caller} = join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n";
+    }
+    close VCF;
+}
+my @callers = ('ssvar','platypus','sam','gatk','hotspot');
+ F1:foreach $chr (sort {$a cmp $b} keys %lines) {
+   F2:foreach $pos (sort {$a <=> $b} keys %{$lines{$chr}}) {
+       my $callset = join(",",keys %{$lines{$chr}{$pos}});
+     F3:foreach $caller (@callers) {
+	 if ($lines{$chr}{$pos}{$caller}) {
+	     my ($chrom, $pos,$id,$ref,$alt,$score,$filter,$annot,
+		 $format,@gts) = split(/\t/,$lines{$chr}{$pos}{$caller});
+	     $annot = $annot.";CallSet=".$callset;
+	     print OUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,
+			    $filter,$annot,$format,@gts),"\n";
+	     last F3;
+	 }
+     }
+   }
+}
+close OUT;
diff --git a/variants/vcfsorter.pl b/variants/vcfsorter.pl
new file mode 100755
index 0000000..517aead
--- /dev/null
+++ b/variants/vcfsorter.pl
@@ -0,0 +1,90 @@
+#!/usr/bin/perl
+use strict;
+
+######################################################
+# vcfsorter.pl
+#
+# Copyright (C) 2011 German Gaston Leparc
+#
+# sorts VCF by reference genome
+#
+# usage:
+#
+# vcfsorter.pl genome.dict myvcf.file > mynewvcf.file
+#
+######################################################
+
+my $usage = <<EOF;
+sorts VCF by reference genome
+
+usage:
+
+vcfsorter.pl genome.dict myvcf > mynewvcf.file 2>STDERR
+EOF
+
+
+my $dict_file = @ARGV[0];
+my $vcf_file = @ARGV[1];
+
+die "\nERROR: missing an argument!\n\n$usage" if (@ARGV < 2);
+
+
+#---------------------------------------- LOAD IN FASTA DICT INTO MEMORY
+open(DICT,$dict_file) or die "Can't open $dict_file!\n";
+my @contig_order;
+my $c=0;
+while(<DICT>)
+{
+if($_=~ /\@SQ/)
+	{
+	my ($contig) = $_ =~ /SN:(\S+)/;
+	$contig_order[$c]=$contig;
+	++$c; 
+	#print $contig,"\n";
+	}
+}
+close(DICT);
+
+#---------------------------------------- PARSE VCF FILE & OUTPUT SORTED VCF
+
+open(VCF,$vcf_file) or die "Can't open $vcf_file!\n";
+
+my %vcf_hash;
+my $header;
+
+while(<VCF>)
+{
+if($_=~/^#/){ $header .= $_; } # store header and comment fields
+chomp($_);
+
+my @data = split(/\t/,$_);
+my $contig = $data[0];
+my $start = $data[1];
+my $variant = $data[4]."to".$data[5];
+my $line = $_;
+
+#print $contig,":",$start,"\n";
+
+$vcf_hash{$contig}{$start}{$variant}=$line;
+
+}
+close(VCF);
+
+#------------------ print out the VCF in the order of the reference genome
+
+#print standard VCF header
+print $header;
+
+foreach my $contig (@contig_order) # sort by contig order
+	{
+	#print $contig,"\n";
+	foreach my $start (sort {$a <=> $b} keys %{$vcf_hash{$contig}}) # sort numerically by coordinates
+		{
+		#print $start,"\n";
+		foreach my $variant (keys %{$vcf_hash{$contig}{$start}}) # if overlapping mutation, print each variant
+			{
+			print $vcf_hash{$contig}{$start}{$variant},"\n";
+			}	
+		}
+		
+	}
-- 
GitLab