#!/usr/bin/perl 
#migrate_db.pl

my $headerfile = shift @ARGV;
my @vcffiles = @ARGV;

open HEADER, "<$headerfile" or die $!;
open OUT, ">int.vcf" or die $!;
while (my $line = <HEADER>) {
  chomp($line);
  print OUT $line,"\n";;
}
close HEADER;
my @sampleorder;

my %headerlines;
foreach $vcf (@vcffiles) {
  $caller = (split(/\./,$vcf))[-3];
  open VCF, "gunzip -c $vcf|" or die $!;
  my @sampleids;
  while (my $line = <VCF>) {
    chomp($line);
    if ($line =~ m/#/) {
      if ($line =~ m/#CHROM/) {
	($chromhd, $posd,$idhd,$refhd,$althd,$scorehd,
	 $filterhd,$annothd,$formathd,@sampleids) = split(/\t/, $line);
	foreach $j (0..$#sampleids) {
	  $sampleids[$j] = (split(/\./,$sampleids[$j]))[0];
	}
	unless (@sampleorder) {
	  @sampleorder = @sampleids;
	  print OUT join("\t",$chromhd, $posd,$idhd,$refhd,$althd,$scorehd,
			 $filterhd,$annothd,$formathd,@sampleids),"\n";
	}
	next;
      }
      next;
    }
    my ($chrom, $pos,$id,$ref,$alt,$score,
	$filter,$annot,$format,@gts) = split(/\t/, $line);
    my %hash = ();
    foreach $a (split(/;/,$annot)) {
      my ($key,$val) = split(/=/,$a);
      $hash{$key} = $val;
    }
    #next if (($hash{FS} && $hash{FS} > 60) || $filter =~ m/strandBias/i);
    my @deschead = split(/:/,$format);
    my $newformat = 'GT:DP:AD:AO:RO';
    my %newgts;
    my %afinfo;
    my %gtfilt;
    my $missingGT = 0;
  FG:foreach my $i (0..$#gts) {
      my $allele_info = $gts[$i];
      my $sid = $sampleids[$i];
      my @gtinfo = split(/:/,$allele_info);
      my %gtdata;
      if ($allele_info eq '.') {
	$newgts{$sid} = '.:.:.:.:.';
	$missingGT ++;
	next FG;
      }
      foreach my $i (0..$#deschead) {
	$gtdata{$deschead[$i]} = $gtinfo[$i];
      }
      if ($gtdata{GT} eq './.') {
	$newgts{$sid} = '.:.:.:.:.';
	$missingGT ++;
	next FG;
      }
      if ($gtdata{FT} && $gtdata{FT} =~ m/HighSNVSB/) {
	  $gtfilt{'StrandBias'} = 1;
      }
      if ($gtdata{DP4}) { #varscan uses this
	my ($ref_fwd,$ref_rev,$alt_fwd,$alt_rev) = split(',',$gtdata{DP4});
	$gtdata{AO} = $alt_fwd+$alt_rev;
	$gtdata{RO} = $ref_fwd+$ref_rev;
	$gtdata{DP} = $ref_fwd+$ref_rev+$alt_fwd+$alt_rev;
	$gtdata{AD} = join(",",$gtdata{RO},$gtdata{AO});
      }elsif ($gtdata{AD} && $gtdata{AD} =~ m/,/){
	($gtdata{RO},@alts) = split(/,/,$gtdata{AD});
	$gtdata{AO} = join(",",@alts);
	$gtdata{DP} = $gtdata{RO};
	foreach (@alts) {
	  $gtdata{DP} += $_;
	}
      } elsif (exists $gtdata{NR} && exists $gtdata{NV}) { #platypus uses this
	$gtdata{DP} = (split(/,/,$gtdata{NR}))[0]; 	
	$gtdata{AO} = $gtdata{NV};
	$gtdata{RO} = $gtdata{DP};	
	foreach $altct (split(/,/,$gtdata{NV})) {
	  $gtdata{RO} -= $altct;
	}
	$gtdata{AD} = join(",",$gtdata{RO},$gtdata{AO})
      } elsif (exists $gtdata{AO} && exists $gtdata{RO}) {
	$gtdata{AD} = join(',',$gtdata{RO},$gtdata{AO});
	$gtdata{DP} = $gtdata{RO};
	foreach (split(',',$gtdata{AO})) {
	  $gtdata{DP} += $_;
	}
      }
      my $mafs = '.';
      my @maf = ();
      if ($gtdata{DP} && $gtdata{DP} ne '.' && exists $gtdata{AO} && $gtdata{AO} ne '.') {
	foreach $areadct (split(/,/,$gtdata{AO})) {
	  push @maf, sprintf("%.2f",$areadct/$gtdata{DP});
	}
	$mafs = join(",",@maf);
      }
      if (exists $gtdata{DP} && $gtdata{DP} < 20) {
	$missingGT ++;
      }elsif (exists $gtdata{AO} && $gtdata{AO} < 3) {
	$missingGT ++;
      }
      $afinfo{$sid} = join(":",$gtdata{DP},$mafs);
      $newgts{$sid} =  join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO});
    }
    next if ($missingGT == scalar(@gts));
    my @gtdesc;
    my @newgts;
    foreach $id (@sampleorder) {
      push @gtdesc, join(":",$id,$afinfo{$id});
      push @newgts, $newgts{$id};
    }
    my @filts = split(";",$filter);
    my %filterqc = map {$_ => 1} @filts;
    delete $filterqc{'.'};
    if ($gtfilt{'StrandBias'} || ($hash{FS} && $hash{FS} > 60) || 
	($hash{SAP} && $hash{SAP} > 20)) {
	$filterqc{strandBias} = 1;
    }if (scalar(keys %filterqc) > 1) {
	$filter = join(";",keys %filterqc);
    }else {
	$filter = '.';
    }
    $lines{$chrom}{$pos}{$alt}{$caller} = [$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,\@newgts,\@gtdesc];
  }
  close VCF;
}
my @callers = ('ssvar','sam','gatk','strelka2','platypus','hotspot');
if (grep(/mutect/,@vcffiles)) {
  @callers = ('sssom','mutect','shimmer','strelka2','varscan','virmid');
}
F1:foreach $chr (sort {$a cmp $b} keys %lines) {
 F2:foreach $pos (sort {$a <=> $b} keys %{$lines{$chr}}) {
  F4:foreach $alt (sort {$a <=> $b} keys %{$lines{$chr}{$pos}}) {
      my @callset;
      my %csets;
    F3:foreach $caller (sort {$a cmp $b} keys %{$lines{$chr}{$pos}{$alt}}) {
	my ($chrom, $pos,$id,$ref,$alt,$score,$filter,$annot,
	    $format,$gtsref,$gtdescref) = @{$lines{$chr}{$pos}{$alt}{$caller}};
	@gtdesc = @{$gtdescref};
	foreach $gtd (@gtdesc) {
	  my ($id,$dp,$maf) = split(/:/,$gtd);
	  push @{$csets{$id}}, [$caller,$dp,$maf];
	}
	push @callset, join("|",$caller,$alt,@gtdesc);
      }
      my $consistent = 1;
      foreach $id (keys %csets) {
	my @calls = @{$csets{$id}};
	my @calls = sort {$a[2] <=> $b[2]} @calls;
	$consistent = 0 if ($calls[0][2] < 0.25 && $calls[-1][2] - $calls[0][2] > 0.10 && $calls[-1][2]/($calls[0][2]+0.001) > 3);
      }
    F3:foreach $caller (@callers) {
	if ($lines{$chr}{$pos}{$alt}{$caller}) {
	  my ($chrom, $pos,$id,$ref,$alt,$score,$filter,$annot,
	      $format,$gtsref,$gtdescref) = @{$lines{$chr}{$pos}{$alt}{$caller}};
	  @gts = @{$gtsref};
	  @gtdesc = @{$gtdescref};
	  $annot = $annot.";CallSet=".join(",",@callset);
	  unless ($consistent) {
	    $annot = $annot.";CallSetInconsistent=1";
	  }
	  print OUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,
			 $filter,$annot,$format,@gts),"\n";
	  last F3;
	}
      }
    }
  }
}
close OUT;