From d58e95778683686ce981f6d1c0e923fe7faeda3a Mon Sep 17 00:00:00 2001 From: Brandi Cantarel <brandi.cantarel@utsouthwestern.edu> Date: Mon, 6 Nov 2017 08:01:54 -0600 Subject: [PATCH] adding union code --- variants/union.sh | 31 ++------------ variants/unionvcf.pl | 95 +++++++++++++++++++++++++++++++++++++++++++ variants/vcfsorter.pl | 90 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 189 insertions(+), 27 deletions(-) create mode 100755 variants/unionvcf.pl create mode 100755 variants/vcfsorter.pl diff --git a/variants/union.sh b/variants/union.sh index ea44757..0b82944 100644 --- a/variants/union.sh +++ b/variants/union.sh @@ -5,7 +5,7 @@ usage() { echo "-h Help documentation for gatkrunner.sh" echo "-r --Reference Genome: GRCh38 or GRCm38" echo "-p --Prefix for output file name" - echo "Example: bash hisat.sh -p prefix -r /path/GRCh38" + echo "Example: bash union.sh -p prefix -r /path/GRCh38" exit 1 } OPTIND=1 # Reset OPTIND @@ -20,7 +20,7 @@ done function join_by { local IFS="$1"; shift; echo "$*"; } shift $(($OPTIND -1)) baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -module load gatk/3.7 python/2.7.x-anaconda bedtools/2.26.0 snpeff/4.3q samtools/1.6 vcftools/0.1.14 +module load bedtools/2.26.0 samtools/1.6 HS=*.hotspot.vcf.gz list1=`ls *vcf.gz|grep -v hotspot` @@ -31,36 +31,13 @@ for i in *.vcf.gz; do EXT="${i#*.}" CALL="${EXT%%.*}" calllist="$calllist $CALL" - tabix $i if [[ $i == $HS ]] then bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a $i -b stdin |bgzip > hotspot.nooverlap.vcf.gz - tabix hotspot.nooverlap.vcf.gz list2="$list2 hotspot.nooverlap.vcf.gz" - varlist="$varlist --variant:$CALL hotspot.nooverlap.vcf.gz" - else - varlist="$varlist --variant:$CALL $i" fi done -bedtools multiinter -i $list2 -names $calllist | cut -f 1,2,3,5 | bedtools sort -i stdin | bedtools merge -c 4 -o distinct > ${pair_id}_integrate.bed +perl $baseDir/unionvcf.pl $list2 +perl $baseDir/vcfsorter.pl ${index_path}/genome.dict int.vcf |bgzip > ${pair_id}.union.vcf.gz -priority='ssvar' -if [[ *.platypus.vcf.gz ]] -then - priority="$priority,platypus" -fi -priority="$priority,sam,gatk" -if [[ -f *.hotspot.vcf.gz ]] -then - priority="$priority,hotspot" -fi - -java -Xmx32g -jar $GATK_JAR -R ${index_path}/genome.fa -T CombineVariants --filteredrecordsmergetype KEEP_UNCONDITIONAL $varlist -genotypeMergeOptions PRIORITIZE -priority $priority -o ${pair_id}.int.vcf - -perl $baseDir/uniform_integrated_vcf.pl ${pair_id}.int.vcf -bgzip ${pair_id}_integrate.bed -tabix ${pair_id}_integrate.bed.gz -bgzip ${pair_id}.uniform.vcf -tabix ${pair_id}.uniform.vcf.gz -bcftools annotate -a ${pair_id}_integrate.bed.gz --columns CHROM,FROM,TO,CallSet -h ${index_path}/CallSet.header ${pair_id}.uniform.vcf.gz | bgzip > ${pair_id}.union.vcf.gz diff --git a/variants/unionvcf.pl b/variants/unionvcf.pl new file mode 100755 index 0000000..f04e195 --- /dev/null +++ b/variants/unionvcf.pl @@ -0,0 +1,95 @@ +#!/usr/bin/perl +#migrate_db.pl + +my $headerfile = shift @ARGV; +my @vcffiles = @ARGV; +my $outfile = $vcf; +$outfile =~ s/vcf.gz/uniform.vcf/; +open HEADER, "<$headerfile" or die $!; +open OUT, ">union.vcf" or die $!; +while (my $line = <HEADER>) { + print OUT $line; +} +close HEADER; +my %headerlines; +foreach $vcf (@vcffiles) { + $caller = (split(/\./,$vcf))[1]; + open VCF, "gunzip -c $vcf|" or die $!; + while (my $line = <VCF>) { + chomp($line); + if ($line =~ m/#/) { + $headerlines{$line} = 1; + next; + } + my ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$annot,$format,@gts) = split(/\t/, $line); + my %hash = (); + foreach $a (split(/;/,$annot)) { + my ($key,$val) = split(/=/,$a); + $hash{$key} = $val; + } + my @deschead = split(/:/,$format); + my $newformat = 'GT:DP:AD:AO:RO'; + my @newgts = (); + my $missingGT = 0; + FG:foreach my $allele_info (@gts) { + my @gtinfo = split(/:/,$allele_info); + my %gtdata; + if ($allele_info eq '.') { + push @newgts, '.:.:.:.:.'; + $missingGT ++; + next FG; + } + foreach my $i (0..$#deschead) { + $gtdata{$deschead[$i]} = $gtinfo[$i]; + } + if ($gtdata{DP} == 0 || $gtdata{GT} eq './.') { + push @newgts, '.:.:.:.:.'; + $missingGT ++; + next FG; + } + if ($gtdata{AD}){ + ($gtdata{RO},@alts) = split(/,/,$gtdata{AD}); + $gtdata{AO} = join(",",@alts); + $gtdata{DP} = $gtdata{RO}; + foreach (@alts) { + $gtdata{DP} += $_; + } + } elsif (exists $gtdata{NR} && exists $gtdata{NV}) { + $gtdata{DP} = $gtdata{NR}; + $gtdata{AO} = $gtdata{NV}; + $gtdata{RO} = $gtdata{DP} - $gtdata{AO}; + } elsif (exists $gtdata{AO} && exists $gtdata{RO}) { + $gtdata{AD} = join(',',$gtdata{RO},$gtdata{AO}); + $gtdata{DP} = $gtdata{RO}; + foreach (split(',',$gtdata{AO})) { + $gtdata{DP} += $_; + } + } + if ($gtdata{DP} && $gtdata{DP} < 3) { + $missingGT ++; + } + push @newgts, join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO}); + } + next if ($missingGT == scalar(@gts)); + $lines{$chrom}{$pos}{$caller} = join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; + } + close VCF; +} +my @callers = ('ssvar','platypus','sam','gatk','hotspot'); + F1:foreach $chr (sort {$a cmp $b} keys %lines) { + F2:foreach $pos (sort {$a <=> $b} keys %{$lines{$chr}}) { + my $callset = join(",",keys %{$lines{$chr}{$pos}}); + F3:foreach $caller (@callers) { + if ($lines{$chr}{$pos}{$caller}) { + my ($chrom, $pos,$id,$ref,$alt,$score,$filter,$annot, + $format,@gts) = split(/\t/,$lines{$chr}{$pos}{$caller}); + $annot = $annot.";CallSet=".$callset; + print OUT join("\t",$chrom,$pos,$id,$ref,$alt,$score, + $filter,$annot,$format,@gts),"\n"; + last F3; + } + } + } +} +close OUT; diff --git a/variants/vcfsorter.pl b/variants/vcfsorter.pl new file mode 100755 index 0000000..517aead --- /dev/null +++ b/variants/vcfsorter.pl @@ -0,0 +1,90 @@ +#!/usr/bin/perl +use strict; + +###################################################### +# vcfsorter.pl +# +# Copyright (C) 2011 German Gaston Leparc +# +# sorts VCF by reference genome +# +# usage: +# +# vcfsorter.pl genome.dict myvcf.file > mynewvcf.file +# +###################################################### + +my $usage = <<EOF; +sorts VCF by reference genome + +usage: + +vcfsorter.pl genome.dict myvcf > mynewvcf.file 2>STDERR +EOF + + +my $dict_file = @ARGV[0]; +my $vcf_file = @ARGV[1]; + +die "\nERROR: missing an argument!\n\n$usage" if (@ARGV < 2); + + +#---------------------------------------- LOAD IN FASTA DICT INTO MEMORY +open(DICT,$dict_file) or die "Can't open $dict_file!\n"; +my @contig_order; +my $c=0; +while(<DICT>) +{ +if($_=~ /\@SQ/) + { + my ($contig) = $_ =~ /SN:(\S+)/; + $contig_order[$c]=$contig; + ++$c; + #print $contig,"\n"; + } +} +close(DICT); + +#---------------------------------------- PARSE VCF FILE & OUTPUT SORTED VCF + +open(VCF,$vcf_file) or die "Can't open $vcf_file!\n"; + +my %vcf_hash; +my $header; + +while(<VCF>) +{ +if($_=~/^#/){ $header .= $_; } # store header and comment fields +chomp($_); + +my @data = split(/\t/,$_); +my $contig = $data[0]; +my $start = $data[1]; +my $variant = $data[4]."to".$data[5]; +my $line = $_; + +#print $contig,":",$start,"\n"; + +$vcf_hash{$contig}{$start}{$variant}=$line; + +} +close(VCF); + +#------------------ print out the VCF in the order of the reference genome + +#print standard VCF header +print $header; + +foreach my $contig (@contig_order) # sort by contig order + { + #print $contig,"\n"; + foreach my $start (sort {$a <=> $b} keys %{$vcf_hash{$contig}}) # sort numerically by coordinates + { + #print $start,"\n"; + foreach my $variant (keys %{$vcf_hash{$contig}{$start}}) # if overlapping mutation, print each variant + { + print $vcf_hash{$contig}{$start}{$variant},"\n"; + } + } + + } -- GitLab