From a575d98a31758891353dba3bff0ca0dfafe5e3fa Mon Sep 17 00:00:00 2001 From: Brandi Cantarel <brandi.cantarel@utsouthwestern.edu> Date: Mon, 21 May 2018 16:17:44 -0500 Subject: [PATCH] update AF callers --- obselete/combine_variants.union.sh | 66 ------------------------------ variants/union.sh | 8 ++-- variants/unionvcf.pl | 46 ++++++++++++++++++--- 3 files changed, 46 insertions(+), 74 deletions(-) delete mode 100644 obselete/combine_variants.union.sh diff --git a/obselete/combine_variants.union.sh b/obselete/combine_variants.union.sh deleted file mode 100644 index ea44757..0000000 --- a/obselete/combine_variants.union.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash -#union.sh - -usage() { - echo "-h Help documentation for gatkrunner.sh" - echo "-r --Reference Genome: GRCh38 or GRCm38" - echo "-p --Prefix for output file name" - echo "Example: bash hisat.sh -p prefix -r /path/GRCh38" - exit 1 -} -OPTIND=1 # Reset OPTIND -while getopts :r:p:h opt -do - case $opt in - r) index_path=$OPTARG;; - p) pair_id=$OPTARG;; - h) usage;; - esac -done -function join_by { local IFS="$1"; shift; echo "$*"; } -shift $(($OPTIND -1)) -baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -module load gatk/3.7 python/2.7.x-anaconda bedtools/2.26.0 snpeff/4.3q samtools/1.6 vcftools/0.1.14 - -HS=*.hotspot.vcf.gz -list1=`ls *vcf.gz|grep -v hotspot` -list2=`ls *vcf.gz|grep -v hotspot` -varlist='' -calllist='' -for i in *.vcf.gz; do - EXT="${i#*.}" - CALL="${EXT%%.*}" - calllist="$calllist $CALL" - tabix $i - if [[ $i == $HS ]] - then - bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a $i -b stdin |bgzip > hotspot.nooverlap.vcf.gz - tabix hotspot.nooverlap.vcf.gz - list2="$list2 hotspot.nooverlap.vcf.gz" - varlist="$varlist --variant:$CALL hotspot.nooverlap.vcf.gz" - else - varlist="$varlist --variant:$CALL $i" - fi -done - -bedtools multiinter -i $list2 -names $calllist | cut -f 1,2,3,5 | bedtools sort -i stdin | bedtools merge -c 4 -o distinct > ${pair_id}_integrate.bed - -priority='ssvar' -if [[ *.platypus.vcf.gz ]] -then - priority="$priority,platypus" -fi -priority="$priority,sam,gatk" -if [[ -f *.hotspot.vcf.gz ]] -then - priority="$priority,hotspot" -fi - -java -Xmx32g -jar $GATK_JAR -R ${index_path}/genome.fa -T CombineVariants --filteredrecordsmergetype KEEP_UNCONDITIONAL $varlist -genotypeMergeOptions PRIORITIZE -priority $priority -o ${pair_id}.int.vcf - -perl $baseDir/uniform_integrated_vcf.pl ${pair_id}.int.vcf -bgzip ${pair_id}_integrate.bed -tabix ${pair_id}_integrate.bed.gz -bgzip ${pair_id}.uniform.vcf -tabix ${pair_id}.uniform.vcf.gz -bcftools annotate -a ${pair_id}_integrate.bed.gz --columns CHROM,FROM,TO,CallSet -h ${index_path}/CallSet.header ${pair_id}.uniform.vcf.gz | bgzip > ${pair_id}.union.vcf.gz diff --git a/variants/union.sh b/variants/union.sh index 09bae15..204b550 100644 --- a/variants/union.sh +++ b/variants/union.sh @@ -22,7 +22,7 @@ shift $(($OPTIND -1)) baseDir="`dirname \"$0\"`" source /etc/profile.d/modules.sh -module load bedtools/2.26.0 samtools/1.6 +module load bedtools/2.26.0 samtools/1.6 bcftools/1.6 snpeff/4.3q HS=*.hotspot.vcf.gz list1=`ls *vcf.gz|grep -v hotspot` @@ -32,11 +32,13 @@ calllist='' for i in *.vcf.gz; do if [[ $i == $HS ]] then - bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a $i -b stdin |bgzip > nooverlap.hotspot.vcf.gz + bcftools norm -m - -O z -o hotspot.norm.vcf.gz $i + java -jar /cm/shared/apps/snpeff/4.3q/SnpSift.jar filter "(GEN[*].AD[1] > 3)" hotspot.norm.vcf.gz |bgzip > hotspot.lowfilt.vcf.gz + bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a hotspot.lowfilt.vcf.gz -b stdin |bgzip > nooverlap.hotspot.vcf.gz list2="$list2 nooverlap.hotspot.vcf.gz" fi done -#echo "$baseDir/unionvcf.pl ${index_path}/union.header.vcf $list2" + perl $baseDir/unionvcf.pl ${index_path}/union.header.vcf $list2 perl $baseDir/vcfsorter.pl ${index_path}/genome.dict int.vcf |bgzip > ${pair_id}.union.vcf.gz diff --git a/variants/unionvcf.pl b/variants/unionvcf.pl index 04b5572..6390dfa 100755 --- a/variants/unionvcf.pl +++ b/variants/unionvcf.pl @@ -47,6 +47,7 @@ foreach $vcf (@vcffiles) { my @deschead = split(/:/,$format); my $newformat = 'GT:DP:AD:AO:RO'; my %newgts; + my %afinfo; my $missingGT = 0; FG:foreach my $i (0..$#gts) { my $allele_info = $gts[$i]; @@ -94,17 +95,30 @@ foreach $vcf (@vcffiles) { $gtdata{DP} += $_; } } - if ($gtdata{DP} && $gtdata{DP} < 3) { + my $mafs = '.'; + my @maf = (); + if ($gtdata{DP} && $gtdata{DP} ne '.' && $gtdata{AO} && $gtdata{AO} ne '.') { + foreach $areadct (split(/,/,$gtdata{AO})) { + push @maf, sprintf("%.2f",$areadct/$gtdata{DP}); + } + $mafs = join(",",@maf); + } + if (exists $gtdata{DP} && $gtdata{DP} < 20) { + $missingGT ++; + }elsif (exists $gtdata{AO} && $gtdata{AO} < 3) { $missingGT ++; } + $afinfo{$sid} = join(":",$gtdata{DP},$mafs); $newgts{$sid} = join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO}); } next if ($missingGT == scalar(@gts)); + my @gtdesc; my @newgts; foreach $id (@sampleorder) { + push @gtdesc, join(":",$id,$afinfo{$id}); push @newgts, $newgts{$id}; } - $lines{$chrom}{$pos}{$caller} = join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; + $lines{$chrom}{$pos}{$caller} = [$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,\@newgts,\@gtdesc]; } close VCF; } @@ -114,12 +128,34 @@ if (grep(/mutect/,@vcffiles)) { } F1:foreach $chr (sort {$a cmp $b} keys %lines) { F2:foreach $pos (sort {$a <=> $b} keys %{$lines{$chr}}) { - my $callset = join(",",keys %{$lines{$chr}{$pos}}); + my @callset; + my %csets; + F3:foreach $caller (sort {$a cmp $b} keys %{$lines{$chr}{$pos}}) { + my ($chrom, $pos,$id,$ref,$alt,$score,$filter,$annot, + $format,$gtsref,$gtdescref) = @{$lines{$chr}{$pos}{$caller}}; + @gtdesc = @{$gtdescref}; + foreach $gtd (@gtdesc) { + my ($id,$dp,$maf) = split(/:/,$gtd); + push @{$csets{$id}}, [$caller,$dp,$maf]; + } + push @callset, join("/",$caller,$alt,@gtdesc); + } + my $consistent = 1; + foreach $id (keys %csets) { + my @calls = @{$csets{$id}}; + my @calls = sort {$a[2] <=> $b[2]} @calls; + $consistent = 0 if ($calls[0][2] < 0.25 && $calls[-1][2] - $calls[0][2] > 0.10 && $calls[-1][2]/($calls[0][2]+0.001) > 3); + } F3:foreach $caller (@callers) { if ($lines{$chr}{$pos}{$caller}) { my ($chrom, $pos,$id,$ref,$alt,$score,$filter,$annot, - $format,@gts) = split(/\t/,$lines{$chr}{$pos}{$caller}); - $annot = $annot.";CallSet=".$callset; + $format,$gtsref,$gtdescref) = @{$lines{$chr}{$pos}{$caller}}; + @gts = @{$gtsref}; + @gtdesc = @{$gtdescref}; + $annot = $annot.";CallSet=".join("|",@callset); + unless ($consistent) { + $annot = $annot.";CallSetInconsistent=1"; + } print OUT join("\t",$chrom,$pos,$id,$ref,$alt,$score, $filter,$annot,$format,@gts),"\n"; last F3; -- GitLab