Skip to content
Snippets Groups Projects
Commit d58e9577 authored by Brandi Cantarel's avatar Brandi Cantarel
Browse files

adding union code

parent fff6d8b9
Branches
Tags
No related merge requests found
......@@ -5,7 +5,7 @@ usage() {
echo "-h Help documentation for gatkrunner.sh"
echo "-r --Reference Genome: GRCh38 or GRCm38"
echo "-p --Prefix for output file name"
echo "Example: bash hisat.sh -p prefix -r /path/GRCh38"
echo "Example: bash union.sh -p prefix -r /path/GRCh38"
exit 1
}
OPTIND=1 # Reset OPTIND
......@@ -20,7 +20,7 @@ done
function join_by { local IFS="$1"; shift; echo "$*"; }
shift $(($OPTIND -1))
baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
module load gatk/3.7 python/2.7.x-anaconda bedtools/2.26.0 snpeff/4.3q samtools/1.6 vcftools/0.1.14
module load bedtools/2.26.0 samtools/1.6
HS=*.hotspot.vcf.gz
list1=`ls *vcf.gz|grep -v hotspot`
......@@ -31,36 +31,13 @@ for i in *.vcf.gz; do
EXT="${i#*.}"
CALL="${EXT%%.*}"
calllist="$calllist $CALL"
tabix $i
if [[ $i == $HS ]]
then
bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a $i -b stdin |bgzip > hotspot.nooverlap.vcf.gz
tabix hotspot.nooverlap.vcf.gz
list2="$list2 hotspot.nooverlap.vcf.gz"
varlist="$varlist --variant:$CALL hotspot.nooverlap.vcf.gz"
else
varlist="$varlist --variant:$CALL $i"
fi
done
bedtools multiinter -i $list2 -names $calllist | cut -f 1,2,3,5 | bedtools sort -i stdin | bedtools merge -c 4 -o distinct > ${pair_id}_integrate.bed
perl $baseDir/unionvcf.pl $list2
perl $baseDir/vcfsorter.pl ${index_path}/genome.dict int.vcf |bgzip > ${pair_id}.union.vcf.gz
priority='ssvar'
if [[ *.platypus.vcf.gz ]]
then
priority="$priority,platypus"
fi
priority="$priority,sam,gatk"
if [[ -f *.hotspot.vcf.gz ]]
then
priority="$priority,hotspot"
fi
java -Xmx32g -jar $GATK_JAR -R ${index_path}/genome.fa -T CombineVariants --filteredrecordsmergetype KEEP_UNCONDITIONAL $varlist -genotypeMergeOptions PRIORITIZE -priority $priority -o ${pair_id}.int.vcf
perl $baseDir/uniform_integrated_vcf.pl ${pair_id}.int.vcf
bgzip ${pair_id}_integrate.bed
tabix ${pair_id}_integrate.bed.gz
bgzip ${pair_id}.uniform.vcf
tabix ${pair_id}.uniform.vcf.gz
bcftools annotate -a ${pair_id}_integrate.bed.gz --columns CHROM,FROM,TO,CallSet -h ${index_path}/CallSet.header ${pair_id}.uniform.vcf.gz | bgzip > ${pair_id}.union.vcf.gz
#!/usr/bin/perl
#migrate_db.pl
my $headerfile = shift @ARGV;
my @vcffiles = @ARGV;
my $outfile = $vcf;
$outfile =~ s/vcf.gz/uniform.vcf/;
open HEADER, "<$headerfile" or die $!;
open OUT, ">union.vcf" or die $!;
while (my $line = <HEADER>) {
print OUT $line;
}
close HEADER;
my %headerlines;
foreach $vcf (@vcffiles) {
$caller = (split(/\./,$vcf))[1];
open VCF, "gunzip -c $vcf|" or die $!;
while (my $line = <VCF>) {
chomp($line);
if ($line =~ m/#/) {
$headerlines{$line} = 1;
next;
}
my ($chrom, $pos,$id,$ref,$alt,$score,
$filter,$annot,$format,@gts) = split(/\t/, $line);
my %hash = ();
foreach $a (split(/;/,$annot)) {
my ($key,$val) = split(/=/,$a);
$hash{$key} = $val;
}
my @deschead = split(/:/,$format);
my $newformat = 'GT:DP:AD:AO:RO';
my @newgts = ();
my $missingGT = 0;
FG:foreach my $allele_info (@gts) {
my @gtinfo = split(/:/,$allele_info);
my %gtdata;
if ($allele_info eq '.') {
push @newgts, '.:.:.:.:.';
$missingGT ++;
next FG;
}
foreach my $i (0..$#deschead) {
$gtdata{$deschead[$i]} = $gtinfo[$i];
}
if ($gtdata{DP} == 0 || $gtdata{GT} eq './.') {
push @newgts, '.:.:.:.:.';
$missingGT ++;
next FG;
}
if ($gtdata{AD}){
($gtdata{RO},@alts) = split(/,/,$gtdata{AD});
$gtdata{AO} = join(",",@alts);
$gtdata{DP} = $gtdata{RO};
foreach (@alts) {
$gtdata{DP} += $_;
}
} elsif (exists $gtdata{NR} && exists $gtdata{NV}) {
$gtdata{DP} = $gtdata{NR};
$gtdata{AO} = $gtdata{NV};
$gtdata{RO} = $gtdata{DP} - $gtdata{AO};
} elsif (exists $gtdata{AO} && exists $gtdata{RO}) {
$gtdata{AD} = join(',',$gtdata{RO},$gtdata{AO});
$gtdata{DP} = $gtdata{RO};
foreach (split(',',$gtdata{AO})) {
$gtdata{DP} += $_;
}
}
if ($gtdata{DP} && $gtdata{DP} < 3) {
$missingGT ++;
}
push @newgts, join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO});
}
next if ($missingGT == scalar(@gts));
$lines{$chrom}{$pos}{$caller} = join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n";
}
close VCF;
}
my @callers = ('ssvar','platypus','sam','gatk','hotspot');
F1:foreach $chr (sort {$a cmp $b} keys %lines) {
F2:foreach $pos (sort {$a <=> $b} keys %{$lines{$chr}}) {
my $callset = join(",",keys %{$lines{$chr}{$pos}});
F3:foreach $caller (@callers) {
if ($lines{$chr}{$pos}{$caller}) {
my ($chrom, $pos,$id,$ref,$alt,$score,$filter,$annot,
$format,@gts) = split(/\t/,$lines{$chr}{$pos}{$caller});
$annot = $annot.";CallSet=".$callset;
print OUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,
$filter,$annot,$format,@gts),"\n";
last F3;
}
}
}
}
close OUT;
#!/usr/bin/perl
use strict;
######################################################
# vcfsorter.pl
#
# Copyright (C) 2011 German Gaston Leparc
#
# sorts VCF by reference genome
#
# usage:
#
# vcfsorter.pl genome.dict myvcf.file > mynewvcf.file
#
######################################################
my $usage = <<EOF;
sorts VCF by reference genome
usage:
vcfsorter.pl genome.dict myvcf > mynewvcf.file 2>STDERR
EOF
my $dict_file = @ARGV[0];
my $vcf_file = @ARGV[1];
die "\nERROR: missing an argument!\n\n$usage" if (@ARGV < 2);
#---------------------------------------- LOAD IN FASTA DICT INTO MEMORY
open(DICT,$dict_file) or die "Can't open $dict_file!\n";
my @contig_order;
my $c=0;
while(<DICT>)
{
if($_=~ /\@SQ/)
{
my ($contig) = $_ =~ /SN:(\S+)/;
$contig_order[$c]=$contig;
++$c;
#print $contig,"\n";
}
}
close(DICT);
#---------------------------------------- PARSE VCF FILE & OUTPUT SORTED VCF
open(VCF,$vcf_file) or die "Can't open $vcf_file!\n";
my %vcf_hash;
my $header;
while(<VCF>)
{
if($_=~/^#/){ $header .= $_; } # store header and comment fields
chomp($_);
my @data = split(/\t/,$_);
my $contig = $data[0];
my $start = $data[1];
my $variant = $data[4]."to".$data[5];
my $line = $_;
#print $contig,":",$start,"\n";
$vcf_hash{$contig}{$start}{$variant}=$line;
}
close(VCF);
#------------------ print out the VCF in the order of the reference genome
#print standard VCF header
print $header;
foreach my $contig (@contig_order) # sort by contig order
{
#print $contig,"\n";
foreach my $start (sort {$a <=> $b} keys %{$vcf_hash{$contig}}) # sort numerically by coordinates
{
#print $start,"\n";
foreach my $variant (keys %{$vcf_hash{$contig}{$start}}) # if overlapping mutation, print each variant
{
print $vcf_hash{$contig}{$start}{$variant},"\n";
}
}
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment