updating union

28fc369f · Brandi Cantarel · d58e9577 · 28fc369f · d58e9577 · 28fc369f
Commit 28fc369f authored 7 years ago by Brandi Cantarel
--- a/obselete/combine_variants.union.sh
+++ b/obselete/combine_variants.union.sh
+#!/bin/bash
+#union.sh
+
+usage() {
+  echo "-h Help documentation for gatkrunner.sh"
+  echo "-r  --Reference Genome: GRCh38 or GRCm38"
+  echo "-p  --Prefix for output file name"
+  echo "Example: bash hisat.sh -p prefix -r /path/GRCh38"
+  exit 1
+}
+OPTIND=1 # Reset OPTIND
+while getopts :r:p:h opt
+do
+    case $opt in
+        r) index_path=$OPTARG;;
+        p) pair_id=$OPTARG;;
+        h) usage;;
+    esac
+done
+function join_by { local IFS="$1"; shift; echo "$*"; }
+shift $(($OPTIND -1))
+baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+module load gatk/3.7 python/2.7.x-anaconda bedtools/2.26.0 snpeff/4.3q samtools/1.6 vcftools/0.1.14
+
+HS=*.hotspot.vcf.gz
+list1=`ls *vcf.gz|grep -v hotspot`
+list2=`ls *vcf.gz|grep -v hotspot`
+varlist=''
+calllist=''
+for i in *.vcf.gz; do
+    EXT="${i#*.}"
+    CALL="${EXT%%.*}"
+    calllist="$calllist $CALL"
+    tabix $i
+    if [[ $i == $HS ]]
+    then
+	bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a $i -b stdin |bgzip > hotspot.nooverlap.vcf.gz
+	tabix hotspot.nooverlap.vcf.gz
+	list2="$list2 hotspot.nooverlap.vcf.gz"
+	varlist="$varlist --variant:$CALL hotspot.nooverlap.vcf.gz"
+    else 
+	varlist="$varlist --variant:$CALL $i"
+    fi
+done 
+
+bedtools multiinter -i $list2 -names $calllist | cut -f 1,2,3,5 | bedtools sort -i stdin | bedtools merge -c 4 -o distinct >  ${pair_id}_integrate.bed
+
+priority='ssvar'
+if [[ *.platypus.vcf.gz ]]
+then
+    priority="$priority,platypus"
+fi
+priority="$priority,sam,gatk"
+if [[ -f *.hotspot.vcf.gz ]]
+then
+    priority="$priority,hotspot"
+fi
+
+java -Xmx32g -jar $GATK_JAR -R ${index_path}/genome.fa -T CombineVariants --filteredrecordsmergetype KEEP_UNCONDITIONAL $varlist -genotypeMergeOptions PRIORITIZE -priority $priority -o ${pair_id}.int.vcf
+
+perl $baseDir/uniform_integrated_vcf.pl ${pair_id}.int.vcf
+bgzip ${pair_id}_integrate.bed
+tabix ${pair_id}_integrate.bed.gz
+bgzip ${pair_id}.uniform.vcf
+tabix ${pair_id}.uniform.vcf.gz
+bcftools annotate -a ${pair_id}_integrate.bed.gz --columns CHROM,FROM,TO,CallSet -h ${index_path}/CallSet.header ${pair_id}.uniform.vcf.gz | bgzip > ${pair_id}.union.vcf.gz
--- a/preproc_fastq/check_designfile_rnaseq.pl
+++ b/preproc_fastq/check_designfile_rnaseq.pl
-#!/usr/bin/perl -w
-#check_designfile.pl
-
-my $pe = shift @ARGV;
-my $dfile = shift @ARGV;
-open OUT, ">design.valid.txt" or die $!;
-#open OUT2, ">tochannel.csv" or die $!;
-open DFILE, "<$dfile" or die $!;
-my $head = <DFILE>;
-chomp($head);
-$head =~ s/FullPathTo//g;
-my @colnames = split(/\t/,$head);
-my %newcols = map {$_=> 1} @colnames;
-
-
-unless (grep(/FqR1/,@colnames)) {
-    die "Missing Sequence Files in Design File: FqR1\n";
-}
-unless (grep(/SampleID/,@colnames)) {
-    die "Missing SampleID in Design File\n";
-}
-
-if ($pe eq 'pe') {
-    unless (grep(/FqR2/,@colnames)) {
-	die "Missing Sequence Files in Design File: FqR2\n";
-    }
-}else {
-    delete $newcols{FqR2};
-}
-
-my @cols = sort {$a cmp $b} keys %newcols;
-print OUT join("\t","SampleID","FqR1","FqR2"),"\n";
-my @grp = ('a','b');
-my $lnct = 0;
-while (my $line = <DFILE>) {
-    chomp($line);
-    $line =~ s/FullPathTo//g;
-    my @row = split(/\t/,$line);
-    my %hash;
-    foreach my $i (0..$#row) {
-	next unless ($newcols{$colnames[$i]});
-	$hash{$colnames[$i]} = $row[$i];
-    }
-    if ($hash{SampleID} =~ m/^\d/) {
-	$hash{SampleID} =~ s/^/S/;
-    }
-    $hash{SampleName} = $hash{SampleID} unless ($hash{SampleName});
-    $hash{SubjectID} = $hash{SampleID} unless ($hash{SubjectID});
-    
-    unless ($hash{SampleGroup}) {
-	$j = $lnct %% 2;
-	$hash{SampleGroup} = $grp[$j];
-    }
-    $hash{SampleGroup} =~ s/_/./g;
-    my @line;
-    foreach $f (@cols) {
-	push @line, $hash{$f};
-    }
-    print OUT join("\t",@line),"\n";
-    $hash{FqR2} = 'na' unless ($hash{FqR2});
-    print join(",",$hash{SampleID},$hash{FqR1},$hash{FqR2}),"\n";
-    $lnct ++;
-}
--- a/variants/annotvcf.sh
+++ b/variants/annotvcf.sh
@@ -21,7 +21,7 @@ do
 done
 function join_by { local IFS="$1"; shift; echo "$*"; }
 shift $(($OPTIND -1))
-module load python/2.7.x-anaconda bedtools/2.26.0 snpeff/4.3q
+module load python/2.7.x-anaconda bedtools/2.26.0 samtools/1.6 snpeff/4.3q
 if  [[ $index_path == '/project/shared/bicf_workflow_ref/GRCh38' ]]
 then
    tabix ${unionvcf}

--- a/variants/gatkrunner.sh
+++ b/variants/gatkrunner.sh
@@ -63,8 +63,9 @@ if [[ $algo == 'gatkbam_rna' ]]
 then
    module load picard/2.10.3
    java -Xmx4g -jar $PICARD/picard.jar CleanSam INPUT=${sbam} OUTPUT=${pair_id}.clean.bam
+    java -Xmx4g -jar $PICARD/picard.jar ReorderSam I=${pair_id}.clean.bam O=${pair_id}.sort.bam R=${reffa} CREATE_INDEX=TRUE 
    #java -Xmx4g -jar $PICARD/picard.jar AddOrReplaceReadGroups INPUT=${pair_id}.clean.bam O=${pair_id}.rg_added_sorted.bam SO=coordinate RGID=${pair_id} RGLB=tx RGPL=illumina RGPU=barcode RGSM=${pair_id}
-    samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.clean.bam
+    #samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.clean.bam
    java -Xmx4g -jar $GATK_JAR -T SplitNCigarReads -R ${reffa} -I ${pair_id}.clean.bam -o ${pair_id}.split.bam -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS
    java -Xmx32g -jar $GATK_JAR -T RealignerTargetCreator -known ${knownindel} -R ${reffa} -o ${pair_id}.bam.list -I ${pair_id}.split.bam -nt $SLURM_CPUS_ON_NODE -nct 1
    java -Xmx32g -jar $GATK_JAR -I ${pair_id}.split.bam -R ${reffa} --filter_mismatching_base_and_quals -T IndelRealigner -targetIntervals ${pair_id}.bam.list -o ${pair_id}.realigned.bam

--- a/variants/uniform_vcf_gt.pl
+++ b/variants/uniform_vcf_gt.pl
+#!/usr/bin/perl 
+#migrate_db.pl
+
+my $vcf = shift @ARGV;
+my $outfile = $vcf;
+$outfile =~ s/vcf.gz/uniform.vcf/;
+open OUT, ">$outfile" or die $!;
+open VCF, "gunzip -c $vcf|" or die $!;
+while (my $line = <VCF>) {
+    chomp($line);
+    if ($line =~ m/#/) {
+	print OUT $line,"\n";
+	next;
+    }
+    my ($chrom, $pos,$id,$ref,$alt,$score,
+	$filter,$annot,$format,@gts) = split(/\t/, $line);
+    my %hash = ();
+    foreach $a (split(/;/,$annot)) {
+	my ($key,$val) = split(/=/,$a);
+	$hash{$key} = $val;
+	}
+    my @deschead = split(/:/,$format);
+    my $newformat = 'GT:DP:AD:AO:RO';
+    my @newgts = ();
+    my $missingGT = 0;
+  FG:foreach my $allele_info (@gts) {
+      my @gtinfo = split(/:/,$allele_info);
+      my %gtdata;
+      if ($allele_info eq '.') {
+	  push @newgts, '.:.:.:.:.';
+	  $missingGT ++;
+	  next FG;
+      }
+      foreach my $i (0..$#deschead) {
+	  $gtdata{$deschead[$i]} = $gtinfo[$i];
+      }
+      if ($gtdata{DP} == 0 || $gtdata{GT} eq './.') {
+	  push @newgts, '.:.:.:.:.';
+	  $missingGT ++;
+	  next FG;
+      }
+      if ($gtdata{AD}){
+	  ($gtdata{RO},@alts) = split(/,/,$gtdata{AD});
+	  $gtdata{AO} = join(",",@alts);
+	  $gtdata{DP} = $gtdata{RO};
+	  foreach (@alts) {
+	      $gtdata{DP} += $_;
+	  }
+      } elsif (exists $gtdata{NR} && exists $gtdata{NV}) {
+	      $gtdata{DP} = $gtdata{NR}; 	
+	      $gtdata{AO} = $gtdata{NV};
+	      $gtdata{RO} = $gtdata{DP} - $gtdata{AO};
+      } elsif (exists $gtdata{AO} && exists $gtdata{RO}) {
+	  $gtdata{AD} = join(',',$gtdata{RO},$gtdata{AO});
+	  $gtdata{DP} = $gtdata{RO};
+	  foreach (split(',',$gtdata{AO})) {
+	      $gtdata{DP} += $_;
+	  }
+      }
+      if ($gtdata{DP} && $gtdata{DP} < 5) {
+	  $missingGT ++;
+      }
+      push @newgts, join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO});
+  }
+    next if ($missingGT == scalar(@gts));
+    print OUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n";
+}
+close VCF;
--- a/variants/union.sh
+++ b/variants/union.sh
@@ -28,9 +28,6 @@ list2=`ls *vcf.gz|grep -v hotspot`
 varlist=''
 calllist=''
 for i in *.vcf.gz; do
-    EXT="${i#*.}"
-    CALL="${EXT%%.*}"
-    calllist="$calllist $CALL"
    if [[ $i == $HS ]]
    then
 	bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a $i -b stdin |bgzip > hotspot.nooverlap.vcf.gz
@@ -38,6 +35,6 @@ for i in *.vcf.gz; do
    fi
 done 

-perl $baseDir/unionvcf.pl $list2
+perl $baseDir/unionvcf.pl ${index_path}/union.header.vcf $list2
 perl $baseDir/vcfsorter.pl ${index_path}/genome.dict int.vcf |bgzip > ${pair_id}.union.vcf.gz

--- a/variants/unionvcf.pl
+++ b/variants/unionvcf.pl
@@ -3,23 +3,33 @@

 my $headerfile = shift @ARGV;
 my @vcffiles = @ARGV;
-my $outfile = $vcf;
-$outfile =~ s/vcf.gz/uniform.vcf/;
+
 open HEADER, "<$headerfile" or die $!;
-open OUT, ">union.vcf" or die $!;
+open OUT, ">int.vcf" or die $!;
 while (my $line = <HEADER>) {
-    print OUT $line;
+    chomp($line);
+    print OUT $line,"\n";;
 }
 close HEADER;
+my @sampleorder;
+
 my %headerlines;
 foreach $vcf (@vcffiles) {
    $caller = (split(/\./,$vcf))[1];
    open VCF, "gunzip -c $vcf|" or die $!;
+    my @sampleids;
    while (my $line = <VCF>) {
 	chomp($line);
 	if ($line =~ m/#/) {
-	    $headerlines{$line} = 1;
-	    next;
+	    if ($line =~ m/#CHROM/) {
+		($chromhd, $posd,$idhd,$refhd,$althd,$scorehd,
+		 $filterhd,$annothd,$formathd,@sampleids) = split(/\t/, $line);
+		unless (@sampleorder) {
+		    @sampleorder = @sampleids;
+		    print OUT $line,"\n";
+		}
+		next;
+	    }
 	}
 	my ($chrom, $pos,$id,$ref,$alt,$score,
 	    $filter,$annot,$format,@gts) = split(/\t/, $line);
@@ -30,13 +40,15 @@ foreach $vcf (@vcffiles) {
 	}
 	my @deschead = split(/:/,$format);
 	my $newformat = 'GT:DP:AD:AO:RO';
-	my @newgts = ();
+	my %newgts;
 	my $missingGT = 0;
-      FG:foreach my $allele_info (@gts) {
+      FG:foreach my $i (0..$#gts) {
+	  my $allele_info = $gts[$i];
+	  my $sid = $sampleids[$i];
 	  my @gtinfo = split(/:/,$allele_info);
 	  my %gtdata;
 	  if ($allele_info eq '.') {
-	      push @newgts, '.:.:.:.:.';
+	      $newgts{$sid} = '.:.:.:.:.';
 	      $missingGT ++;
 	      next FG;
 	  }
@@ -44,7 +56,7 @@ foreach $vcf (@vcffiles) {
 	      $gtdata{$deschead[$i]} = $gtinfo[$i];
 	  }
 	  if ($gtdata{DP} == 0 || $gtdata{GT} eq './.') {
-	      push @newgts, '.:.:.:.:.';
+	      $newgts{$sid} = '.:.:.:.:.';
 	      $missingGT ++;
 	      next FG;
 	  }
@@ -69,9 +81,13 @@ foreach $vcf (@vcffiles) {
 	  if ($gtdata{DP} && $gtdata{DP} < 3) {
 	      $missingGT ++;
 	  }
-	  push @newgts, join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO});
+	  $newgts{$sid} =  join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO});
      }
 	next if ($missingGT == scalar(@gts));
+	my @newgts;
+	foreach $id (@sampleorder) {
+	    push @newgts, $newgts{$id};
+	}
 	$lines{$chrom}{$pos}{$caller} = join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n";
    }
    close VCF;