update for variant calling debugging

3d9955f6 · Brandi Cantarel · 1fa984ad · 3d9955f6 · 3d9955f6 · 3d9955f6
Commit 3d9955f6 authored 7 years ago by Brandi Cantarel
--- a/alignment/add_umi_sam.py
+++ b/alignment/add_umi_sam.py
+#!/bin/env python
+import sys
+import pysam
+import argparse
+def get_args():
+    '''Define arguments.'''
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument('-s', '--sam',
+                        help="The sam file",
+                        required=True)
+    parser.add_argument('-o', '--out',
+                        help="The outfile",
+                        default='outfile.bam')
+    args = parser.parse_args()
+    return args
+
+# set the tag names - take a look at SAM spec to pick an appropriate one
+args = get_args()
+infile = pysam.AlignmentFile(args.sam, "r")
+out = pysam.AlignmentFile(args.out, "wb", template=infile)
+for read in infile.fetch():
+      read.set_tag('RX', read.qname.split(":")[-1])
+      out.write(read)
+
+infile.close()
+out.close()
--- a/alignment/bamqc.sh
+++ b/alignment/bamqc.sh
@@ -12,13 +12,13 @@ usage() {
  exit 1
 }
 OPTIND=1 # Reset OPTIND
-while getopts :r:b:n:p:h opt
+while getopts :r:b:c:n:p:h opt
 do
    case $opt in
        r) index_path=$OPTARG;;
        b) sbam=$OPTARG;;
        c) bed=$OPTARG;;
-        y) nuctype=$OPTARG;;
+        n) nuctype=$OPTARG;;
        p) pair_id=$OPTARG;;
        h) usage;;
    esac
@@ -27,13 +27,19 @@ done
 shift $(($OPTIND -1))

 # Check for mandatory options
-if [[ -z $pair_id ]] || [[ -z $sbam ]]; then
-    usage
-fi
+#if [[ -z $pair_id ]] || [[ -z $sbam ]]; then
+#    usage
+#fi

 module load samtools/1.6 fastqc/0.11.5
 samtools flagstat ${sbam} > ${pair_id}.flagstat.txt
 fastqc -f bam ${sbam}
+baseDir="`dirname \"$0\"`"
+
+if [[ -z $SLURM_CPUS_ON_NODE ]]
+then
+    SLURM_CPUS_ON_NODE=1
+fi

 if [[ $nuctype == 'dna' ]]; then
    module load bedtools/2.26.0 picard/2.10.3
@@ -42,10 +48,10 @@ if [[ $nuctype == 'dna' ]]; then
    samtools flagstat ${pair_id}.ontarget.bam > ${pair_id}.ontarget.flagstat.txt
    samtools view -b -q 1 ${pair_id}.ontarget.bam | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed}  >  ${pair_id}.mapqualcov.txt
    samtools view -b -F 1024 ${pair_id}.ontarget.bam | bedtools coverage -sorted -g  ${index_path}/genomefile.txt -a ${bed} -b stdin -hist | grep ^all > ${pair_id}.dedupcov.txt 
-    java -Xmx32g -jar \$PICARD/picard.jar CollectAlignmentSummaryMetrics R=${reffa} I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt
-    java -Xmx64g -jar \$PICARD/picard.jar EstimateLibraryComplexity I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.libcomplex.txt
-    samtools view -F 1024 ${pair_id}.ontarget.bam | awk '{sum+=\$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt
-    java -Xmx4g -jar \$PICARD/picard.jar CollectInsertSizeMetrics INPUT=${pair_id}.bam HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${reffa} OUTPUT=${pair_id}.hist.txt
+    java -Xmx32g -jar $PICARD/picard.jar CollectAlignmentSummaryMetrics R=${reffa} I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt
+    java -Xmx64g -jar $PICARD/picard.jar EstimateLibraryComplexity I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.libcomplex.txt
+    samtools view -F 1024 ${pair_id}.ontarget.bam | awk '{sum+=$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt
+    java -Xmx4g -jar $PICARD/picard.jar CollectInsertSizeMetrics INPUT=${pair_id}.bam HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${reffa} OUTPUT=${pair_id}.hist.txt
    samtools view -b -q 1 ${pair_id}.ontarget.bam | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed}  >  ${pair_id}.mapqualcov.txt
    bedtools coverage -sorted -g  ${index_path}/genomefile.txt -a ${bed} -b ${sbam} -hist > ${pair_id}.covhist.txt
    perl $baseDir/scripts/calculate_depthcov.pl ${pair_id}.covhist.txt

--- a/alignment/dnaseqalign.sh
+++ b/alignment/dnaseqalign.sh
@@ -2,22 +2,23 @@
 #dnaseqalign.sh

 usage() {
-  echo "-h Help documentation for hisat.sh"
+  echo "-h Help documentation for dnaseqalign.sh"
  echo "-r  --Reference Genome: GRCh38 or GRCm38"
  echo "-x  --FastQ R1"
  echo "-y  --FastQ R2"
  echo "-p  --Prefix for output file name"
-  echo "Example: bash dnaseqalign.sh -p prefix -r /project/shared/bicf_workflow_ref/GRCh38 -x SRR1551047_1.fastq.gz  -y SRR1551047_2.fastq.gz"
+  echo "-u UMI"
+  echo "Example: bash dnaseqalign.sh -p prefix -u 1 -r /project/shared/bicf_workflow_ref/GRCh38 -x SRR1551047_1.fastq.gz  -y SRR1551047_2.fastq.gz"
  exit 1
 }
 OPTIND=1 # Reset OPTIND
-while getopts :r:a:x:y:p:h opt
+while getopts :r:x:y:p:uh opt
 do
    case $opt in
        r) index_path=$OPTARG;;
        x) fq1=$OPTARG;;
        y) fq2=$OPTARG;;
-	a) algo=$OPTARG;;
+	u) umi='umi';;
        p) pair_id=$OPTARG;;
        h) usage;;
    esac
@@ -35,25 +36,27 @@ then
    SLURM_CPUS_ON_NODE=1
 fi

-module load bwakit/0.7.15 bwa/intel/0.7.15 samtools/1.6 bcftools/1.6 htslib/1.6 picard/2.10.3 speedseq/20160506
-if ($fq1 != $fq2)
+module load bwakit/0.7.15 bwa/intel/0.7.15 samtools/1.6 bcftools/1.6 htslib/1.6 picard/2.10.3
+
+baseDir="`dirname \"$0\"`"
+
+if [[ $fq1 == $fq2 ]]
 then
-    bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} > out.sam
+   bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} > out.sam
 else
-    bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} ${fq2} > out.sam
+   bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} ${fq2} > out.sam
 fi
-if [[ $index_path == '/project/shared/bicf_workflow_ref/GRCh38' ]]
+
+if [[ $umi=='umi' ]]
 then
-    cat out.sam | k8 /cm/shared/apps/bwakit/0.7.15/bwa-postalt.js -p ${pair_id}.hla ${index_path}/genome.fa.alt | samtools view -1 - > output.unsort.bam
-    run-HLA ${pair_id}.hla > ${pair_id}.hla.top 2> ${pair_id}.log.hla
-    touch ${pair_id}.hla.HLA-dummy.gt
-    cat ${pair_id}.hla.HLA*.gt | grep ^GT | cut -f2- > ${pair_id}.hla.all
+    k8 /cm/shared/apps/bwakit/0.7.15/bwa-postalt.js -p ${pair_id}.hla ${index_path}/genome.fa.alt out.sam | python ${baseDir}/add_umi_bam.py -s - -o output.unsort.bam
+elif [[ $index_path == '/project/shared/bicf_workflow_ref/GRCh38' ]]
+then
+    k8 /cm/shared/apps/bwakit/0.7.15/bwa-postalt.js -p ${pair_id}.hla ${index_path}/genome.fa.alt out.sam| samtools view -1 - > output.unsort.bam
 else 
    samtools view -1 -o output.unsort.bam out.sam
-fi 
-samtools sort --threads $SLURM_CPUS_ON_NODE -o output.dups.bam output.unsort.bam
-samtools view -h output.unsort.bam | samblaster -M -a --excludeDups --addMateTags --maxSplitCount 2 --minNonOverlap 20 -d discordants.sam -s splitters.sam > temp.sam
-gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" splitters.sam | samtools  view -S -b - | samtools sort -o ${pair_id}.splitters.bam -
-gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" discordants.sam | samtools  view -S  -b - | samtools sort -o ${pair_id}.discordants.bam -
+fi
+
+samtools sort -n --threads $SLURM_CPUS_ON_NODE -o output.dups.bam output.unsort.bam
 java -Djava.io.tmpdir=./ -Xmx4g  -jar $PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.dups.bam O=${pair_id}.bam
 samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam
--- a/alignment/markdups.sh
+++ b/alignment/markdups.sh
@@ -31,6 +31,7 @@ if [[ -z $SLURM_CPUS_ON_NODE ]]
 then
    SLURM_CPUS_ON_NODE=1
 fi
+baseDir="`dirname \"$0\"`"

 module load picard/2.10.3 samtools/1.6

@@ -53,6 +54,7 @@ then
    java -Djava.io.tmpdir=./ -Xmx4g  -jar $PICARD/picard.jar MarkDuplicates BARCODE_TAG=RX I=${sbam} O=${pair_id}.dedup.bam M=${pair_id}.dedup.stat.txt
 elif [ $algo == 'fgbio_umi' ]   
 then
+    samtools index -@ $SLURM_CPUS_ON_NODE ${sbam}
    source activate fgbiotools
    fgbio GroupReadsByUmi -s identity -i ${sbam} -o ${pair_id}.group.bam -m 10
    fgbio CallMolecularConsensusReads -i ${pair_id}.group.bam -p consensus -M 1 -o ${pair_id}.consensus.bam -S ':none:'

--- a/alignment/rnaseqalign.sh
+++ b/alignment/rnaseqalign.sh
@@ -8,17 +8,19 @@ usage() {
  echo "-y  --FastQ R2"
  echo "-a  --Method: hisat or star"
  echo "-p  --Prefix for output file name"
-  echo "Example: bash hisat.sh -p prefix -r GRCh38 -a hisat -x SRR1551047_1.fastq.gz  -y SRR1551047_2.fastq.gz"
+  echo "-u  [UMI sequences are in FQ Read Name]"
+  echo "Example: bash rnaseqalign.sh -a hisat -p prefix -u -r /project/shared/bicf_workflow_ref/GRCh38 -x SRR1551047_1.fastq.gz  -y SRR1551047_2.fastq.gz"
  exit 1
 }
 OPTIND=1 # Reset OPTIND
-while getopts :r:a:x:y:p:h opt
+while getopts :r:a:x:y:p:hu opt
 do
    case $opt in
        r) index_path=$OPTARG;;
        x) fq1=$OPTARG;;
        y) fq2=$OPTARG;;
 	a) algo=$OPTARG;;
+	u) umi=1;;
        p) pair_id=$OPTARG;;
        h) usage;;
    esac
@@ -32,6 +34,7 @@ if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then
 fi

 module load  samtools/gcc/1.6 picard/2.10.3
+baseDir="`dirname \"$0\"`"
 if [[ -z $SLURM_CPUS_ON_NODE ]]
 then
    SLURM_CPUS_ON_NODE=1
@@ -57,6 +60,11 @@ else
    fi
    samtools view -1 --threads $SLURM_CPUS_ON_NODE -o output.bam out.sam
 fi
+if [[ $umi==1 ]]
+then
+    python ${baseDir}/add_umi_bam.py -b output.bam -o output.unsort2.bam
+    mv output.unsort2.bam output.bam
+fi
 samtools sort -@ $SLURM_CPUS_ON_NODE -O BAM -n -o  output.nsort.bam output.bam
 java -jar $PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.nsort.bam O=${pair_id}.bam
 samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam
--- a/variants/annot_sv.pl
+++ b/variants/annot_sv.pl
+#!/usr/bin/perl -w
+#svvcf2bed.pl
+
+use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+
+my %opt = ();
+my $results = GetOptions (\%opt,'input|i=s','refdb|r=s','help|h');
+
+my $vcf = $opt{input};
+
+my %lines;
+my %eventid;
+my $ct = 0;
+open IN, "<$vcf" or die $!;
+while (my $line = <IN>) {
+  chomp($line);
+  if ($line =~ m/^#CHROM/) {
+    my @header = split(/\t/,$line);
+    ($chrom, $pos,$id,$ref,$alt,$score,
+     $filter,$info,$format,@subjacc) = split(/\t/, $line);
+  }
+  next if $line =~ m/#/;
+  my ($chrom, $pos,$id,$ref,$alt,$score,
+      $filter,$annot,$format,@gts) = split(/\t/, $line);
+  $ct ++;
+  my %hash = ();
+  foreach $a (split(/;/,$annot)) {
+    my ($key,$val) = split(/=/,$a);
+    $hash{$key} = $val unless ($hash{$key});
+  }
+  if ($id eq 'N') {
+      $id = 'NB'.sprintf("%06s",$ct);
+  }
+  my $evid = (split(/_/,$id))[0];
+  $hash{'END'} = $pos+1 unless $hash{'END'};
+  @deschead = split(":",$format);
+ F1:foreach $sample (@subjacc) {
+    my $allele_info = shift @gts;
+    @ainfo = split(/:/, $allele_info);
+    my %gtinfo = ();
+    foreach $k (0..$#deschead) {
+      $gtinfo{$deschead[$k]} = $ainfo[$k];
+    }
+    unless ($gtinfo{SU}) {
+	$gtinfo{SU} = 0;
+	$gtinfo{SU} = $gtinfo{RV}+$gtinfo{DV} if ($gtinfo{RV} && $gtinfo{DV});
+    }
+    $gtinfo{CN} = '' unless  $gtinfo{CN};
+    if ($alt =~ m/chr(\w+):(\d+)/i) {
+	if ($1 eq $chrom) {
+	    my $locus = join(":",$chrom,$pos,$2);
+	    $eventid{$id} = $locus;
+	    $lines{$evid}{$locus}{$sample} =  join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
+	}elsif ($id =~ m/_\d+/) {
+	    my $locus = join(":",$chrom,$pos,$hash{END});
+	    $eventid{$id} = $locus;
+	    $lines{$evid}{$locus}{$sample} =  join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
+	}else {
+	    my $locus1 = join(":",$chrom,$pos,$hash{END});
+	    my $id1 = $id."_1";
+	    $eventid{$id1} = $locus1;
+	    my $locus2 = join(":",'chr'.$1,$2,$2+1);
+	    my $id2 = $id."_2";
+	    $eventid{$id2} = $locus2;
+	    $lines{$evid}{$locus1}{$sample} =  join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
+	    $lines{$evid}{$locus2}{$sample} =  join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
+	}
+    }else {
+	if ($hash{CHR2} && $hash{CHR2} eq $chrom) {
+	    my $locus = join(":",$chrom,$pos,$hash{END});
+	    $eventid{$id} = $locus;
+	    $lines{$evid}{$locus}{$sample} =  join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
+	}elsif ($hash{CHR2} && $hash{CHR2} ne $chrom) {
+	    my $locus1 = join(":",$chrom,$pos,$pos+1);
+	    my $id1 = $id."_1";
+	    $eventid{$id1} = $locus1;
+	    my $locus2 = join(":",$hash{CHR2},$hash{END},$hash{END}+1);
+	    my $id2 = $id."_2";
+	    $eventid{$id2} = $locus2;
+	    $lines{$evid}{$locus1}{$sample} =  join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
+	    $lines{$evid}{$locus2}{$sample} =  join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
+	}unless ($hash{CHR2}) {
+	    my $locus = join(":",$chrom,$pos,$hash{END});
+	    $eventid{$id} = $locus;
+	    $lines{$evid}{$locus}{$sample} =  join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
+	}
+    }
+ }
+}
+
+close IN;
+
+my @files = ("exonoverlap_sv.txt","geneoverlap_sv.txt");
+foreach $file (@files) {
+  open IN, "<$file" or die $!;
+  while (my $line = <IN>) {
+    chomp($line);
+    next if ($line =~ m/#/);
+    my ($chrom,$start,$end,$id,$chr,$start2,$end2,$gene) = split(/\t/, $line);
+    my $evid = (split(/_/,$id))[0];
+    my $locus = $eventid{$id};
+    unless ($locus) {
+	$locus = $eventid{$evid};
+    }
+    if ($file eq 'exonoverlap_sv.txt') {
+	push @{$gene{$locus}}, $gene;
+	$inc{$evid} = 1;
+    }elsif ($file eq 'geneoverlap_sv.txt') {
+	push @{$gene{$locus}}, $gene;
+	$inc{$evid} = 1;
+    }
+  }
+}
+
+my $outfile = $vcf;
+$outfile =~ s/vcf/annot.txt/g;
+
+open OUT, ">$outfile" or die $!;
+foreach $events (sort keys %lines) {
+  next unless ($inc{$events});
+  foreach $locus (sort {$a cmp $b} keys %{$lines{$events}}) {
+    my %gene_annots;
+    if ($gene{$locus}) {
+      my @genes = @{$gene{$locus}};
+      my %hash;
+      foreach $g (@genes) {
+	foreach $trx (split(/,/,$g)) {
+	  my ($symbol,$trxid,$exonnum) = split(/\|/,$trx);
+	  next unless $exonnum;
+	  $hash{$symbol}{$trxid}{$exonnum} = 1;
+	} 
+      }
+      foreach $sym (keys %hash) {
+	foreach $trxid (keys %{$hash{$sym}}) {
+	  my @exons = sort {$a <=> $b} keys %{$hash{$sym}{$trxid}};
+	  my $exon_bound = $exons[0];
+	  if ($#exons > 0) {
+	    $exon_bound = join("-",$exons[0],$exons[-1]);
+	  }
+	  $gene_annots{$sym}{$exon_bound} = 1;
+	}
+      }
+    }
+    foreach $sample (sort {$a cmp $b} keys %{$lines{$events}{$locus}}) {
+      foreach $genesym (keys %gene_annots) {
+	  my ($SVTYPE,$CN,$SU) = split(/\t/,$lines{$events}{$locus}{$sample});
+	  next if $SU < 3;
+	  print OUT join("\t",'SV.'.$events,split(":",$locus),$lines{$events}{$locus}{$sample},
+			 $genesym, join(";",sort {$a cmp $b} keys %{$gene_annots{$genesym}}),$sample),"\n";
+      } 
+    }
+  }
+}
+
--- a/variants/cnvkit.sh
+++ b/variants/cnvkit.sh
@@ -35,4 +35,4 @@ cnvkit.py coverage ${sbam} /project/shared/bicf_workflow_ref/GRCh38/UTSWV2.cnvki
 cnvkit.py fix ${pair_id}.targetcoverage.cnn ${pair_id}.antitargetcoverage.cnn /project/shared/bicf_workflow_ref/GRCh38/UTSWV2.normals.cnn -o ${pair_id}.cnr
 cnvkit.py segment ${pair_id}.cnr -o ${pair_id}.cns
 cnvkit.py call ${pair_id}.cns -o ${pair_id}.call.cns
-cnvkit.py diagram ${pair_id}.cnr -s ${pair_id}.cns -o ${pair_id}.pdf
+cnvkit.py diagram ${pair_id}.cnr -s ${pair_id}.cns -o ${pair_id}.cnv.pdf
--- a/variants/germline_vc.sh
+++ b/variants/germline_vc.sh
@@ -53,7 +53,7 @@ else
    echo "Missing Fasta File: ${index_path}/genome.fa"
    usage
 fi
-module load python/2.7.x-anaconda samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14 parallel
+module load python/2.7.x-anaconda picard/2.10.3 samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14 parallel

 for i in *.bam; do
    if [[ ! -f ${i}.bai ]]

--- a/variants/svcalling.sh
+++ b/variants/svcalling.sh
+!/bin/bash
+#svcalling.sh
+
+usage() {
+  echo "-h Help documentation for gatkrunner.sh"
+  echo "-r  --Path to Reference Genome with the file genome.fa"
+  echo "-p  --Prefix for output file name"
+  echo "-b  --Bam File"
+  echo "Example: bash svcalling.sh -p prefix -r /path/GRCh38 -a gatk"
+  exit 1
+}
+OPTIND=1 # Reset OPTIND
+while getopts :r:a:b:p:h opt
+do
+    case $opt in
+        r) index_path=$OPTARG;;
+        p) pair_id=$OPTARG;;
+        b) sbam=$OPTARG;;
+        h) usage;;
+    esac
+done
+function join_by { local IFS="$1"; shift; echo "$*"; }
+
+shift $(($OPTIND -1))
+baseDir="`dirname \"$0\"`"
+
+
+# Check for mandatory options
+if [[ -z $pair_id ]] || [[ -z $index_path ]]; then
+    usage
+fi
+if [[ -z $SLURM_CPUS_ON_NODE ]]
+then
+    SLURM_CPUS_ON_NODE=1
+fi
+
+if [[ -a "${index_path}/genome.fa" ]]
+then
+    reffa="${index_path}/genome.fa"
+    dict="${index_path}/genome.dict"
+else 
+    echo "Missing Fasta File: ${index_path}/genome.fa"
+    usage
+
+fi
+baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+module load  speedseq/20160506 novoBreak/v1.1.3 delly2/v0.7.7-multi samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14
+#RUN DELLY
+mkdir temp
+delly2 call -t BND -o delly_translocations.bcf -q 30 -g ${reffa} ${sbam}
+delly2 call -t DUP -o delly_duplications.bcf -q 30 -g ${reffa} ${sbam}
+delly2 call -t INV -o delly_inversions.bcf -q 30 -g ${reffa} ${sbam}
+delly2 call -t DEL -o delly_deletion.bcf -q 30 -g ${reffa} ${sbam}
+delly2 call -t INS -o delly_insertion.bcf -q 30 -g ${reffa} ${sbam}
+delly2 filter -t BND -o  delly_tra.bcf -f germline delly_translocations.bcf
+delly2 filter -t DUP -o  delly_dup.bcf -f germline delly_translocations.bcf
+delly2 filter -t INV -o  delly_inv.bcf -f germline delly_translocations.bcf
+delly2 filter -t DEL -o  delly_del.bcf -f germline delly_translocations.bcf
+delly2 filter -t INS -o  delly_ins.bcf -f germline delly_translocations.bcf
+
+#MERGE DELLY AND MAKE BED
+bcftools concat -a -O v delly_dup.bcf delly_inv.bcf delly_tra.bcf delly_del.bcf delly_ins.bcf | vcf-sort > ${pair_id}.delly.vcf
+perl $baseDir/vcf2bed.sv.pl ${pair_id}.delly.vcf > delly.bed
+bgzip ${pair_id}.delly.vcf
+tabix ${pair_id}.delly.vcf.gz
+
+#MAKE FILES FOR LUMPY
+samtools sort -@ $SLURM_CPUS_ON_NODE -n -o namesort.bam ${sbam}
+samtools view -h namesort.bam | samblaster -M -a --excludeDups --addMateTags --maxSplitCount 2 --minNonOverlap 20 -d discordants.sam -s splitters.sam > temp.sam
+gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" splitters.sam | samtools  view -S -b - | samtools sort -o splitters.bam -
+gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" discordants.sam | samtools  view -S  -b - | samtools sort -o discordants.bam -
+
+#RUN LUMPY
+speedseq sv -t $SLURM_CPUS_ON_NODE -o ${pair_id}.sssv -R ${reffa} -B ${sbam} -D discordants.bam -S splitters.bam -x ${index_path}/exclude_alt.bed
+java -jar $SNPEFF_HOME/SnpSift.jar filter "GEN[0].SU > 2" ${pair_id}.sssv.sv.vcf.gz > lumpy.vcf
+perl $baseDir/vcf2bed.sv.pl lumpy.vcf > lumpy.bed
+
+#COMPARE DELLY & LUMPY
+bedtools intersect -v -a lumpy.bed -b delly.bed > lumpy_only.bed
+bedtools intersect -header -b lumpy_only.bed -a lumpy.vcf |bgzip > lumpy_only.vcf.gz
+vcf-concat ${pair_id}.delly.vcf.gz lumpy_only.vcf.gz |vcf-sort -t temp > ${pair_id}.sv.vcf
+perl $baseDir/vcf2bed.sv.pl ${pair_id}.sv.vcf |sort -V -k 1,1 -k 2,2n | grep -v 'alt' |grep -v 'random' |uniq > svs.bed
+bedtools intersect -header -wb -a svs.bed -b ${index_path}/gencode.exons.bed > exonoverlap_sv.txt
+bedtools intersect -v -header -wb -a svs.bed -b ${index_path}/gencode.exons.bed | bedtools intersect -header -wb -a stdin -b ${index_path}/gencode.genes.chr.bed > geneoverlap_sv.txt
+perl $baseDir/annot_sv.pl -r ${index_path} -i ${pair_id}.sv.vcf
+bgzip ${pair_id}.sv.vcf
+
--- a/variants/union.sh
+++ b/variants/union.sh
@@ -19,7 +19,8 @@ do
 done
 function join_by { local IFS="$1"; shift; echo "$*"; }
 shift $(($OPTIND -1))
-baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+baseDir="`dirname \"$0\"`"
+
 module load bedtools/2.26.0 samtools/1.6

 HS=*.hotspot.vcf.gz

--- a/variants/vcf2bed.sv.pl
+++ b/variants/vcf2bed.sv.pl
+#!/usr/bin/perl
+
+use strict;
+
+if (scalar @ARGV != 1) {
+	print "vcf2bed.pl <vcf_file>\n";
+	exit 1;
+}
+
+my $vcfFile = $ARGV[0];
+
+open VCF, $vcfFile
+	or die "Cannot open $vcfFile";
+
+my $ct = 0;
+while (<VCF>) {
+    next if (index($_, '#') == 0);
+    my @fields = split /\t/, $_;
+    $ct ++;
+    my $chrom = $fields[0];
+    my $pos = $fields[1];
+    my $ref = $fields[3];
+    my $annot = $fields[7];
+    my $alt = $fields[4];
+    if ($fields[2] eq 'N') {
+	$fields[2] = 'NB'.sprintf("%06s",$ct);
+    }
+    my %hash = ();
+    next if ($chrom =~ m/_/); 
+    foreach $a (split(/;/,$annot)) {
+	my ($key,$val) = split(/=/,$a);
+	$hash{$key} = $val unless ($hash{$key});
+    }
+    $hash{'END'} = $pos+1 unless $hash{'END'};
+    next if ($hash{CHR2} && $hash{CHR2} =~ m/_/); 
+    if ($alt =~ m/chr(\w+):(\d+)/i) {
+	if ($1 eq $chrom) {
+	    my $end = $2;
+	    if ($pos > $end) {
+		my $temp = $end;
+		$end = $pos;
+		$pos = $temp;
+	    }
+	    print join("\t",$chrom,$pos,$end,$fields[2]),"\n";
+	}elsif ($fields[2] =~ m/_\d+/) {
+	    print join("\t",$chrom,$pos,$hash{END},$fields[2]),"\n";
+	}else {
+	    print join("\t",$chrom,$pos,$pos+1,$fields[2]."_1"),"\n";
+	    print join("\t",'chr'.$1,$2,$2+1,$fields[2]."_2"),"\n";
+	}
+    }else {
+	if ($hash{CHR2} && $hash{CHR2} eq $chrom) {
+	    my $end = $hash{END};
+	    if ($pos > $end) {
+		my $temp = $end;
+		$end = $pos;
+		$pos = $temp;
+	    }
+	    print join("\t",$chrom,$pos,$end,$fields[2]),"\n";
+	}elsif ($hash{CHR2} && $hash{CHR2} ne $chrom) {
+	    print join("\t",$chrom,$pos,$pos+1,$fields[2]."_1"),"\n";
+	    print join("\t",$hash{CHR2},$hash{END},$hash{END}+1,$fields[2]."_2"),"\n";
+	}unless ($hash{CHR2}) {
+	    print join("\t",$chrom,$pos,$hash{END},$fields[2]),"\n";
+	}
+    }
+}
+
+close VCF;