From dbd26bcfed518b816be336924261601995c4b283 Mon Sep 17 00:00:00 2001
From: Brandi Cantarel <brandi.cantarel@utsouthwestern.edu>
Date: Sun, 2 Aug 2020 14:41:47 -0500
Subject: [PATCH] star fusion docker agfusion file format

---
 alignment/bam2tdf.sh                   |   0
 alignment/bamqc.sh                     |   0
 alignment/dnaseqalign.sh               |   0
 alignment/filter_genefusions_docker.pl | 127 +++++++++++++++++++++++++
 alignment/filter_viral_idxstats.pl     |   0
 alignment/hisat_genotype.sh            |   0
 alignment/indexbams.sh                 |   0
 alignment/markdups.sh                  |   0
 alignment/rnaseqalign.sh               |   0
 alignment/starfusion.sh                |  15 ++-
 alignment/virusalign.sh                |   0
 11 files changed, 137 insertions(+), 5 deletions(-)
 mode change 100644 => 100755 alignment/bam2tdf.sh
 mode change 100644 => 100755 alignment/bamqc.sh
 mode change 100644 => 100755 alignment/dnaseqalign.sh
 create mode 100755 alignment/filter_genefusions_docker.pl
 mode change 100644 => 100755 alignment/filter_viral_idxstats.pl
 mode change 100644 => 100755 alignment/hisat_genotype.sh
 mode change 100644 => 100755 alignment/indexbams.sh
 mode change 100644 => 100755 alignment/markdups.sh
 mode change 100644 => 100755 alignment/rnaseqalign.sh
 mode change 100644 => 100755 alignment/starfusion.sh
 mode change 100644 => 100755 alignment/virusalign.sh

diff --git a/alignment/bam2tdf.sh b/alignment/bam2tdf.sh
old mode 100644
new mode 100755
diff --git a/alignment/bamqc.sh b/alignment/bamqc.sh
old mode 100644
new mode 100755
diff --git a/alignment/dnaseqalign.sh b/alignment/dnaseqalign.sh
old mode 100644
new mode 100755
diff --git a/alignment/filter_genefusions_docker.pl b/alignment/filter_genefusions_docker.pl
new file mode 100755
index 0000000..6959c6c
--- /dev/null
+++ b/alignment/filter_genefusions_docker.pl
@@ -0,0 +1,127 @@
+#!/usr/bin/perl -w
+#svvcf2bed.pl
+
+use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+
+my %opt = ();
+my $results = GetOptions (\%opt,'fusion|f=s','prefix|p=s','help|h','datadir|r=s');
+
+open OM, "<$opt{datadir}/known_genefusions.txt" or die $!;
+while (my $line = <OM>) {
+    chomp($line);
+    $known{$line} = 1;
+}
+close OM;
+
+open OM, "<$opt{datadir}/genelist.txt" or die $!;
+while (my $line = <OM>) {
+    chomp($line);
+    $keep{$line} = 1;
+}
+
+my @exonfiles = `ls */*.exons.csv`;
+foreach $efile (@exonfiles) {
+    chomp($efile);
+    my @leftexons;
+    my @rightexons;
+    my ($dir,$pair,@etc) = split(/\/|\./,$efile);
+    open EFILE, "<$efile" or die $!;
+    my $header = <EFILE>;
+    while (my $line = <EFILE>) {
+	my ($leftgene,$rightgene,$lefttrx,$righttrx,
+	    $leftstrand,$rightstrand,$exonsrc,$exonnum,
+	    $exon_chr,$exon_start,$exon_end) = split(/,/,$line);
+	if ($exonsrc =~ m/5/) {
+	    push @leftexons, $exonnum;
+	}else {
+	    push @rightexons, $exonnum;
+	}
+    }
+    if ($leftexons[0] eq  $leftexons[-1]) {
+	$exonnuminfo{$dir}{leftgene} = $leftexons[0];
+    }else {
+	$exonnuminfo{$dir}{leftgene} = join("-",$leftexons[0],$leftexons[-1]);
+    }
+    if ($rightexons[0] eq  $rightexons[-1]) {
+	$exonnuminfo{$dir}{rightgene} = $rightexons[0];
+    }else {
+	$exonnuminfo{$dir}{rightgene} = join("-",$rightexons[0],$rightexons[-1]);
+    }
+}
+
+open OAS, ">$opt{prefix}\.translocations.answer.txt" or die $!;
+
+print OAS join("\t","FusionName","LeftGene","LeftBreakpoint",
+	       "LeftGeneExons","LeftStrand","RightGene","RightBreakpoint",
+	       "RightGeneExons","RightStrand","RNAReads","DNAReads",
+	       "FusionType","Annot",'Filter','ChrType','ChrDistance'),"\n";
+
+my $sname = $opt{prefix};
+
+open FUSION, "<$opt{fusion}" or die $!;
+my $header = <FUSION>;
+chomp($header);
+$header =~ s/^#//;
+my @hline = split(/\t/,$header);
+while (my $line = <FUSION>) {
+  chomp($line);
+  my @row = split(/\t/,$line);
+  my %hash;
+  foreach my $i (0..$#row) {
+    $hash{$hline[$i]} = $row[$i];
+  }
+  my @filter;
+  my ($left_chr,$left_pos,$left_strand) = split(/:/,$hash{LeftBreakpoint});
+  my ($right_chr,$right_pos,$right_strand) = split(/:/,$hash{RightBreakpoint});
+  $hash{LeftBreakpoint} = join(":",$left_chr,$left_pos);
+  $hash{RightBreakpoint} = join(":",$right_chr,$right_pos);
+  $hash{LeftStrand} = $left_strand;
+  $hash{RightStrand} = $right_strand;
+  $hash{LeftGene} = (split(/\^/,$hash{LeftGene}))[0];
+  $hash{RightGene} = (split(/\^/,$hash{RightGene}))[0];
+  unless ($keep{$hash{LeftGene}} || $keep{$hash{RightGene}}) {
+      push @filter, 'OutsideGeneList';
+  }
+  $hash{SumRNAReads} += $hash{JunctionReadCount}+$hash{SpanningFragCount};
+  my $fname = join("--",$hash{LeftGene},$hash{RightGene});
+  my $fname2 = join("--",sort {$a cmp $b} $hash{LeftGene},$hash{RightGene});
+  my $key = join("_",$hash{LeftGene},$left_pos,$hash{RightGene},$right_pos);
+  my ($leftexon,$rightexon);
+  if ($exonnuminfo{$key}) {
+      $leftexon = $exonnuminfo{$key}{leftgene};
+      $rightexon = $exonnuminfo{$key}{rightgene};
+  }
+  $hash{PROT_FUSION_TYPE} = 'in-frame' if ($hash{PROT_FUSION_TYPE} eq 'INFRAME');
+  my ($dna_support,$rna_support)=("no") x 2;
+  $hash{annots} =~ s/CHROMOSOMAL\[/CHROMOSOMAL /;
+  $hash{annots} =~ s/\]|\[|\"//g;
+  @annots = split(/,/,$hash{annots});
+  my ($chrom_type_dist) = grep(/CHROMOSOMAL/,@annots);
+  my ($chrtype,$chrdist) = split(/ /,$chrom_type_dist);
+  @annot2 = grep(!/CHROMOSOMAL/, @annots);
+  $fusion_annot = '';
+  if (scalar(@annot2) > 0) {
+      $fusion_annot = join(",",@annot2);
+  }
+  if ($known{$fname2} || $fusion_annot =~ m/CCLE|Cosmic|FA_CancerSupp|Klijn_CellLines|Mitelman|YOSHIHARA_TCGA|chimer/i) {
+      push @filter, 'LowReadCt' if ($hash{SumRNAReads} < 3);
+  }else {
+      push @filter, 'LowReadCt' if ($hash{SumRNAReads} < 5);
+  }
+  $rna_support = "yes";
+  if ($left_chr eq $right_chr) {
+      $diff = abs($right_pos-$left_pos);
+      push @filter, 'ReadThrough' if ($diff < 200000);
+  }
+  my $qc ='PASS';
+  if (scalar(@filter) > 0) {
+      $qc = join(";","FailedQC",@filter);
+  }
+  
+  print OAS join("\t",$fname,$hash{LeftGene},$hash{LeftBreakpoint},$leftexon,$hash{LeftStrand},
+		 $hash{RightGene},$hash{RightBreakpoint},$rightexon,$hash{RightStrand},
+		 $hash{SumRNAReads},0,lc($hash{PROT_FUSION_TYPE}),$fusion_annot,$qc,$chrtype,$chrdist),"\n";
+}
+
+
+close OAS;
diff --git a/alignment/filter_viral_idxstats.pl b/alignment/filter_viral_idxstats.pl
old mode 100644
new mode 100755
diff --git a/alignment/hisat_genotype.sh b/alignment/hisat_genotype.sh
old mode 100644
new mode 100755
diff --git a/alignment/indexbams.sh b/alignment/indexbams.sh
old mode 100644
new mode 100755
diff --git a/alignment/markdups.sh b/alignment/markdups.sh
old mode 100644
new mode 100755
diff --git a/alignment/rnaseqalign.sh b/alignment/rnaseqalign.sh
old mode 100644
new mode 100755
diff --git a/alignment/starfusion.sh b/alignment/starfusion.sh
old mode 100644
new mode 100755
index 02e3f40..a627763
--- a/alignment/starfusion.sh
+++ b/alignment/starfusion.sh
@@ -57,17 +57,22 @@ then
     module load singularity/3.0.2
     export PYENSEMBL_CACHE_DIR="/project/shared/bicf_workflow_ref/singularity_images"
     cut -f 5-8 ${pair_id}.starfusion.txt |perl -pe 's/\^|:/\t/g' | awk '{print "singularity exec /project/shared/bicf_workflow_ref/singularity_images/agfusion.simg agfusion annotate  -db  /project/shared/bicf_workflow_ref/singularity_images/pyensembl/GRCh38/ensembl92/agfusion.homo_sapiens.92.db -g5", $1,"-j5",$4,"-g3",$6,"-j3",$9,"-o",$1"_"$4"_"$6"_"$9}' |grep -v 'LeftGene' |sh
+    cut -f 8,10 ${pair_id}.starfusion.txt |grep -v Breakpoint |perl -pe 's/\t/\n/g' |awk -F ':' '{print $1"\t"$2-1"\t"$2}' > temp.bed
+    bedtools intersect -wao -a temp.bed -b ${index_path}/cytoBand.txt |cut -f 1,2,7 > cytoband_pos.txt
+    if [[ $filter == 1 ]]
+    then
+	perl $baseDir/filter_genefusions.pl -p ${pair_id} -r ${index_path} -f ${pair_id}.starfusion.txt
+    fi
 else
     export PYENSEMBL_CACHE_DIR=/opt
     refgeno=${index_path}/CTAT_resource_lib
     STAR-Fusion --genome_lib_dir ${refgeno} --min_sum_frags 3 --CPU $NPROC --left_fq ${fq1} --right_fq ${fq2} --examine_coding_effect --output_dir ${pair_id}_star_fusion
     cp ${pair_id}_star_fusion/star-fusion.fusion_predictions.abridged.coding_effect.tsv ${pair_id}.starfusion.txt
     cut -f 7-10 ${pair_id}.starfusion.txt |perl -pe 's/\^|:/\t/g' | awk '{print "agfusion annotate  -db agfusion.homo_sapiens.95.db -g5", $1,"-j5",$4,"-g3",$6,"-j3",$9,"-o",$1"_"$4"_"$6"_"$9}' |grep -v 'LeftGene' |sh
-fi
-
-if [[ $filter == 1 ]]
-then
     cut -f 8,10 ${pair_id}.starfusion.txt |grep -v Breakpoint |perl -pe 's/\t/\n/g' |awk -F ':' '{print $1"\t"$2-1"\t"$2}' > temp.bed
     bedtools intersect -wao -a temp.bed -b ${index_path}/cytoBand.txt |cut -f 1,2,7 > cytoband_pos.txt
-    perl $baseDir/filter_genefusions.pl -p ${pair_id} -r ${index_path} -f ${pair_id}.starfusion.txt
+    if [[ $filter == 1 ]]
+    then
+	perl $baseDir/filter_genefusions_docker.pl -p ${pair_id} -r ${index_path} -f ${pair_id}.starfusion.txt
+    fi
 fi
diff --git a/alignment/virusalign.sh b/alignment/virusalign.sh
old mode 100644
new mode 100755
-- 
GitLab