diff --git a/alignment/filter_genefusions.pl b/alignment/filter_genefusions.pl index 6dca268dba98d92cbc5b992f04add77881515b51..8ea28ab8dc6b9b35232a6ab6accedf8d08181a0b 100755 --- a/alignment/filter_genefusions.pl +++ b/alignment/filter_genefusions.pl @@ -91,6 +91,7 @@ while (my $line = <FUSION>) { $leftexon = $exonnuminfo{$key}{leftgene}; $rightexon = $exonnuminfo{$key}{rightgene}; } + $hash{PROT_FUSION_TYPE} = 'in-frame' if ($hash{PROT_FUSION_TYPE} eq 'INFRAME'); my ($dna_support,$rna_support)=("no") x 2; if ($known{$fname2} && ($hash{SumRNAReads} >= 3)|| ($hash{SumRNAReads} >= 5)) { $rna_support = "yes"; @@ -99,11 +100,11 @@ while (my $line = <FUSION>) { $hash{RightStrand},$hash{SumRNAReads},0),"\n"; print OAS join("\t",$fname,$hash{LeftGene},$hash{LeftBreakpoint},$leftexon,$hash{LeftStrand}, $hash{RightGene},$hash{RightBreakpoint},$rightexon,$hash{RightStrand}, - $hash{SumRNAReads},0,$hash{PROT_FUSION_TYPE},$hash{annots}),"\n"; + $hash{SumRNAReads},0,lc($hash{PROT_FUSION_TYPE}),$hash{annots}),"\n"; print OUTIR join("\t",$hash{LeftGene},$entrez{$hash{LeftGene}},"UTSW",$sname,$fname." fusion", - $dna_support,$rna_support,"STAR Fusion","N/A"),"\n"; + $dna_support,$rna_support,"STAR Fusion",lc($hash{PROT_FUSION_TYPE})),"\n"; print OUTIR join("\t",$hash{RightGene},$entrez{$hash{RightGene}},"UTSW",$sname,$fname." fusion", - $dna_support,$rna_support,"STAR Fusion","N/A"),"\n"; + $dna_support,$rna_support,"STAR Fusion",lc($hash{PROT_FUSION_TYPE})),"\n"; } } diff --git a/diff_exp/geneabundance.sh b/diff_exp/geneabundance.sh index 40a68ab11a3c04f7ec63ae15cac7ca8e43a22397..1baa572f82cf920d242ada960e3a7d86f1c2274f 100644 --- a/diff_exp/geneabundance.sh +++ b/diff_exp/geneabundance.sh @@ -39,8 +39,7 @@ then fi source /etc/profile.d/modules.sh module load subread/1.6.1 stringtie/1.3.2d-intel -featureCounts -s $stranded -T $SLURM_CPUS_ON_NODE -p -g gene_name -a ${gtf} -o ${pair_id}.cts ${sbam} - +featureCounts -s $stranded -M --fraction -J --ignoreDup -T $SLURM_CPUS_ON_NODE -p -g gene_name -a ${gtf} -o ${pair_id}.cts ${sbam} mkdir ${pair_id}_stringtie cd ${pair_id}_stringtie stringtie ../${sbam} -p $SLURM_CPUS_ON_NODE -G ../${gtf} -B -e -o denovo.gtf -A ../${pair_id}.fpkm.txt diff --git a/variants/annotvcf.sh b/variants/annotvcf.sh index f3d246289978d343b37ec7b4632e16021efb9b36..7bc785b9e608a0595fa454e065f809a4dca5a0b3 100755 --- a/variants/annotvcf.sh +++ b/variants/annotvcf.sh @@ -36,7 +36,7 @@ then bcftools annotate -Oz -a ${index_path}/gnomad.txt.gz -h ${index_path}/gnomad.header -c CHROM,POS,REF,ALT,GNOMAD_HOM,GNOMAD_AF,AF_POPMAX -o ${pair_id}.gnomad.vcf.gz ${unionvcf} tabix ${pair_id}.gnomad.vcf.gz bcftools annotate -Oz -a ${index_path}/repeat_regions.bed.gz -o ${pair_id}.repeat.vcf.gz --columns CHROM,FROM,TO,RepeatType -h /project/shared/bicf_workflow_ref/RepeatType.header ${pair_id}.gnomad.vcf.gz - java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.repeat.vcf.gz | java -jar $SNPEFF_HOME/SnpSift.jar annotate -id ${index_path}/dbSnp.vcf.gz - | java -jar $SNPEFF_HOME/SnpSift.jar annotate -info CLNSIG,CLNDSDB,CLNDSDBID,CLNDBN,CLNREVSTAT,CLNACC ${index_path}/clinvar.vcf.gz - | java -jar $SNPEFF_HOME/SnpSift.jar annotate -info CNT ${index_path}/cosmic.vcf.gz - | java -Xmx10g -jar $SNPEFF_HOME/SnpSift.jar dbnsfp -v -db ${index_path}/dbNSFP.txt.gz - | bgzip > ${pair_id}.annot.vcf.gz + java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-downstream -no-upstream -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.repeat.vcf.gz | java -jar $SNPEFF_HOME/SnpSift.jar annotate -id ${index_path}/dbSnp.vcf.gz - | java -jar $SNPEFF_HOME/SnpSift.jar annotate -info CLNSIG,CLNDSDB,CLNDSDBID,CLNDBN,CLNREVSTAT,CLNACC ${index_path}/clinvar.vcf.gz - | java -jar $SNPEFF_HOME/SnpSift.jar annotate -info CNT ${index_path}/cosmic.vcf.gz - | java -Xmx10g -jar $SNPEFF_HOME/SnpSift.jar dbnsfp -v -db ${index_path}/dbNSFP.txt.gz - | bgzip > ${pair_id}.annot.vcf.gz tabix ${pair_id}.annot.vcf.gz else if [[ $index_path == '/project/shared/bicf_workflow_ref/GRCm38' ]] diff --git a/variants/cnvkit.sh b/variants/cnvkit.sh index 7129b70b85d79db692773365d8ef41303b51c376..98c3363d298f42bc8c36a39d67c422a5644025f4 100755 --- a/variants/cnvkit.sh +++ b/variants/cnvkit.sh @@ -49,7 +49,7 @@ then elif [[ $capture == '/project/shared/bicf_workflow_ref/GRCh38/clinseq_prj/UTSWV2_2.panelplus.bed' ]] then normals="${index_path}/panelofnormals.panel1385V2_2.cnn" - target='panel1385V2-2.cnvkit_' + targets="${index_path}/panel1385V2-2.cnvkit_" fi echo "${targets}targets.bed" diff --git a/variants/filter_cnvkit.pl b/variants/filter_cnvkit.pl index 6dfdf441403379f1764b4ef450c8bfb20f0ad38a..b7504ad5823d209c647e3ae270f9e71df8f55f62 100755 --- a/variants/filter_cnvkit.pl +++ b/variants/filter_cnvkit.pl @@ -60,11 +60,19 @@ while (my $line = <CNR>) { my ($chr,$start,$end,$geneids,$log2,$depth,$weight) = split(/\t/,$line); my $key = $chr.":".$start."-".$end; my %genes; - my @ids = split(/;|,/,$geneids); - foreach my $gid (@ids) { - my ($key,$value) = split(/=/,$gid); - if ($key eq 'ensembl_gn' || $key eq 'identifier') { - $genes{$value} = 1 if $keep{$value}; + if ($geneids =~ m/ensembl_gn/g) { + my @ids = split(/;|,/,$geneids); + foreach my $gid (@ids) { + my ($key,$value) = split(/=/,$gid); + if ($key eq 'ensembl_gn' || $key eq 'identifier') { + $genes{$value} = 1 if $keep{$value}; + } + } + }else { + my @ids = split(/,/,$geneids); + foreach my $gid (@ids) { + my ($gene,$trxid,$exonnum,$strand) = split(/\|/,$gid); + $genes{$gene} = 1 if $keep{$gene}; } } foreach $gene (keys %genes) { @@ -73,7 +81,7 @@ while (my $line = <CNR>) { } open IN, "<$file" or die $!; -my $header = <IN>; +$header = <IN>; while (my $line = <IN>) { chomp($line); my ($chr,$start,$end,$geneids,$log2,$cn,$depth, @@ -81,11 +89,19 @@ while (my $line = <IN>) { next if ($chr eq 'chrX' && $cn == 1); my $key = $chr.":".$start."-".$end; my %genes; - my @ids = split(/;|,/,$geneids); - foreach my $gid (@ids) { - my ($key,$value) = split(/=/,$gid); - if ($key eq 'ensembl_gn' || $key eq 'identifier') { - $genes{$value} = 1 if $keep{$value}; + if ($geneids =~ m/ensembl_gn/g) { + my @ids = split(/;|,/,$geneids); + foreach my $gid (@ids) { + my ($key,$value) = split(/=/,$gid); + if ($key eq 'ensembl_gn' || $key eq 'identifier') { + $genes{$value} = 1 if $keep{$value}; + } + } + }else { + my @ids = split(/,/,$geneids); + foreach my $gid (@ids) { + my ($gene,$trxid,$exonnum,$strand) = split(/\|/,$gid); + $genes{$gene} = 1 if $keep{$gene}; } } my $len = sprintf("%.1f",($end-$start)/1000); diff --git a/variants/filter_pindel.pl b/variants/filter_pindel.pl new file mode 100644 index 0000000000000000000000000000000000000000..efc9a5cd79daea2082d07e3d011622096c562ba8 --- /dev/null +++ b/variants/filter_pindel.pl @@ -0,0 +1,112 @@ +#!/usr/bin/perl -w +#integrate_datasets.pl + +#module load vcftools/0.1.14 samtools/1.6 bedtools/2.26.0 +use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); +my %opt = (); +my $results = GetOptions (\%opt,'td|d=s','indel|i=s','sv|s=s','tumor|t=s'); + + +my @files = grep(/vcf.gz/,values %opt); +foreach $file (@files) { + chomp($file); + open IN, "gunzip -c $file |" or die $!; + my $outfile = $file; + $outfile =~ s/vcf.*/pass.vcf/; + my @gtheader; + open OUT, ">$outfile" or die $!; + W1:while (my $line = <IN>) { + chomp($line); + if ($line =~ m/^#/) { + print OUT $line,"\n"; + if ($line =~ m/^#CHROM/) { + my @header = split(/\t/,$line); + ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$info,$format,@gtheader) = split(/\t/, $line); + unless ($opt{tumor}) { + if (grep(/T_DNA/,@gtheader)) { + my @tsamps = grep(/T_DNA/,@gtheader); + $opt{tumor} = $tsamps[0]; + }else { + $opt{tumor} = $gtheader[0]; + } + } + } + next; + } + my ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$annot,$format,@gts) = split(/\t/, $line); + next if ($ref =~ m/\./ || $alt =~ m/\./ || $alt=~ m/,X/); + my %hash = (); + foreach $a (split(/;/,$annot)) { + my ($key,$val) = split(/=/,$a); + $hash{$key} = $val unless ($hash{$key}); + } + next unless ($hash{ANN}); + next unless ($hash{ANN} =~ m/HIGH|MODERATE|LOW/); + my %gtinfo = (); + my @deschead = split(/:/,$format); + F1:foreach my $k (0..$#gtheader) { + my $subjid = $gtheader[$k]; + my $allele_info = $gts[$k]; + my @ainfo = split(/:/, $allele_info); + my @mutallfreq = (); + foreach my $k (0..$#ainfo) { + $gtinfo{$subjid}{$deschead[$k]} = $ainfo[$k]; + $hash{$deschead[$k]} = $ainfo[$k] if ($subjid eq $opt{tumor}); + } + $gtinfo{$subjid}{DP} = (split(/,/,$gtinfo{$subjid}{DP}))[0] if ($gtinfo{$subjid}{DP}); + next F1 unless ($gtinfo{$subjid}{DP} && $gtinfo{$subjid}{DP} ne '.' && $gtinfo{$subjid}{DP} >= 1); + my @altct = split(/,/,$gtinfo{$subjid}{AD}); + my $refct = shift @altct; + @altct2 = split(/,/,$gtinfo{$subjid}{AO}); + if (scalar(@altct) ne scalar(@altct2)) { + warn "Inconsistent Allele counts @ $chrom,$pos,$alt,$gtinfo{$subjid}{AD},$gtinfo{$subjid}{AO}\n"; + } + my $total = $refct; + foreach my $act (@altct) { + next if ($act eq '.'); + $total += $act; + push @mutallfreq, sprintf("%.4f",$act/$gtinfo{$subjid}{DP}); + } + $gtinfo{$subjid}{MAF} = \@mutallfreq; + } + next unless ($gtinfo{$opt{tumor}}{DP} && $gtinfo{$opt{tumor}}{DP} ne '.' && $gtinfo{$opt{tumor}}{DP} >= 20); + unless ($gtinfo{$opt{tumor}}{AO} =~ m/\d+/ && $gtinfo{$opt{tumor}}{AD} =~ m/,/) { + warn "Missing Alt:$line\n"; + } + @tumormaf = @{$gtinfo{$opt{tumor}}{MAF}}; + @tumoraltct = split(/,/,$gtinfo{$opt{tumor}}{AO}); + next if ($tumoraltct[0] eq '.'); + $hash{AF} = join(",",@tumormaf); + next if ($tumoraltct[0] < 20); + next if ($tumormaf[0] < 0.05); + my $keepforvcf = 0; + foreach $trx (split(/,/,$hash{ANN})) { + my ($allele,$effect,$impact,$gene,$geneid,$feature, + $featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna, + $cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$trx); + next unless ($impact =~ m/HIGH|MODERATE/ || $effect =~ /splice/i); + next if($effect eq 'sequence_feature'); + if ($file eq $opt{sv}) { + next unless ($effect eq 'gene_fusion'); + } + $keepforvcf = $gene; + } + next unless $keepforvcf; + my @fail = sort {$a cmp $b} keys %fail; + next if (scalar(@fail) > 0); + my @nannot; + foreach $info (sort {$a cmp $b} keys %hash) { + if (defined $hash{$info}) { + push @nannot, $info."=".$hash{$info}; + }else { + push @nannot, $info; + } + } + my $newannot = join(";",@nannot); + print OUT join("\t",$chrom, $pos,$id,$ref,$alt,$score,$filter,$newannot, + $format,@gts),"\n"; + } + close IN; +} diff --git a/variants/parse_pindel.pl b/variants/parse_pindel.pl index facbeff4c0fe89a269f2e922b482403125bd733f..79c0e234e9df7838a72101d92535f82b1a3bdb74 100755 --- a/variants/parse_pindel.pl +++ b/variants/parse_pindel.pl @@ -77,14 +77,15 @@ while (my $line = <VCF>) { next if ($missingGT == scalar(@gts)); if ($hash{SVTYPE} eq 'DUP:TANDEM') { - print DUP join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; + print DUP join("\t",$chrom,$pos,$id,'N','<DUP>',$score,$filter,$annot,$newformat,@newgts),"\n"; }elsif ($hash{SVTYPE} eq 'INS') { - print SV join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; + print SV join("\t",$chrom,$pos,$id,'N','<INS>',$score,$filter,$annot,$newformat,@newgts),"\n"; }elsif ($hash{SVTYPE} eq 'DEL' || $hash{SVTYPE} eq 'INS') { if (abs($hash{SVLEN}) < 50) { print SI join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; }else { - print SV join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; + $newalt = "<".$hash{SVTYPE}.">"; + print SV join("\t",$chrom,$pos,$id,'N',$newalt,$score,$filter,$annot,$newformat,@newgts),"\n"; } } }