diff --git a/genect_rnaseq/fpkm_subset_panel.pl b/genect_rnaseq/fpkm_subset_panel.pl new file mode 100755 index 0000000000000000000000000000000000000000..baa830a29069088bb559c3ec83d9324a9d7f62c7 --- /dev/null +++ b/genect_rnaseq/fpkm_subset_panel.pl @@ -0,0 +1,85 @@ +#!/bin/bin/perl +use strict; +use warnings; +use diagnostics; +use Getopt::Long; + + +my ($genelist,$fpkm_file) = ("") x 2; + +GetOptions( 'g|genes=s' => \$genelist, + 'f|fpkm=s' => \$fpkm_file); + +my $fpkm_out = $fpkm_file; +$fpkm_out =~ s/\.fpkm.+txt/\.fpkm.capture.txt/; +open OUT, ">$fpkm_out" or die $!; + +#gene_info.human.txt file is required for gene alias information +open GENEINFO, "</project/shared/bicf_workflow_ref/human/gene_info.human.txt" or die $!; +my %geneinfo; +while(my $gline = <GENEINFO>){ + chomp $gline; + my ($taxid,$geneid,$symbol,$locus,$syn,@info) = split("\t",$gline); + if($syn ne '-'){ + my @synonyms = split(/\|/,$syn); + $geneinfo{$symbol} = $syn; + foreach my $synonym(@synonyms){ + $geneinfo{$synonym} = $symbol; + } + } +} + +#genelist is the list of genes in panel. Not specifying genelist will print out all lines(e.g. wholernaseq) +my %genes; +if($genelist ne ""){ + open GENES, "<$genelist" or die $!; + while (my $gene = <GENES>){ + chomp $gene; + $genes{$gene} = 1; + } + close GENES; +} +my $genelist_size = keys %genes; + + +#FPKM data +open FPKM, "<$fpkm_file" or die $!; +my %fpkmdata; +my %found; +my $header = <FPKM>; +chomp $header; +print OUT $header."\n"; +while(my $line = <FPKM>){ + chomp $line; + my ($gid,$gname,$ref,$strand,$start,$end,$cov,$fpkm,$tpm) = split("\t",$line); + if($ref =~ /^chr/){ + if(!exists $fpkmdata{$gname}){$fpkmdata{$gname} = $line;} + else{$fpkmdata{$gname} = $fpkmdata{$gname}."\n".$line;} + } + if ($genelist_size == 0){ #wholernaseq + print OUT $line."\n"; + } + elsif($genes{$gname} and $ref =~ /^chr/){ #panel gene found in fpkm data + print OUT $line."\n"; + $found{$gname} =1; + } +} + +foreach my $gene_key (keys %genes){ + if($found{$gene_key}){} + else { #gene from gene list is not in fpkm data or has a different name + if($geneinfo{$gene_key}){ #gene is in gene_info.human.txt + if($fpkmdata{$geneinfo{$gene_key}}){ #gene from gene list is an alias in gene_info.human.txt + print OUT $fpkmdata{$geneinfo{$gene_key}}."\n"; + } + elsif($geneinfo{$gene_key} =~ /\|/){ + my @alias_array = split(/\|/, $geneinfo{$gene_key}); + foreach my $alias(@alias_array){ + if($fpkmdata{$alias}){print OUT $fpkmdata{$alias}."\n";}#gene from genelist is a Gene Name in gene_info.human.txt but fpkm data has an alias + } + } + } + } +} +close FPKM; +close OUT; diff --git a/genect_rnaseq/geneabundance.sh b/genect_rnaseq/geneabundance.sh index 5d1a362da7f790c8dafb40218f83f5d78f3463aa..35c1d000fc2ad58ff7963819facfd92cd0f6b006 100644 --- a/genect_rnaseq/geneabundance.sh +++ b/genect_rnaseq/geneabundance.sh @@ -11,13 +11,14 @@ usage() { exit 1 } OPTIND=1 # Reset OPTIND -while getopts :b:g:p:s:h opt +while getopts :b:g:p:f:s:h opt do case $opt in b) sbam=$OPTARG;; g) gtf=$OPTARG;; p) pair_id=$OPTARG;; s) stranded=$OPTARG;; + f) filter=$OPTARG;; h) usage;; esac done @@ -48,4 +49,11 @@ featureCounts -s $stranded -M --fraction -J --ignoreDup -T $NPROC -p -g gene_nam mkdir ${pair_id}_stringtie cd ${pair_id}_stringtie stringtie ../${sbam} -p $NPROC -G ${gtf} -B -e -o denovo.gtf -A ../${pair_id}.fpkm.txt - + +if [[ -f $filter ]] +then + cd .. + mv ${pair_id}.fpkm.txt ${prefix}.fpkm.ori.txt + perl ${baseDir}/fpkm_subset_panel.pl -f ${prefix}.fpkm.ori.txt -g $filter + mv ${prefix}.fpkm.capture.txt ${pair_id}.fpkm.txt +fi