diff --git a/cbioportal/concat_cnvs.pl b/cbioportal/concat_cnvs.pl
new file mode 100644
--- /dev/null
+++ b/cbioportal/concat_cnvs.pl
@@ -0,0 +1,62 @@
+#!/usr/bin/perl -w
+#concat_cnvs.pl
+#
+# Merge the per-sample CNV tables found in the current directory into two
+# gene-by-sample matrices:
+#   *cnv_discreet.txt   -> discreet.cna.txt
+#   *cnv_continuous.txt -> continuous.cna.txt
+# Each input line is tab-separated: chrom, start, end, value, gene.
+# The sample name is the part of the file name before the first '.'.
+
+use strict;
+use warnings;
+
+# Value written for a gene that a sample has no call for.
+my $DEFAULT_CN = 2;
+
+concat_cnvs('*cnv_discreet.txt',   'discreet.cna.txt');
+concat_cnvs('*cnv_continuous.txt', 'continuous.cna.txt');
+
+# Build one gene-by-sample matrix.
+#   $pattern - glob pattern selecting the per-sample input files
+#   $outfile - matrix to write; header row is Hugo_Symbol, then the samples
+# Dies when an input file cannot be read or the output cannot be written.
+sub concat_cnvs {
+    my ($pattern, $outfile) = @_;
+    my %cts;       # gene => { sample => value }
+    my %sample;    # sample names seen
+
+    # glob() instead of `ls`: no shell, no trailing newline on the names.
+    foreach my $file (glob $pattern) {
+        open my $in, '<', $file or die "Cannot read $file: $!";
+        my ($sample) = split(/\./, $file);
+        $sample{$sample} = 1;
+        while (my $line = <$in>) {
+            chomp($line);
+            my ($chr, $s, $e, $ct, $gene) = split(/\t/, $line);
+            # bedtools intersect -wao fills the B columns with '.' when a
+            # segment overlaps no feature, so '.' is not a gene name.
+            next unless defined $gene && $gene ne '' && $gene ne '.';
+            $cts{$gene}{$sample} = $ct;
+        }
+        close $in;
+    }
+
+    my @samples = sort keys %sample;
+    open my $out, '>', $outfile or die "Cannot write $outfile: $!";
+    print {$out} join("\t", 'Hugo_Symbol', @samples), "\n";
+    # Genes are sorted so that repeated runs produce identical files.
+    foreach my $gene (sort keys %cts) {
+        my @line;
+        foreach my $sid (@samples) {
+            my $ct = $cts{$gene}{$sid};
+            # Default only a missing call: a genuine value of 0 must be
+            # kept rather than replaced by the default.
+            $ct = $DEFAULT_CN unless defined $ct && $ct ne '';
+            push @line, $ct;
+        }
+        print {$out} join("\t", $gene, @line), "\n";
+    }
+    close $out or die "Cannot close $outfile: $!";
+    return;
+}
diff --git a/cbioportal/filter_maf.pl b/cbioportal/filter_maf.pl
new file mode 100644
--- /dev/null
+++ b/cbioportal/filter_maf.pl
@@ -0,0 +1,84 @@
+#!/usr/bin/perl -w
+#filter_maf.pl
+#
+# Usage: filter_maf.pl -p STUDY_ID MAF_FILE [MAF_FILE ...]
+#
+# Writes variants.maf, holding only the columns in @mincols for the rows
+# whose FILTER matches PASS and whose Variant_Classification is not
+# excluded, and case_lists/sequenced.txt, listing every
+# Tumor_Sample_Barcode written to variants.maf.
+
+use strict;
+use warnings;
+use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+
+my %opt = ();
+GetOptions(\%opt, 'prjid|p=s')
+    or warn "$0: unrecognised options were ignored\n";
+unless (defined $opt{prjid}) {
+    warn "$0: no study id given with -p; the case list will lack one\n";
+    $opt{prjid} = '';
+}
+
+my @maffiles = @ARGV;
+
+# Columns copied to variants.maf, in output order.
+my @mincols = ('Hugo_Symbol','Entrez_Gene_Id','Variant_Classification',
+               'Tumor_Sample_Barcode','HGVSp_Short','Protein_position',
+               'SWISSPROT');
+# Rows whose Variant_Classification matches this are dropped.
+my $skip_class = qr/Silent|Intron|UTR|Flank|IGR|RNA|Splice_Region/;
+
+my @mafcols;            # column names from the most recent header line
+my %mafids;             # Tumor_Sample_Barcode values written to variants.maf
+my $header_done = 0;    # the output header is written only once
+
+open my $mafout, '>', 'variants.maf' or die "Cannot write variants.maf: $!";
+foreach my $maf (@maffiles) {
+    open my $in, '<', $maf or die "Cannot read $maf: $!";
+    while (my $line = <$in>) {
+        chomp($line);
+        if ($line =~ m/^#/) {
+            # Comment lines are copied only while they can still precede
+            # the header, i.e. not from the second and later files.
+            print {$mafout} $line, "\n" unless $header_done;
+        }
+        elsif ($line =~ m/(?:^|\t)Hugo_Symbol(?:\t|\z)/i) {
+            @mafcols = split(/\t/, $line);
+            # One header for the whole output, however many inputs.
+            print {$mafout} join("\t", @mincols), "\n" unless $header_done;
+            $header_done = 1;
+        }
+        else {
+            my @row = split(/\t/, $line);
+            my %hash;
+            foreach my $i (0..$#mafcols) {
+                # Only a missing field becomes ''; a literal 0 is kept.
+                $hash{$mafcols[$i]} = defined $row[$i] ? $row[$i] : '';
+            }
+            my $class  = $hash{Variant_Classification};
+            my $filter = $hash{FILTER};
+            next if (defined $class && $class =~ $skip_class);
+            next unless (defined $filter && $filter =~ m/PASS/);
+            $mafids{$hash{Tumor_Sample_Barcode}} = 1
+                if defined $hash{Tumor_Sample_Barcode};
+            # A column the input lacks is written as an empty field.
+            my @newline = map { defined $hash{$_} ? $hash{$_} : '' } @mincols;
+            print {$mafout} join("\t", @newline), "\n";
+        }
+    }
+    close $in;
+}
+close $mafout or die "Cannot close variants.maf: $!";
+
+# Case list of the samples with at least one row in variants.maf.
+mkdir 'case_lists' unless -d 'case_lists';
+open my $seqd, '>', 'case_lists/sequenced.txt'
+    or die "Cannot write case_lists/sequenced.txt: $!";
+# Every entry is written as "key: value", case_list_ids included.
+print {$seqd} join("\n","cancer_study_identifier: $opt{prjid}",
+                   "stable_id: $opt{prjid}_sequenced",
+                   "case_list_name: Sequenced",
+                   "case_list_description: Sequenced Samples",
+                   "case_list_ids: ".join("\t",sort keys %mafids)),"\n";
+close $seqd or die "Cannot close case_lists/sequenced.txt: $!";
diff --git a/cbioportal/prepare_cbioportal.sh b/cbioportal/prepare_cbioportal.sh
new file mode 100644
--- /dev/null
+++ b/cbioportal/prepare_cbioportal.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Build fpkm.txt and the *.cna.txt tables in the current directory from
+# the per-sample pipeline outputs.
+
+module load bedtools/2.29.0
+# -f: do not fail when the link is left over from an earlier run.
+ln -sf /project/shared/bicf_workflow_ref/human/grch38_cloud/rnaref/genenames.txt .
+
+perl /project/PHG/PHG_Clinical/devel/clinseq_workflows/process_scripts/genect_rnaseq/concat_cts.pl -o ./ */*/*.cts
+perl /project/PHG/PHG_Clinical/devel/clinseq_workflows/process_scripts/genect_rnaseq/concat_fpkm.pl -o ./ */*/*.fpkm.txt
+# The input is a pipe, so plain -p is enough; -i only affects named files.
+cut -f 2,4- countTable.fpkm.txt | perl -pe 's/SYMBOL/Hugo_Symbol/g' > fpkm.txt
+
+# Annotate each CNV segment with the features of tempus.genes.hg19.bed it
+# overlaps.  Column 5 of the CNV table goes to the continuous table and
+# column 12 to the discrete one; both keep the segment coordinates, that
+# value, and column 8 of the intersect output.
+for cnv in ../*/CNV/*.txt; do
+    [ -e "$cnv" ] || continue    # the glob matched nothing
+    # Sample name = the directory that holds the CNV folder.
+    sample=$(basename "$(dirname "$(dirname "$cnv")")")
+    cut -f 1-3,5 "$cnv" | bedtools intersect -wao -a stdin -b tempus.genes.hg19.bed | cut -f 1-4,8 > "${sample}.cnv_continuous.txt"
+    cut -f 1-3,12 "$cnv" | bedtools intersect -wao -a stdin -b tempus.genes.hg19.bed | cut -f 1-4,8 > "${sample}.cnv_discreet.txt"
+done
+
+perl /project/PHG/PHG_Clinical/devel/clinseq_workflows/process_scripts/cbioportal/concat_cnvs.pl
diff --git a/cbioportal/sym2entrez.pl b/cbioportal/sym2entrez.pl
new file mode 100644
--- /dev/null
+++ b/cbioportal/sym2entrez.pl
@@ -0,0 +1,105 @@
+#!/usr/bin/perl -w
+#sym2entrez.pl
+#
+# Usage: sym2entrez.pl -d DATADIR -g GBUILDDIR FUSION_FILE [FUSION_FILE ...]
+#
+# Writes variants.fusion.txt: one row per data row of every fusion file,
+# reported for that row's LeftGene.  Gene ids are looked up in
+#   DATADIR/gene_info.human.txt
+#   GBUILDDIR/genenames.txt
+#   DATADIR/gene2ensembl.human.txt
+# The sample name is the part of the fusion file name before the first '.'.
+
+use strict;
+use warnings;
+use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+use File::Basename;
+
+my %opt;
+GetOptions(\%opt,'datadir|d=s','gbuilddir|g=s');
+die "Usage: $0 -d DATADIR -g GBUILDDIR FUSION_FILE [FUSION_FILE ...]\n"
+    unless defined $opt{datadir} && defined $opt{gbuilddir};
+
+my %entrez;     # lookup key => gene id written to Entrez_Gene_Id
+my %entgene;    # 'chr'.column 7 => { column 3 => column 2 } of gene_info
+
+my $gene_info = "$opt{datadir}/gene_info.human.txt";
+open my $ent_gene, '<', $gene_info or die "Cannot read $gene_info: $!";
+my $ent_header = <$ent_gene>;     # header line, not used
+while (my $line = <$ent_gene>) {
+    chomp $line;
+    my @row = split(/\t/, $line);
+    next unless @row > 6;    # too short to hold the columns used here
+    $entgene{'chr'.$row[6]}{$row[2]} = $row[1];
+}
+close $ent_gene;
+
+# genenames.txt: column 4 becomes a key of %entrez, resolved through
+# columns 1 and 5 in %entgene.
+# NOTE(review): these keys look like Ensembl ids (as do the gene2ensembl
+# keys below) while the output loop looks up LeftGene - confirm that
+# LeftGene holds the same kind of identifier.
+my $genenames = "$opt{gbuilddir}/genenames.txt";
+open my $ent_names, '<', $genenames or die "Cannot read $genenames: $!";
+my $gn_header = <$ent_names>;     # header line, not used
+while (my $line = <$ent_names>) {
+    chomp $line;
+    my @row = split(/\t/, $line);
+    next unless @row > 4;
+    $entrez{$row[3]} = $entgene{$row[0]}{$row[4]};
+}
+close $ent_names;
+
+# gene2ensembl: column 3 => column 2; read last, so it overrides the
+# entries made from genenames.txt.
+my $gene2ens = "$opt{datadir}/gene2ensembl.human.txt";
+open my $ent_ens, '<', $gene2ens or die "Cannot read $gene2ens: $!";
+my $ens_header = <$ent_ens>;      # header line, not used
+while (my $line = <$ent_ens>) {
+    chomp $line;
+    my @row = split(/\t/, $line);
+    next unless @row > 2;
+    $entrez{$row[2]} = $row[1];
+}
+close $ent_ens;
+
+my @fusion_files = @ARGV;
+open my $out, '>', 'variants.fusion.txt'
+    or die "Cannot write variants.fusion.txt: $!";
+# The header ends with its own newline so that the first data row does
+# not run on from it.
+print {$out} join("\t",'Hugo_Symbol','Entrez_Gene_Id','Center',
+                  'Tumor_Sample_Barcode','Fusion','DNA_support',
+                  'RNA_support','Method','Frame','Fusion_Status'),"\n";
+
+foreach my $ffile (@fusion_files) {
+    open my $ff, '<', $ffile or die "Cannot read $ffile: $!";
+    my $head = <$ff>;
+    unless (defined $head) {    # empty file: nothing to report
+        close $ff;
+        next;
+    }
+    chomp($head);
+    my @colnames = split(/\t/,$head);
+    my $fname = basename($ffile);
+    my $sample = (split(/\./,$fname))[0];
+    while (my $line = <$ff>) {
+        chomp($line);
+        my @row = split(/\t/,$line);
+        my %hash;
+        # Indexed by header column, so extra trailing fields are ignored.
+        foreach my $i (0..$#colnames) {
+            $hash{$colnames[$i]} = $row[$i];
+        }
+        my $gene   = $hash{LeftGene};
+        my $id     = defined $gene ? $entrez{$gene} : undef;
+        my $status = $hash{SomaticStatus};
+        my @fields = ($gene,$id,'',$sample,$hash{FusionName},
+                      $hash{DNAReads},$hash{RNAReads},'StarFusion',
+                      $hash{FusionType},defined $status ? uc($status) : '');
+        # Values the input does not provide are written as empty fields.
+        print {$out} join("\t", map { defined $_ ? $_ : '' } @fields), "\n";
+    }
+    close $ff;
+}
+close $out or die "Cannot close variants.fusion.txt: $!";
diff --git a/cbioportal/translocation2cbioportal.pl b/cbioportal/translocation2cbioportal.pl
new file mode 100644
--- /dev/null
+++ b/cbioportal/translocation2cbioportal.pl
@@ -0,0 +1,77 @@
+#!/usr/bin/perl -w
+#translocation2cbioportal.pl
+#
+# Usage: translocation2cbioportal.pl -d DATADIR FUSION_FILE [FUSION_FILE ...]
+#
+# Writes variants.fusion.txt: for every fusion row whose Filter is PASS,
+# one row for LeftGene and one for RightGene, each only when the gene has
+# an id in DATADIR/gene_info.human.txt.
+# The sample name is the part of the fusion file name before the first '.'.
+
+use strict;
+use warnings;
+use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+use File::Basename;
+
+my %opt;
+# -g/--gbuilddir is still accepted so existing command lines keep working;
+# this script does not read it.
+GetOptions(\%opt,'datadir|d=s','gbuilddir|g=s');
+die "Usage: $0 -d DATADIR FUSION_FILE [FUSION_FILE ...]\n" unless defined $opt{datadir};
+
+my %entrez;    # column 3 of gene_info => column 2 (written as Entrez_Gene_Id)
+my $gene_info = "$opt{datadir}/gene_info.human.txt";
+open my $ent_gene, '<', $gene_info or die "Cannot read $gene_info: $!";
+my $ent_header = <$ent_gene>;    # header line, not used
+while (my $line = <$ent_gene>) {
+    chomp $line;
+    my @row = split(/\t/, $line);
+    next unless @row > 2;
+    $entrez{$row[2]} = $row[1];
+}
+close $ent_gene;
+
+my @fusion_files = @ARGV;
+open my $out, '>', 'variants.fusion.txt'
+    or die "Cannot write variants.fusion.txt: $!";
+# The header ends with its own newline so that the first data row does
+# not run on from it.
+print {$out} join("\t",'Hugo_Symbol','Entrez_Gene_Id','Center',
+                  'Tumor_Sample_Barcode','Fusion','DNA_support',
+                  'RNA_support','Method','Frame','Fusion_Status'),"\n";
+
+foreach my $ffile (@fusion_files) {
+    open my $ff, '<', $ffile or die "Cannot read $ffile: $!";
+    my $head = <$ff>;
+    unless (defined $head) {    # empty file: nothing to report
+        close $ff;
+        next;
+    }
+    chomp($head);
+    my @colnames = split(/\t/,$head);
+    my $fname = basename($ffile);
+    my $sample = (split(/\./,$fname))[0];
+    while (my $line = <$ff>) {
+        chomp($line);
+        my @row = split(/\t/,$line);
+        my %hash;
+        # Indexed by header column, so extra trailing fields are ignored.
+        foreach my $i (0..$#colnames) {
+            $hash{$colnames[$i]} = $row[$i];
+        }
+        next unless defined $hash{Filter} && $hash{Filter} eq 'PASS';
+        my $status = defined $hash{SomaticStatus} ? uc($hash{SomaticStatus}) : '';
+        # The same row is written once per partner gene.
+        foreach my $side ('LeftGene','RightGene') {
+            my $gene = $hash{$side};
+            next unless defined $gene && $entrez{$gene};
+            my @fields = ($gene,$entrez{$gene},'',$sample,$hash{FusionName},
+                          $hash{DNAReads},$hash{RNAReads},'StarFusion',
+                          $hash{FusionType},$status);
+            # Values the input does not provide are written as empty fields.
+            print {$out} join("\t", map { defined $_ ? $_ : '' } @fields), "\n";
+        }
+    }
+    close $ff;
+}
+close $out or die "Cannot close variants.fusion.txt: $!";
diff --git a/update_reference_data/gencode_genename.pl b/update_reference_data/gencode_genename.pl
new file mode 100644
--- /dev/null
+++ b/update_reference_data/gencode_genename.pl
@@ -0,0 +1,56 @@
+#!/usr/bin/perl -w
+#gencode_genename.pl
+#
+# Usage: gencode_genename.pl GTF [KEEPFILE]
+#
+# Writes genenames.txt with one row per 'gene' feature of the GTF: chrom,
+# start, end, ensembl (gene_id with the first ".DIGITS" removed), symbol
+# (gene_name) and type (gene_type).  When a keep file is given, holding one
+# gene_name per line, only the genes listed in it are written.
+
+use strict;
+use warnings;
+
+my $gtf = shift @ARGV;
+my $keepfile = shift @ARGV;
+die "Usage: $0 GTF [KEEPFILE]\n" unless defined $gtf;
+
+my %keep;    # gene_name values to keep; filled only when a keep file is given
+if ($keepfile) {
+    open my $kp, '<', $keepfile or die "Cannot read $keepfile: $!";
+    while (my $line = <$kp>) {
+        chomp($line);
+        $keep{$line} = 1;
+    }
+    close $kp;
+}
+open my $out, '>', 'genenames.txt' or die "Cannot write genenames.txt: $!";
+print {$out} join("\t",'chrom','start','end','ensembl','symbol','type'),"\n";
+open my $gcode, '<', $gtf or die "Cannot read $gtf: $!";
+while (my $line = <$gcode>) {
+    chomp($line);
+    next if ($line =~ m/^#/);
+    my ($chrom,$source,$feature,$start,$end,$filter,$phase,$frame,$info) =
+        split(/\t/,$line);
+    next unless (defined $feature && $feature eq 'gene');
+    $info = '' unless defined $info;
+    $info =~ s/\"//g;
+    my %hash;
+    # $attr, not $a: $a is the package variable that sort uses.
+    foreach my $attr (split(/;\s*/,$info)) {
+        # Limit 2 keeps a value that itself contains a space in one piece.
+        my ($key,$val) = split(/ /,$attr,2);
+        next unless defined $key;
+        $hash{$key} = $val;
+    }
+    # An attribute the record lacks is written as an empty field.
+    my ($id,$name,$type) =
+        map { defined $_ ? $_ : '' } @hash{qw(gene_id gene_name gene_type)};
+    $id =~ s/\.\d+//;
+    if ($keepfile) {
+        next unless $keep{$name};
+    }
+    print {$out} join("\t",$chrom,$start,$end,$id,$name,$type),"\n";
+}
+close $gcode;
+close $out or die "Cannot close genenames.txt: $!";