adding cbioportal general code

0002b743 · Brandi Cantarel · 4b2e0039 · 0002b743 · 0002b743 · 0002b743
Commit 0002b743 authored 4 years ago by Brandi Cantarel
--- a/cbioportal/concat_cnvs.pl
+++ b/cbioportal/concat_cnvs.pl
+#!/usr/bin/perl -w
+#concat_cnvs.pl
+#
+# Combines the per-sample CNV tables found in the current directory into two
+# gene-by-sample matrices:
+#   *cnv_discreet.txt   -> discreet.cna.txt
+#   *cnv_continuous.txt -> continuous.cna.txt
+# Input rows are tab-delimited; the 4th column is the value and the 5th column
+# is the gene.  The sample name is the part of the file name before the first
+# '.'.
+
+use strict;
+
+concat_cnv_tables('cnv_discreet.txt',   'discreet.cna.txt');
+concat_cnv_tables('cnv_continuous.txt', 'continuous.cna.txt');
+
+# Reads every file in the current directory whose name ends in $suffix and
+# writes one matrix to $outfile: a 'Hugo_Symbol' header row followed by one
+# row per gene with one column per sample (samples sorted by name).
+#   $suffix  - file name ending selecting the inputs, e.g. 'cnv_discreet.txt'
+#   $outfile - path of the matrix to create (overwritten if present)
+# Dies if an input cannot be read or the output cannot be written.
+sub concat_cnv_tables {
+    my ($suffix, $outfile) = @_;
+    my %cts;       # gene => { sample => value }
+    my %sample;    # every sample seen, including ones without usable rows
+
+    my @files = glob("*$suffix");
+    foreach my $file (sort @files) {
+        my ($sample) = split(/\./, $file);
+        $sample{$sample} = 1;
+        open(my $in, '<', $file) or die "Cannot read $file: $!";
+        while (my $line = <$in>) {
+            chomp($line);
+            my (undef, undef, undef, $ct, $gene) = split(/\t/, $line);
+            # Rows without a gene column cannot be placed in the matrix.
+            next unless (defined($gene) && length($gene));
+            # When a gene occurs on several rows the last one wins.
+            $cts{$gene}{$sample} = $ct;
+        }
+        close($in);
+    }
+
+    my @samples = sort { $a cmp $b } keys %sample;
+    open(my $out, '>', $outfile) or die "Cannot write $outfile: $!";
+    print {$out} join("\t", 'Hugo_Symbol', @samples), "\n";
+    # Genes are sorted so that repeated runs give identical files.
+    foreach my $gene (sort keys %cts) {
+        my @line;
+        foreach my $sid (@samples) {
+            my $value = $cts{$gene}{$sid};
+            # Only a missing or empty value falls back to the default of 2;
+            # a real value of 0 must be kept, not replaced.
+            $value = 2 unless (defined($value) && length($value));
+            push @line, $value;
+        }
+        print {$out} join("\t", $gene, @line), "\n";
+    }
+    close($out) or die "Cannot finish writing $outfile: $!";
+    return;
+}
--- a/cbioportal/filter_maf.pl
+++ b/cbioportal/filter_maf.pl
+#!/usr/bin/perl -w
+#filter_maf.pl
+#
+# Usage: filter_maf.pl -p <prjid> <file.maf> [<file.maf> ...]
+#
+# Concatenates the given MAF files into variants.maf, keeping only the columns
+# listed in @mincols and only rows whose FILTER contains PASS and whose
+# Variant_Classification is not one of the excluded classes.  The barcodes of
+# all kept rows are then written to case_lists/sequenced.txt.
+
+use strict;
+use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+my %opt = ();
+my $results = GetOptions (\%opt,'prjid|p=s');
+
+my @maffiles = @ARGV;
+
+my $prjid = $opt{prjid};
+unless (defined $prjid) {
+    warn "No --prjid/-p given; the case list will have an empty study identifier\n";
+    $prjid = '';
+}
+
+my @mincols = ('Hugo_Symbol','Entrez_Gene_Id','Variant_Classification',
+               'Tumor_Sample_Barcode','HGVSp_Short','Protein_position',
+               'SWISSPROT');
+
+my @mafcols;             # column names of the most recently read header row
+my %mafids;              # Tumor_Sample_Barcode values of the rows kept
+my $header_written = 0;  # the output gets exactly one column header row
+
+open(my $mafout, '>', 'variants.maf') or die "Cannot write variants.maf: $!";
+foreach my $maf (@maffiles) {
+    open(my $in, '<', $maf) or die "Cannot read $maf: $!";
+    while (my $line = <$in>) {
+        chomp($line);
+        if ($line =~ m/^#/) {
+            # Only lines that START with '#' are comments; a '#' inside a data
+            # row must not bypass the filters below.  Comments are copied only
+            # ahead of the header so none end up between data rows.
+            print {$mafout} $line, "\n" unless $header_written;
+        }
+        elsif ($line =~ m/Hugo_Symbol/i) {
+            # Every input may order its columns differently, so the names are
+            # re-read per file, but the header is printed only once.
+            @mafcols = split(/\t/, $line);
+            next if $header_written;
+            print {$mafout} join("\t", @mincols), "\n";
+            $header_written = 1;
+        }
+        else {
+            my @row = split(/\t/, $line);
+            my %hash;
+            foreach my $i (0..$#mafcols) {
+                # Only absent fields become ''; a literal 0 is preserved.
+                $hash{$mafcols[$i]} = defined($row[$i]) ? $row[$i] : '';
+            }
+            my $class  = defined($hash{Variant_Classification}) ? $hash{Variant_Classification} : '';
+            my $filter = defined($hash{FILTER}) ? $hash{FILTER} : '';
+            next if ($class =~ m/Silent|Intron|UTR|Flank|IGR|RNA|Splice_Region/);
+            next unless ($filter =~ m/PASS/);
+            my $barcode = $hash{Tumor_Sample_Barcode};
+            $mafids{$barcode} = 1 if (defined($barcode) && length($barcode));
+            # Columns missing from the input are emitted as empty fields.
+            my @newline = map { defined($hash{$_}) ? $hash{$_} : '' } @mincols;
+            print {$mafout} join("\t", @newline), "\n";
+        }
+    }
+    close($in);
+}
+close($mafout) or die "Cannot finish writing variants.maf: $!";
+
+# Create the output directory on first use; if that fails the open below dies.
+mkdir('case_lists') unless -d 'case_lists';
+open(my $seqd, '>', 'case_lists/sequenced.txt')
+    or die "Cannot write case_lists/sequenced.txt: $!";
+print {$seqd} join("\n","cancer_study_identifier: $prjid",
+                   "stable_id: ${prjid}_sequenced",
+                   "case_list_name: Sequenced",
+                   "case_list_description: Sequenced Samples",
+                   "case_list_ids:".join("\t", sort keys %mafids)),"\n";
+close($seqd) or die "Cannot finish writing case_lists/sequenced.txt: $!";
--- a/cbioportal/prepare_cbioportal.sh
+++ b/cbioportal/prepare_cbioportal.sh
+#!/bin/bash
+
+module load bedtools/2.29.0
+ln -s /project/shared/bicf_workflow_ref/human/grch38_cloud/rnaref/genenames.txt .
+perl /project/PHG/PHG_Clinical/devel/clinseq_workflows/process_scripts/genect_rnaseq/concat_cts.pl -o ./ */*/*.cts
+perl /project/PHG/PHG_Clinical/devel/clinseq_workflows/process_scripts/genect_rnaseq/concat_fpkm.pl -o ./ */*/*.fpkm.txt
+cut -f 2,4- countTable.fpkm.txt |perl -pi -e 's/SYMBOL/Hugo_Symbol/g' > fpkm.txt
+
+ls ../*/CNV/*.txt | awk -F '/' '{print "cut -f 1-3,5",$0,"|bedtools intersect -wao -a stdin -b tempus.genes.hg19.bed | cut -f 1-3,4,8 >",$2".cnv_continuous.txt"}' |sh
+ls ../*/CNV/*.txt | awk -F '/' '{print "cut -f 1-3,12",$0,"|bedtools intersect -wao -a stdin -b tempus.genes.hg19.bed | cut -f 1-3,4,8 >",$2".cnv_discreet.txt"}' |sh
+ perl /project/PHG/PHG_Clinical/devel/clinseq_workflows/process_scripts/cbioportal/concat_cnvs.pl
--- a/cbioportal/sym2entrez.pl
+++ b/cbioportal/sym2entrez.pl
+#!/usr/bin/perl -w
+#sym2entrez.pl
+#
+# Usage: sym2entrez.pl -d <datadir> -g <gbuilddir> <fusion_file> [...]
+#
+# Writes variants.fusion.txt with one row per input fusion row, keyed on the
+# LeftGene column.  The Entrez id lookup is built from
+#   <datadir>/gene_info.human.txt, <gbuilddir>/genenames.txt and
+#   <datadir>/gene2ensembl.human.txt.
+# The sample name is the part of each fusion file name before the first '.'.
+
+use strict;
+use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+use File::Basename;
+
+my %opt;
+my $results= GetOptions (\%opt,'datadir|d=s','gbuilddir|g=s');
+die "Usage: $0 -d <datadir> -g <gbuilddir> <fusion_file> [...]\n"
+    unless (defined($opt{datadir}) && defined($opt{gbuilddir}));
+
+my %entrez;     # key from genenames.txt col 4 / gene2ensembl col 3 => id
+my %entgene;    # 'chr'.<col 7> => { <col 3> => <col 2> } of gene_info
+
+my $gene_info = "$opt{datadir}/gene_info.human.txt";
+open(my $ent_gene, '<', $gene_info) or die "Cannot read $gene_info: $!";
+my $ent_header = <$ent_gene>;    # first line is a header, not data
+while (my $line = <$ent_gene>) {
+    chomp $line;
+    my @row = split(/\t/, $line);
+    next unless @row > 6;
+    $entgene{'chr'.$row[6]}{$row[2]} = $row[1];
+}
+close($ent_gene);
+
+my $genenames = "$opt{gbuilddir}/genenames.txt";
+open(my $names, '<', $genenames) or die "Cannot read $genenames: $!";
+my $gn_header = <$names>;
+while (my $line = <$names>) {
+    chomp $line;
+    my @row = split(/\t/, $line);
+    next unless @row > 4;
+    # Only record genes gene_info actually knows on this chromosome, instead
+    # of storing undefined ids.
+    next unless exists $entgene{$row[0]};
+    my $id = $entgene{$row[0]}{$row[4]};
+    $entrez{$row[3]} = $id if defined $id;
+}
+close($names);
+
+my $gene2ens = "$opt{datadir}/gene2ensembl.human.txt";
+open(my $ens, '<', $gene2ens) or die "Cannot read $gene2ens: $!";
+my $ens_header = <$ens>;
+while (my $line = <$ens>) {
+    chomp $line;
+    my @row = split(/\t/, $line);
+    next unless @row > 2;
+    # Read last, so these ids take precedence over the genenames.txt ones.
+    $entrez{$row[2]} = $row[1];
+}
+close($ens);
+
+my @fusion_files = @ARGV;
+open(my $out, '>', 'variants.fusion.txt')
+    or die "Cannot write variants.fusion.txt: $!";
+# The header row needs its own line terminator, otherwise the first data row
+# is appended to it.
+print {$out} join("\t",'Hugo_Symbol','Entrez_Gene_Id','Center',
+                  'Tumor_Sample_Barcode','Fusion','DNA_support',
+                  'RNA_support','Method','Frame','Fusion_Status'),"\n";
+
+foreach my $ffile (@fusion_files) {
+    open(my $ff, '<', $ffile) or die "Cannot read $ffile: $!";
+    my $head = <$ff>;
+    unless (defined $head) {    # empty file: nothing to convert
+        close($ff);
+        next;
+    }
+    chomp($head);
+    my @colnames = split(/\t/, $head);
+    my $fname = basename($ffile);
+    my $sample = (split(/\./, $fname))[0];
+    while (my $line = <$ff>) {
+        chomp($line);
+        my @row = split(/\t/, $line);
+        my %hash;
+        # Short rows (split drops trailing empty fields) yield '' values.
+        @hash{@colnames} = map { defined($row[$_]) ? $row[$_] : '' } 0..$#colnames;
+        my ($gene, $fusion, $dna, $rna, $frame, $status) =
+            map { defined($hash{$_}) ? $hash{$_} : '' }
+            qw(LeftGene FusionName DNAReads RNAReads FusionType SomaticStatus);
+        my $entrez_id = defined($entrez{$gene}) ? $entrez{$gene} : '';
+        print {$out} join("\t", $gene, $entrez_id, '', $sample, $fusion, $dna,
+                          $rna, 'StarFusion', $frame, uc($status)), "\n";
+    }
+    close($ff);
+}
+close($out) or die "Cannot finish writing variants.fusion.txt: $!";
--- a/cbioportal/translocation2cbioportal.pl
+++ b/cbioportal/translocation2cbioportal.pl
+#!/usr/bin/perl -w
+#translocation2cbioportal.pl
+#
+# Usage: translocation2cbioportal.pl -d <datadir> <fusion_file> [...]
+#
+# Writes variants.fusion.txt from the given fusion tables.  Only rows whose
+# Filter column is exactly PASS are used, and each of them produces one output
+# row for LeftGene and one for RightGene, provided the gene has an id in
+# <datadir>/gene_info.human.txt.  The sample name is the part of each fusion
+# file name before the first '.'.
+
+use strict;
+use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+use File::Basename;
+
+my %opt;
+# --gbuilddir is accepted but not used by this script.
+my $results= GetOptions (\%opt,'datadir|d=s','gbuilddir|g=s');
+die "Usage: $0 -d <datadir> <fusion_file> [...]\n"
+    unless defined($opt{datadir});
+
+my %entrez;    # gene_info column 3 => gene_info column 2
+my $gene_info = "$opt{datadir}/gene_info.human.txt";
+open(my $ent_gene, '<', $gene_info) or die "Cannot read $gene_info: $!";
+my $ent_header = <$ent_gene>;    # first line is a header, not data
+while (my $line = <$ent_gene>) {
+    chomp $line;
+    my @row = split(/\t/, $line);
+    next unless @row > 2;
+    $entrez{$row[2]} = $row[1];
+}
+close($ent_gene);
+
+my @fusion_files = @ARGV;
+open(my $out, '>', 'variants.fusion.txt')
+    or die "Cannot write variants.fusion.txt: $!";
+# The header row needs its own line terminator, otherwise the first data row
+# is appended to it.
+print {$out} join("\t",'Hugo_Symbol','Entrez_Gene_Id','Center',
+                  'Tumor_Sample_Barcode','Fusion','DNA_support',
+                  'RNA_support','Method','Frame','Fusion_Status'),"\n";
+
+foreach my $ffile (@fusion_files) {
+    open(my $ff, '<', $ffile) or die "Cannot read $ffile: $!";
+    my $head = <$ff>;
+    unless (defined $head) {    # empty file: nothing to convert
+        close($ff);
+        next;
+    }
+    chomp($head);
+    my @colnames = split(/\t/, $head);
+    my $fname = basename($ffile);
+    my $sample = (split(/\./, $fname))[0];
+    while (my $line = <$ff>) {
+        chomp($line);
+        my @row = split(/\t/, $line);
+        my %hash;
+        # Short rows (split drops trailing empty fields) yield '' values.
+        @hash{@colnames} = map { defined($row[$_]) ? $row[$_] : '' } 0..$#colnames;
+        next unless (defined($hash{Filter}) && $hash{Filter} eq 'PASS');
+        # Fields shared by the LeftGene and RightGene rows of this fusion.
+        my ($fusion, $dna, $rna, $frame, $status) =
+            map { defined($hash{$_}) ? $hash{$_} : '' }
+            qw(FusionName DNAReads RNAReads FusionType SomaticStatus);
+        foreach my $side ('LeftGene', 'RightGene') {
+            my $gene = $hash{$side};
+            # Partners without a known id are left out.
+            next unless (defined($gene) && $entrez{$gene});
+            print {$out} join("\t", $gene, $entrez{$gene}, '', $sample,
+                              $fusion, $dna, $rna, 'StarFusion', $frame,
+                              uc($status)), "\n";
+        }
+    }
+    close($ff);
+}
+close($out) or die "Cannot finish writing variants.fusion.txt: $!";
--- a/update_reference_data/gencode_genename.pl
+++ b/update_reference_data/gencode_genename.pl
+#!/usr/bin/perl -w
+#gencode_genename.pl
+#
+# Usage: gencode_genename.pl <annotation.gtf> [<keep_list.txt>]
+#
+# Writes genenames.txt (chrom, start, end, ensembl, symbol, type) with one row
+# per 'gene' feature of the GTF.  When a keep list is given (one gene name per
+# line) only genes whose gene_name is on the list are written.
+
+use strict;
+
+my $gtf = shift @ARGV;
+my $keepfile = shift @ARGV;
+die "Usage: $0 <annotation.gtf> [<keep_list.txt>]\n" unless defined($gtf);
+
+my %keep;    # gene names to retain; only consulted when a keep list is given
+if ($keepfile) {
+    open(my $kp, '<', $keepfile) or die "Cannot read $keepfile: $!";
+    while (my $line = <$kp>) {
+        chomp($line);
+        $keep{$line} = 1;
+    }
+    close($kp);
+}
+
+open(my $out, '>', 'genenames.txt') or die "Cannot write genenames.txt: $!";
+print {$out} join("\t",'chrom','start','end','ensembl','symbol','type'),"\n";
+open(my $gcode, '<', $gtf) or die "Cannot read $gtf: $!";
+while (my $line = <$gcode>) {
+    chomp($line);
+    next if ($line =~ m/^#/);
+    my ($chrom,$source,$feature,$start,$end,$score,$strand,$frame,$info) =
+        split(/\t/,$line);
+    next unless (defined($feature) && $feature eq 'gene');
+    $info = '' unless defined($info);
+    $info =~ s/\"//g;
+    my %hash;
+    # A lexical loop variable; $a is reserved for sort blocks.
+    foreach my $attr (split(/;\s*/,$info)) {
+        # Limit of 2: everything after the key is the value, so values that
+        # contain spaces are not cut short.
+        my ($key,$val) = split(/ /,$attr,2);
+        next unless (defined($key) && length($key));
+        $hash{$key} = $val;
+    }
+    # Remove the ".<digits>" suffix from the gene id.
+    $hash{gene_id} =~ s/\.\d+// if defined($hash{gene_id});
+    if ($keepfile) {
+        next unless (defined($hash{gene_name}) && $keep{$hash{gene_name}});
+    }
+    # Attributes missing from the record are written as empty fields.
+    my ($id,$name,$type) = map { defined($hash{$_}) ? $hash{$_} : '' }
+        qw(gene_id gene_name gene_type);
+    print {$out} join("\t",$chrom,$start,$end,$id,$name,$type),"\n";
+}
+close($gcode);
+close($out) or die "Cannot finish writing genenames.txt: $!";