From 2fa59e746e589e7434e512470457095018b58299 Mon Sep 17 00:00:00 2001
From: Brandi Cantarel <brandi.cantarel@utsouthwestern.edu>
Date: Wed, 29 Jan 2020 08:59:34 -0600
Subject: [PATCH] update CNVkit for SNPs using VCF

---
 variants/cnvkit.sh        |  22 +++----
 variants/filter_cnvkit.pl | 117 +++++++++++++++++---------------------
 variants/formatVcfCNV.pl  |   4 +-
 3 files changed, 65 insertions(+), 78 deletions(-)

diff --git a/variants/cnvkit.sh b/variants/cnvkit.sh
index 0637a7c..b8f1da3 100755
--- a/variants/cnvkit.sh
+++ b/variants/cnvkit.sh
@@ -59,15 +59,9 @@ echo "${targets}antitargets.bed"
 echo "${normals}"
 
 source /etc/profile.d/modules.sh
-module load cnvkit/0.9.5 bedtools/2.26.0 samtools/gcc/1.8 bcftools/gcc/1.8
-unset DISPLAY
+module load cnvkit/0.9.5 bedtools/2.26.0 samtools/gcc/1.8 bcftools/gcc/1.8 java/oracle/jdk1.8.0_171 snpeff/4.3q
 
-if [[ $idtsnp == 1 ]]
-then
-    samtools index ${sbam}
-    bcftools mpileup --threads 10 --gvcf 10 -A -a 'INFO/AD,INFO/ADF,INFO/ADR,FORMAT/DP,FORMAT/SP,FORMAT/AD,FORMAT/ADF,FORMAT/ADR' -Ou -Q20 -d 1000000 -L 1000000 -C50 -f ${reffa} ${sbam} | bcftools call --threads 10 -vmO v -o common_variants.vcf -T ${index_path}/IDT_snps.hg38.bed
-    $baseDir/formatVcfCNV.pl cnvkit_common common_variants.vcf
-fi
+unset DISPLAY
 
 cnvkit.py coverage ${sbam} ${targets}targets.bed -o ${pair_id}.targetcoverage.cnn
 cnvkit.py coverage ${sbam} ${targets}antitargets.bed -o ${pair_id}.antitargetcoverage.cnn
@@ -76,11 +70,19 @@ cnvkit.py segment ${pair_id}.cnr -o ${pair_id}.cns
 
 if [[ $idtsnp == 1 ]]
 then
+    samtools index ${sbam}
+    java -jar /cm/shared/apps/gatk/3.8/target/package/GenomeAnalysisTK.jar -T UnifiedGenotyper -R ${reffa} --output_mode EMIT_ALL_SITES -L ${index_path}/IDT_snps.hg38.bed -o common_variants.vcf -glm BOTH -dcov 10000 -I ${sbam}
+    $baseDir/formatVcfCNV.pl cnvkit_common common_variants.vcf
+    echo -e "CHROM\tPOS\tAO\tRO\tDP\tMAF" > ${pair_id}.ballelefreq.txt
+    java -jar $SNPEFF_HOME/SnpSift.jar extractFields cnvkit_common.vcf CHROM POS GEN[0].AO GEN[0].RO GEN[0].DP |grep -v CHROM | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$3/$5}' >>  ${pair_id}.ballelefreq.txt
+    
     cnvkit.py call --filter cn ${pair_id}.cns -v cnvkit_common.vcf -o ${pair_id}.call.cns
+    cnvkit.py scatter ${pair_id}.cnr -s ${pair_id}.call.cns -t --segment-color "blue" -o ${pair_id}.cnv.scatter.pdf -v cnvkit_common.vcf 
+    
 else 
     cnvkit.py call --filter cn ${pair_id}.cns -o ${pair_id}.call.cns
+    cnvkit.py scatter ${pair_id}.cnr -s ${pair_id}.call.cns -t --segment-color "blue" -o ${pair_id}.cnv.scatter.pdf
 fi
 
-cnvkit.py scatter ${pair_id}.cnr -s ${pair_id}.call.cns -t --segment-color "blue" -o ${pair_id}.cnv.scatter.pdf
 cut -f 1,2,3 ${pair_id}.call.cns | grep -v chrom | bedtools intersect -wao -b /project/shared/bicf_workflow_ref/human/GRCh38/cytoBand.txt -a stdin |cut -f 1,2,3,7 >  ${pair_id}.cytoband.bed
-perl $baseDir/filter_cnvkit.pl ${pair_id}.call.cns
+perl $baseDir/filter_cnvkit.pl -s ${pair_id}.call.cns
diff --git a/variants/filter_cnvkit.pl b/variants/filter_cnvkit.pl
index 7695f10..a30aeb1 100755
--- a/variants/filter_cnvkit.pl
+++ b/variants/filter_cnvkit.pl
@@ -1,34 +1,14 @@
 #!/usr/bin/perl -w
 #parse_cnvkit_table.pl
 
-my $refdir = '/project/shared/bicf_workflow_ref/human/GRCh38/';
-open OM, "<$refdir\/clinseq_prj/panelgenes.txt" or die $!;
-while (my $line = <OM>) {
-  chomp($line);
-  $keep{$line} = 1;
-}
-
-open ENT_ENS, "<$refdir/../gene2ensembl.human.txt" or die $!;
-my %entrez;
-my $ent_header = <ENT_ENS>;
-while (my $line = <ENT_ENS>){
-  chomp $line;
-  my @row = split(/\t/, $line);
-  $entrez{$row[2]}=$row[1];
-}
-close ENT_ENS;
-open ENT_SYM, "<$refdir/../gene_info.human.txt" or die $!;
-$ent_header = <ENT_SYM>;
-while (my $line = <ENT_SYM>){
-  chomp $line;
-  my @row = split(/\t/, $line);
-  $entrez{$row[2]}=$row[1];
-}
-close ENT_SYM;
+use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+my %opt = ();
+my $results = GetOptions (\%opt,'input|s=s','help|h');
 
-my $file = shift @ARGV;
+my $file = $opt{input};
 my $sname = (split(/\./,(split(/\//,$file))[-1]))[0];
 my $prefix = (split(/\./,$file))[0];
+
 my %cyto;
 open CYTO, "<$prefix\.cytoband.bed" or die $!;
 while (my $line = <CYTO>) {
@@ -40,89 +20,97 @@ while (my $line = <CYTO>) {
     push @{$cyto{$key}{$1}},$2;
 }
 
-open OUT, ">$prefix\.cnvcalls.txt" or die $!;
-open OUT2, ">$prefix\.cnv.answer.txt" or die $!;
-open OUT3, ">$prefix\.answerplot.cns" or die $!;
-open BIO, ">$prefix\.data_cna_discrete.cbioportal.txt" or die $!;
-open BIO2, ">$prefix\.data_cna_continuous.cbioportal.txt" or die $!;
+open OUT, ">$prefix\.cnv.answer.txt" or die $!;
+open CNSO, ">$prefix\.answerplot.cns" or die $!;
 
-print OUT join("\t","Gene","Chromosome","Start","End","Abberation Type","CN","Score"),"\n";
-print OUT3 join("\t","Chromosome","Start","End","Log2","CN"),"\n";
-print OUT2 join("\t","Gene","Chromosome","Start","End","Abberation Type","CN","Score","CytoBand"),"\n";
-print BIO join("\t","Hugo_Symbol","Entrez_Gene_Id",$sname),"\n";
-print BIO2 join("\t","Hugo_Symbol","Entrez_Gene_Id",$sname),"\n";
+print CNSO join("\t","Chromosome","Start","End","Log2","CN"),"\n";
+print OUT join("\t","Gene","Chromosome","Start","End","Abberation Type","CN","Score","CytoBand"),"\n";
 
 open CNR, "<$prefix\.cnr" or die $!;
 open CNRO, ">$prefix\.answerplot.cnr" or die $!;
 print CNRO join("\t","Gene","Chromosome","Start","End","Log2","Depth","Weight"),"\n";
 my $header = <CNR>;
+chomp($header);
+my @colnames = split(/\t/,$header);
+
 while (my $line = <CNR>) {
     chomp($line);
-    my ($chr,$start,$end,$geneids,$log2,$depth,$weight) = split(/\t/,$line);
-    my $key = $chr.":".$start."-".$end;
+    my @row = split(/\t/,$line);
+    my %hash = ();
+    foreach my $j (0..$#row) {
+	$hash{$colnames[$j]} = $row[$j];
+    }
+    my $key = $hash{chromosome}.":".$hash{start}."-".$hash{end};
+    my $geneids = $hash{gene};
     my %genes;
     if ($geneids =~ m/ensembl_gn/g) {
 	my @ids = split(/;|,/,$geneids);
 	foreach my $gid (@ids) {
 	    my ($key,$value) = split(/=/,$gid);
 	    if ($key eq 'ensembl_gn' || $key eq 'identifier') {
-		$genes{$value} = 1 if $keep{$value};
+		$genes{$value} = 1;
 	    }
 	}
     } else {
 	my @ids = split(/,/,$geneids);
 	foreach my $gid (@ids) {
-	    next if ($gid =~ /^SNP:rs\d+$/);
+	    next if ($gid =~ /^rs\d+$|^SNP:rs\d+$|^-$|Fusion/);
 	    my ($gene,@other) = split(/:/,$gid);
-	    $genes{$gene} = 1 if $keep{$gene};
+	    $genes{$gene} = 1;
 	}
     }
     foreach $gene (keys %genes) {
-	print CNRO join("\t",$gene,$chr,$start,$end,$log2,$depth,$weight),"\n"; 
+	print CNRO join("\t",$gene,$hash{chromosome},$hash{start},$hash{end},
+			$hash{log2},$hash{depth},$hash{weight}),"\n"; 
     }
 }
 
 open IN, "<$file" or die $!;
 $header = <IN>;
+chomp($header);
+@colnames = split(/\t/,$header);
+
 while (my $line = <IN>) {
     chomp($line);
-    my ($chr,$start,$end,$geneids,$log2,$cn,$depth,
-	$probes,$weight) = split(/\t/,$line);
-    next if ($chr eq 'chrX' && $cn == 1);
-    my $key = $chr.":".$start."-".$end;
+    my @row = split(/\t/,$line);
+    my %hash = ();
+    foreach my $j (0..$#row) {
+	$hash{$colnames[$j]} = $row[$j];
+    }
+    next if ($hash{chromosome} eq 'chrX' && $hash{cn} == 1);
+    my $key = $hash{chromosome}.":".$hash{start}."-".$hash{end};
+    my $geneids = $hash{gene};
     my %genes;
     if ($geneids =~ m/ensembl_gn/g) {
 	my @ids = split(/;|,/,$geneids);
 	foreach my $gid (@ids) {
 	    my ($key,$value) = split(/=/,$gid);
 	    if ($key eq 'ensembl_gn' || $key eq 'identifier') {
-		$genes{$value} = 1 if $keep{$value};
+		$genes{$value} = 1;
 	    }
 	}
     } else {
 	my @ids = split(/,/,$geneids);
 	foreach my $gid (@ids) {
-	    next if ($gid =~ /^SNP:rs\d+$/);
+	    next if ($gid =~ /^rs\d+$|^SNP:rs\d+$|^-$|Fusion/);
 	    my ($gene,@other) = split(/:/,$gid);
-	    $genes{$gene} = 1 if $keep{$gene};
+	    $genes{$gene} = 1;
 	}
     }
-    my $len = sprintf("%.1f",($end-$start)/1000);
-    print OUT3 join("\t",$chr,$start,$end,$log2,$cn),"\n";
-    next if ($cn == 2) || scalar(keys %genes) < 1;
+    my $len = sprintf("%.1f",($hash{end}-$hash{start})/1000);
+    print CNSO join("\t",$hash{chromosome},$hash{start},$hash{end},
+		    $hash{log2},$hash{cn}),"\n";
+    
+    next if ($hash{cn} == 2) || scalar(keys %genes) < 1;
     my $abtype = 'amplification';
-    $abtype = 'loss' if ($cn < 2);
-    $abtype = 'gain' if ($cn > 2 && $cn < 6);
+    $abtype = 'loss' if ($hash{cn} < 2);
+    $abtype = 'gain' if ($hash{cn} > 2 && $hash{cn} < 6);
     foreach $gene (keys %genes) {
-	$cn_cbio = $cn -2;
-	$cn_cbio = 2 if ($cn > 4);
-	print BIO join("\t",$gene,$entrez{$gene},$cn_cbio),"\n";
-        print BIO2 join("\t",$gene,$entrez{$gene},$log2),"\n";
 	my @cytoband;
-	if (@{$cyto{$key}{'p'}}) {
+	if ($cyto{$key}{'p'}) {
 	  @nums = sort {$b <=> $a} @{$cyto{$key}{'p'}};
 	  push @cytoband, 'p'.$nums[0],'p'.$nums[-1];
-	} if (@{$cyto{$key}{'q'}}) {
+	} if ($cyto{$key}{'q'}) {
 	  @nums = sort {$a <=> $b} @{$cyto{$key}{'q'}};
 	  push @cytoband, 'q'.$nums[0],'q'.$nums[-1];
 	} 
@@ -131,11 +119,8 @@ while (my $line = <IN>) {
 	}else {
 	  $cband = join("-",$cytoband[0],$cytoband[-1]);
 	}
-	    print OUT2 join("\t",$gene,$chr,$start,$end,$abtype,$cn,$weight,$cband),"\n";
-	    print OUT join("\t",$gene,$chr,$start,$end,$abtype,$cn,$weight),"\n";
-	  }
-      }
+	print OUT join("\t",$gene,$hash{chromosome},$hash{start},$hash{end},
+			$abtype,$hash{cn},$hash{weight},$cband),"\n";
+    }
+}
 close IN;
-close OUT;
-close BIO;
-close BIO2;
diff --git a/variants/formatVcfCNV.pl b/variants/formatVcfCNV.pl
index 4a60d0a..01b702f 100755
--- a/variants/formatVcfCNV.pl
+++ b/variants/formatVcfCNV.pl
@@ -43,7 +43,7 @@ while (my $line = <VCF>) {
       foreach my $i (0..$#deschead) {
 	  $gtdata{$deschead[$i]} = $gtinfo[$i];
       }
-      if ($alt eq '.') {
+      if ($alt eq '.' || $alt eq '<NON_REF>') {
 	  $gtdata{AO} = 0;
 	  $gtdata{RO} = $gtdata{DP};
 	  $gtdata{AD} = join(",", $gtdata{RO},$gtdata{AO});
@@ -69,7 +69,7 @@ while (my $line = <VCF>) {
     next if ($missingGT == scalar(@gts));
     if ($hash{END}) {
 	foreach $i ($pos..$hash{END}) {
-	    print OUT join("\t",$chrom,$i,$id,'N','N',$score,$filter,$annot,$newformat,@newgts),"\n";
+	    print OUT join("\t",$chrom,$i,$id,$ref,'.',$score,$filter,$annot,$newformat,@newgts),"\n";
 	}
     }else {
 	print OUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n";
-- 
GitLab