From 2fa59e746e589e7434e512470457095018b58299 Mon Sep 17 00:00:00 2001 From: Brandi Cantarel <brandi.cantarel@utsouthwestern.edu> Date: Wed, 29 Jan 2020 08:59:34 -0600 Subject: [PATCH] update CNVkit for SNPs using VCF --- variants/cnvkit.sh | 22 +++---- variants/filter_cnvkit.pl | 117 +++++++++++++++++--------------------- variants/formatVcfCNV.pl | 4 +- 3 files changed, 65 insertions(+), 78 deletions(-) diff --git a/variants/cnvkit.sh b/variants/cnvkit.sh index 0637a7c..b8f1da3 100755 --- a/variants/cnvkit.sh +++ b/variants/cnvkit.sh @@ -59,15 +59,9 @@ echo "${targets}antitargets.bed" echo "${normals}" source /etc/profile.d/modules.sh -module load cnvkit/0.9.5 bedtools/2.26.0 samtools/gcc/1.8 bcftools/gcc/1.8 -unset DISPLAY +module load cnvkit/0.9.5 bedtools/2.26.0 samtools/gcc/1.8 bcftools/gcc/1.8 java/oracle/jdk1.8.0_171 snpeff/4.3q -if [[ $idtsnp == 1 ]] -then - samtools index ${sbam} - bcftools mpileup --threads 10 --gvcf 10 -A -a 'INFO/AD,INFO/ADF,INFO/ADR,FORMAT/DP,FORMAT/SP,FORMAT/AD,FORMAT/ADF,FORMAT/ADR' -Ou -Q20 -d 1000000 -L 1000000 -C50 -f ${reffa} ${sbam} | bcftools call --threads 10 -vmO v -o common_variants.vcf -T ${index_path}/IDT_snps.hg38.bed - $baseDir/formatVcfCNV.pl cnvkit_common common_variants.vcf -fi +unset DISPLAY cnvkit.py coverage ${sbam} ${targets}targets.bed -o ${pair_id}.targetcoverage.cnn cnvkit.py coverage ${sbam} ${targets}antitargets.bed -o ${pair_id}.antitargetcoverage.cnn @@ -76,11 +70,19 @@ cnvkit.py segment ${pair_id}.cnr -o ${pair_id}.cns if [[ $idtsnp == 1 ]] then + samtools index ${sbam} + java -jar /cm/shared/apps/gatk/3.8/target/package/GenomeAnalysisTK.jar -T UnifiedGenotyper -R ${reffa} --output_mode EMIT_ALL_SITES -L ${index_path}/IDT_snps.hg38.bed -o common_variants.vcf -glm BOTH -dcov 10000 -I ${sbam} + $baseDir/formatVcfCNV.pl cnvkit_common common_variants.vcf + echo -e "CHROM\tPOS\tAO\tRO\tDP\tMAF" > ${pair_id}.ballelefreq.txt + java -jar $SNPEFF_HOME/SnpSift.jar extractFields cnvkit_common.vcf CHROM POS GEN[0].AO GEN[0].RO GEN[0].DP |grep -v CHROM | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$3/$5}' >> ${pair_id}.ballelefreq.txt + cnvkit.py call --filter cn ${pair_id}.cns -v cnvkit_common.vcf -o ${pair_id}.call.cns + cnvkit.py scatter ${pair_id}.cnr -s ${pair_id}.call.cns -t --segment-color "blue" -o ${pair_id}.cnv.scatter.pdf -v cnvkit_common.vcf + else cnvkit.py call --filter cn ${pair_id}.cns -o ${pair_id}.call.cns + cnvkit.py scatter ${pair_id}.cnr -s ${pair_id}.call.cns -t --segment-color "blue" -o ${pair_id}.cnv.scatter.pdf fi -cnvkit.py scatter ${pair_id}.cnr -s ${pair_id}.call.cns -t --segment-color "blue" -o ${pair_id}.cnv.scatter.pdf cut -f 1,2,3 ${pair_id}.call.cns | grep -v chrom | bedtools intersect -wao -b /project/shared/bicf_workflow_ref/human/GRCh38/cytoBand.txt -a stdin |cut -f 1,2,3,7 > ${pair_id}.cytoband.bed -perl $baseDir/filter_cnvkit.pl ${pair_id}.call.cns +perl $baseDir/filter_cnvkit.pl -s ${pair_id}.call.cns diff --git a/variants/filter_cnvkit.pl b/variants/filter_cnvkit.pl index 7695f10..a30aeb1 100755 --- a/variants/filter_cnvkit.pl +++ b/variants/filter_cnvkit.pl @@ -1,34 +1,14 @@ #!/usr/bin/perl -w #parse_cnvkit_table.pl -my $refdir = '/project/shared/bicf_workflow_ref/human/GRCh38/'; -open OM, "<$refdir\/clinseq_prj/panelgenes.txt" or die $!; -while (my $line = <OM>) { - chomp($line); - $keep{$line} = 1; -} - -open ENT_ENS, "<$refdir/../gene2ensembl.human.txt" or die $!; -my %entrez; -my $ent_header = <ENT_ENS>; -while (my $line = <ENT_ENS>){ - chomp $line; - my @row = split(/\t/, $line); - $entrez{$row[2]}=$row[1]; -} -close ENT_ENS; -open ENT_SYM, "<$refdir/../gene_info.human.txt" or die $!; -$ent_header = <ENT_SYM>; -while (my $line = <ENT_SYM>){ - chomp $line; - my @row = split(/\t/, $line); - $entrez{$row[2]}=$row[1]; -} -close ENT_SYM; +use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); +my %opt = (); +my $results = GetOptions (\%opt,'input|s=s','help|h'); -my $file = shift @ARGV; +my $file = $opt{input}; my $sname = (split(/\./,(split(/\//,$file))[-1]))[0]; my $prefix = (split(/\./,$file))[0]; + my %cyto; open CYTO, "<$prefix\.cytoband.bed" or die $!; while (my $line = <CYTO>) { @@ -40,89 +20,97 @@ while (my $line = <CYTO>) { push @{$cyto{$key}{$1}},$2; } -open OUT, ">$prefix\.cnvcalls.txt" or die $!; -open OUT2, ">$prefix\.cnv.answer.txt" or die $!; -open OUT3, ">$prefix\.answerplot.cns" or die $!; -open BIO, ">$prefix\.data_cna_discrete.cbioportal.txt" or die $!; -open BIO2, ">$prefix\.data_cna_continuous.cbioportal.txt" or die $!; +open OUT, ">$prefix\.cnv.answer.txt" or die $!; +open CNSO, ">$prefix\.answerplot.cns" or die $!; -print OUT join("\t","Gene","Chromosome","Start","End","Abberation Type","CN","Score"),"\n"; -print OUT3 join("\t","Chromosome","Start","End","Log2","CN"),"\n"; -print OUT2 join("\t","Gene","Chromosome","Start","End","Abberation Type","CN","Score","CytoBand"),"\n"; -print BIO join("\t","Hugo_Symbol","Entrez_Gene_Id",$sname),"\n"; -print BIO2 join("\t","Hugo_Symbol","Entrez_Gene_Id",$sname),"\n"; +print CNSO join("\t","Chromosome","Start","End","Log2","CN"),"\n"; +print OUT join("\t","Gene","Chromosome","Start","End","Abberation Type","CN","Score","CytoBand"),"\n"; open CNR, "<$prefix\.cnr" or die $!; open CNRO, ">$prefix\.answerplot.cnr" or die $!; print CNRO join("\t","Gene","Chromosome","Start","End","Log2","Depth","Weight"),"\n"; my $header = <CNR>; +chomp($header); +my @colnames = split(/\t/,$header); + while (my $line = <CNR>) { chomp($line); - my ($chr,$start,$end,$geneids,$log2,$depth,$weight) = split(/\t/,$line); - my $key = $chr.":".$start."-".$end; + my @row = split(/\t/,$line); + my %hash = (); + foreach my $j (0..$#row) { + $hash{$colnames[$j]} = $row[$j]; + } + my $key = $hash{chromosome}.":".$hash{start}."-".$hash{end}; + my $geneids = $hash{gene}; my %genes; if ($geneids =~ m/ensembl_gn/g) { my @ids = split(/;|,/,$geneids); foreach my $gid (@ids) { my ($key,$value) = split(/=/,$gid); if ($key eq 'ensembl_gn' || $key eq 'identifier') { - $genes{$value} = 1 if $keep{$value}; + $genes{$value} = 1; } } } else { my @ids = split(/,/,$geneids); foreach my $gid (@ids) { - next if ($gid =~ /^SNP:rs\d+$/); + next if ($gid =~ /^rs\d+$|^SNP:rs\d+$|^-$|Fusion/); my ($gene,@other) = split(/:/,$gid); - $genes{$gene} = 1 if $keep{$gene}; + $genes{$gene} = 1; } } foreach $gene (keys %genes) { - print CNRO join("\t",$gene,$chr,$start,$end,$log2,$depth,$weight),"\n"; + print CNRO join("\t",$gene,$hash{chromosome},$hash{start},$hash{end}, + $hash{log2},$hash{depth},$hash{weight}),"\n"; } } open IN, "<$file" or die $!; $header = <IN>; +chomp($header); +@colnames = split(/\t/,$header); + while (my $line = <IN>) { chomp($line); - my ($chr,$start,$end,$geneids,$log2,$cn,$depth, - $probes,$weight) = split(/\t/,$line); - next if ($chr eq 'chrX' && $cn == 1); - my $key = $chr.":".$start."-".$end; + my @row = split(/\t/,$line); + my %hash = (); + foreach my $j (0..$#row) { + $hash{$colnames[$j]} = $row[$j]; + } + next if ($hash{chromosome} eq 'chrX' && $hash{cn} == 1); + my $key = $hash{chromosome}.":".$hash{start}."-".$hash{end}; + my $geneids = $hash{gene}; my %genes; if ($geneids =~ m/ensembl_gn/g) { my @ids = split(/;|,/,$geneids); foreach my $gid (@ids) { my ($key,$value) = split(/=/,$gid); if ($key eq 'ensembl_gn' || $key eq 'identifier') { - $genes{$value} = 1 if $keep{$value}; + $genes{$value} = 1; } } } else { my @ids = split(/,/,$geneids); foreach my $gid (@ids) { - next if ($gid =~ /^SNP:rs\d+$/); + next if ($gid =~ /^rs\d+$|^SNP:rs\d+$|^-$|Fusion/); my ($gene,@other) = split(/:/,$gid); - $genes{$gene} = 1 if $keep{$gene}; + $genes{$gene} = 1; } } - my $len = sprintf("%.1f",($end-$start)/1000); - print OUT3 join("\t",$chr,$start,$end,$log2,$cn),"\n"; - next if ($cn == 2) || scalar(keys %genes) < 1; + my $len = sprintf("%.1f",($hash{end}-$hash{start})/1000); + print CNSO join("\t",$hash{chromosome},$hash{start},$hash{end}, + $hash{log2},$hash{cn}),"\n"; + + next if ($hash{cn} == 2) || scalar(keys %genes) < 1; my $abtype = 'amplification'; - $abtype = 'loss' if ($cn < 2); - $abtype = 'gain' if ($cn > 2 && $cn < 6); + $abtype = 'loss' if ($hash{cn} < 2); + $abtype = 'gain' if ($hash{cn} > 2 && $hash{cn} < 6); foreach $gene (keys %genes) { - $cn_cbio = $cn -2; - $cn_cbio = 2 if ($cn > 4); - print BIO join("\t",$gene,$entrez{$gene},$cn_cbio),"\n"; - print BIO2 join("\t",$gene,$entrez{$gene},$log2),"\n"; my @cytoband; - if (@{$cyto{$key}{'p'}}) { + if ($cyto{$key}{'p'}) { @nums = sort {$b <=> $a} @{$cyto{$key}{'p'}}; push @cytoband, 'p'.$nums[0],'p'.$nums[-1]; - } if (@{$cyto{$key}{'q'}}) { + } if ($cyto{$key}{'q'}) { @nums = sort {$a <=> $b} @{$cyto{$key}{'q'}}; push @cytoband, 'q'.$nums[0],'q'.$nums[-1]; } @@ -131,11 +119,8 @@ while (my $line = <IN>) { }else { $cband = join("-",$cytoband[0],$cytoband[-1]); } - print OUT2 join("\t",$gene,$chr,$start,$end,$abtype,$cn,$weight,$cband),"\n"; - print OUT join("\t",$gene,$chr,$start,$end,$abtype,$cn,$weight),"\n"; - } - } + print OUT join("\t",$gene,$hash{chromosome},$hash{start},$hash{end}, + $abtype,$hash{cn},$hash{weight},$cband),"\n"; + } +} close IN; -close OUT; -close BIO; -close BIO2; diff --git a/variants/formatVcfCNV.pl b/variants/formatVcfCNV.pl index 4a60d0a..01b702f 100755 --- a/variants/formatVcfCNV.pl +++ b/variants/formatVcfCNV.pl @@ -43,7 +43,7 @@ while (my $line = <VCF>) { foreach my $i (0..$#deschead) { $gtdata{$deschead[$i]} = $gtinfo[$i]; } - if ($alt eq '.') { + if ($alt eq '.' || $alt eq '<NON_REF>') { $gtdata{AO} = 0; $gtdata{RO} = $gtdata{DP}; $gtdata{AD} = join(",", $gtdata{RO},$gtdata{AO}); @@ -69,7 +69,7 @@ while (my $line = <VCF>) { next if ($missingGT == scalar(@gts)); if ($hash{END}) { foreach $i ($pos..$hash{END}) { - print OUT join("\t",$chrom,$i,$id,'N','N',$score,$filter,$annot,$newformat,@newgts),"\n"; + print OUT join("\t",$chrom,$i,$id,$ref,'.',$score,$filter,$annot,$newformat,@newgts),"\n"; } }else { print OUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; -- GitLab