diff --git a/variants/cnvkit.sh b/variants/cnvkit.sh index f52e5057a4547f34d332bfd2595ebf3078b285d4..1898189e9d73654e9dd0782dfcabb4af4381d400 100755 --- a/variants/cnvkit.sh +++ b/variants/cnvkit.sh @@ -30,7 +30,7 @@ baseDir="`dirname \"$0\"`" if [[ -z $index_path ]] then - index_path='/project/shared/bicf_workflow_ref/human/GRCh38/clinseq_prj' + index_path='/project/shared/bicf_workflow_ref/human/GRCh38' fi # Check for mandatory options if [[ -z $pair_id ]] || [[ -z $sbam ]] @@ -51,13 +51,18 @@ echo "${targets}antitargets.bed" echo "${normals}" source /etc/profile.d/modules.sh -module load cnvkit/0.9.5 bedtools/2.26.0 +module load cnvkit/0.9.5 bedtools/2.26.0 samtools/gcc/1.8 bcftools/gcc/1.8 unset DISPLAY + +#samtools index ${sbam} +#bcftools mpileup --threads 10 -a 'INFO/AD,INFO/ADF,INFO/ADR,FORMAT/DP,FORMAT/SP,FORMAT/AD,FORMAT/ADF,FORMAT/ADR' -Ou -Q20 -d 99999 -C50 -f ${reffa} -t ${index_path}/NGSCheckMate.bed ${sbam} | bcftools call --threads 10 -vmO v -o common_variants.vcf + cnvkit.py coverage ${sbam} ${targets}targets.bed -o ${pair_id}.targetcoverage.cnn cnvkit.py coverage ${sbam} ${targets}antitargets.bed -o ${pair_id}.antitargetcoverage.cnn cnvkit.py fix ${pair_id}.targetcoverage.cnn ${pair_id}.antitargetcoverage.cnn ${normals} -o ${pair_id}.cnr cnvkit.py segment ${pair_id}.cnr -o ${pair_id}.cns cnvkit.py call ${pair_id}.cns -o ${pair_id}.call.cns +#cnvkit.py call ${pair_id}.cns -v common_variants.vcf -o ${pair_id}.ballelecall.cns cnvkit.py scatter ${pair_id}.cnr -s ${pair_id}.call.cns -t --segment-color "blue" -o ${pair_id}.cnv.scatter.pdf cut -f 1,2,3 ${pair_id}.call.cns | grep -v chrom | bedtools intersect -wao -b /project/shared/bicf_workflow_ref/human/GRCh38/cytoBand.txt -a stdin |cut -f 1,2,3,7 > ${pair_id}.cytoband.bed -perl $baseDir/filter_cnvkit.pl *.call.cns +perl $baseDir/filter_cnvkit.pl ${pair_id}.call.cns diff --git a/variants/filter_itdseeker.pl b/variants/filter_itdseeker.pl new file mode 100755 index 0000000000000000000000000000000000000000..65647de8e9e275ed1d8645839d9131a1bc7ec94a --- /dev/null +++ b/variants/filter_itdseeker.pl @@ -0,0 +1,70 @@ +#!/usr/bin/perl -w +#integrate_datasets.pl + +#module load vcftools/0.1.14 samtools/1.6 bedtools/2.26.0 +use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); +my %opt = (); +my $results = GetOptions (\%opt,'td|d=s','indel|i=s','sv|s=s','tumor|t=s'); + +my @files = grep(/vcf.gz/,values %opt); +foreach $file (@files) { + chomp($file); + open IN, "gunzip -c $file |" or die $!; + my $outfile = $file; + $outfile =~ s/vcf.*/pass.vcf/; + my @gtheader; + open OUT, ">$outfile" or die $!; + W1:while (my $line = <IN>) { + chomp($line); + my $format = 'GT:MAF:AO'; + if ($line =~ m/^#/) { + if ($line =~ m/^#CHROM/) { + print OUT "##FORMAT=<ID=AO,Number=A,Type=Integer,Description=\"Alternate allele observation count\">\n"; + print OUT "##FORMAT=<ID=MAF,Number=1,Type=Integer,Description=\"Mutation Allele Frequency\">\n"; + print OUT "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"; + my @header = split(/\t/,$line); + ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$info) = split(/\t/, $line); + print OUT join("\t",$chrom, $pos,$id,$ref,$alt,$score, + $filter,$info,'FORMAT',$opt{tumor}),"\n"; + }else { + print OUT $line,"\n"; + } + next; + } + my ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$annot) = split(/\t/, $line); + next if ($ref =~ m/\./ || $alt =~ m/\./ || $alt=~ m/,X/); + my %hash = (); + foreach $a (split(/;/,$annot)) { + my ($key,$val) = split(/=/,$a); + $hash{$key} = $val unless ($hash{$key}); + } + next unless ($hash{ANN}); + next unless ($hash{ANN} =~ m/HIGH|MODERATE|LOW/); + my %gtinfo; + foreach (split(/,/,$hash{DP2})) { + $gtinfo{AO} += $_; + } + $gtinfo{MAF} = $hash{VAF}; + $hash{AF} = $hash{VAF}; + next if $gtinfo{AO} < 10; + next if ($hash{VAF} < 0.01); + $gt = '0/0'; + $gt = '0/1' if ($hash{VAF} > 0.3); + $gt = '1/1' if ($hash{VAF} > 0.8); + my $gtinfo = join(":",$gt,$gtinfo{MAF},$gtinfo{AO}); + my @nannot; + foreach $info (sort {$a cmp $b} keys %hash) { + if (defined $hash{$info}) { + push @nannot, $info."=".$hash{$info}; + }else { + push @nannot, $info; + } + } + my $newannot = join(";",@nannot); + print OUT join("\t",$chrom, $pos,$id,$ref,$alt,$score,$filter,$newannot, + $format,$gtinfo),"\n"; + } + close IN; +} diff --git a/variants/filter_pindel.pl b/variants/filter_pindel.pl index 7e56a75aaf2ddd188b4b7f0a50831f39b449424c..63f33e3ad3b904de1690885f3639ec90fbf9b951 100755 --- a/variants/filter_pindel.pl +++ b/variants/filter_pindel.pl @@ -6,7 +6,6 @@ use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); my %opt = (); my $results = GetOptions (\%opt,'td|d=s','indel|i=s','sv|s=s','tumor|t=s'); - my @files = grep(/vcf.gz/,values %opt); foreach $file (@files) { chomp($file); @@ -18,8 +17,8 @@ foreach $file (@files) { W1:while (my $line = <IN>) { chomp($line); if ($line =~ m/^#/) { - print OUT $line,"\n"; if ($line =~ m/^#CHROM/) { + print OUT "##INFO=<ID=AF,Number=A,Type=Integer,Description=\"Alternate allele observation frequency\">\n"; my @header = split(/\t/,$line); ($chrom, $pos,$id,$ref,$alt,$score, $filter,$info,$format,@gtheader) = split(/\t/, $line); @@ -32,6 +31,7 @@ foreach $file (@files) { } } } + print OUT $line,"\n"; next; } my ($chrom, $pos,$id,$ref,$alt,$score, @@ -53,7 +53,7 @@ foreach $file (@files) { my @mutallfreq = (); foreach my $k (0..$#ainfo) { $gtinfo{$subjid}{$deschead[$k]} = $ainfo[$k]; - $hash{$deschead[$k]} = $ainfo[$k] if ($subjid eq $opt{tumor}); + #$hash{$deschead[$k]} = $ainfo[$k] if ($subjid eq $opt{tumor}); } $gtinfo{$subjid}{DP} = (split(/,/,$gtinfo{$subjid}{DP}))[0] if ($gtinfo{$subjid}{DP}); next F1 unless ($gtinfo{$subjid}{DP} && $gtinfo{$subjid}{DP} ne '.' && $gtinfo{$subjid}{DP} >= 1); @@ -108,7 +108,7 @@ foreach $file (@files) { } } my $newannot = join(";",@nannot); - print OUT join("\t",$chrom, $pos,$id,$ref,$alt,$score,$filter,$newannot, + print OUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$newannot, $format,@gts),"\n"; } close IN; diff --git a/variants/itdseek.sh b/variants/itdseek.sh index c5f8efca94eff6b7fb1b61c51f0422ae79f34b88..6d47a7aed25fe659c87ed284079ff6c286827b16 100755 --- a/variants/itdseek.sh +++ b/variants/itdseek.sh @@ -15,7 +15,7 @@ do r) index_path=$OPTARG;; b) sbam=$OPTARG;; p) pair_id=$OPTARG;; - l) idtbed=$OPTARG;; + l) itdbed=$OPTARG;; h) usage;; esac done @@ -47,11 +47,9 @@ fi source /etc/profile.d/modules.sh -module load samtools/1.6 snpeff/4.3q vcftools/0.1.14 bcftools/gcc/1.8 bedtools/2.26.0 +module load samtools/gcc/1.8 snpeff/4.3q vcftools/0.1.14 htslib/gcc/1.8 bcftools/gcc/1.8 bedtools/2.26.0 stexe=`which samtools` -#flt3='chr13:28003274-28100592' - -samtools view -@ $SLURM_CPUS_ON_NODE -L ${idtbed} ${sbam} | /project/shared/bicf_workflow_ref/seqprg/itdseek-1.2/itdseek.pl --refseq ${reffa} --samtools ${stexe} --bam ${sbam} | vcf-sort |bgzip > ${pair_id}.idtseek.vcf.gz -tabix ${pair_id}.idtseek.vcf.gz -bedtools intersect -header -b ${idtbed} -a ${pair_id}.idtseek.vcf.gz | java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 - |bgzip > ${pair_id}.idtseek_tandemdup.vcf.gz +samtools view -@ $SLURM_CPUS_ON_NODE -L ${itdbed} ${sbam} | /project/shared/bicf_workflow_ref/seqprg/itdseek-1.2/itdseek.pl --refseq ${reffa} --samtools ${stexe} --bam ${sbam} | vcf-sort | bedtools intersect -header -b ${itdbed} -a stdin | bgzip > ${pair_id}.itdseek.vcf.gz +tabix ${pair_id}.itdseek.vcf.gz +bcftools norm --fasta-ref $reffa -m - -Ov ${pair_id}.itdseek.vcf.gz | java -Xmx30g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 - |bgzip > ${pair_id}.itdseek_tandemdup.vcf.gz diff --git a/variants/norm_annot.sh b/variants/norm_annot.sh index 4ccd4c2360bbcbfc1ceede436805c81dd59cd2d5..536aa9c5d41691e9d428637151e2ddbe2392b799 100755 --- a/variants/norm_annot.sh +++ b/variants/norm_annot.sh @@ -26,8 +26,18 @@ baseDir="`dirname \"$0\"`" source /etc/profile.d/modules.sh module load bedtools/2.26.0 samtools/1.6 bcftools/1.6 snpeff/4.3q +if [[ -a "${index_path}/genome.fa" ]] +then + reffa="${index_path}/genome.fa" + dict="${index_path}/genome.dict" +else + echo "Missing Fasta File: ${index_path}/genome.fa" + usage + +fi + perl $baseDir\/uniform_vcf_gt.pl $pair_id $vcf bgzip -f ${pair_id}.uniform.vcf j=${pair_id}.uniform.vcf.gz tabix -f $j -bcftools norm -m - -Oz $j -o ${pair_id}.norm.vcf.gz +bcftools norm --fasta-ref $reffa -m - -Oz $j -o ${pair_id}.norm.vcf.gz diff --git a/variants/parse_pindel.pl b/variants/parse_pindel.pl index 79c0e234e9df7838a72101d92535f82b1a3bdb74..79e64f0babd98d1fa211800330be873cfc74feb5 100755 --- a/variants/parse_pindel.pl +++ b/variants/parse_pindel.pl @@ -75,16 +75,14 @@ while (my $line = <VCF>) { push @newgts, join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO}); } next if ($missingGT == scalar(@gts)); - + if ($hash{SVTYPE} eq 'DUP:TANDEM') { - print DUP join("\t",$chrom,$pos,$id,'N','<DUP>',$score,$filter,$annot,$newformat,@newgts),"\n"; - }elsif ($hash{SVTYPE} eq 'INS') { - print SV join("\t",$chrom,$pos,$id,'N','<INS>',$score,$filter,$annot,$newformat,@newgts),"\n"; + print DUP join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; }elsif ($hash{SVTYPE} eq 'DEL' || $hash{SVTYPE} eq 'INS') { if (abs($hash{SVLEN}) < 50) { print SI join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; }else { - $newalt = "<".$hash{SVTYPE}.">"; + $newalt = "<".$hash{SVTYPE}.">"; print SV join("\t",$chrom,$pos,$id,'N',$newalt,$score,$filter,$annot,$newformat,@newgts),"\n"; } } diff --git a/variants/pindel.sh b/variants/pindel.sh index 269016e211048454cb9e46e9dc75a5a76da1cce2..81b53a5087155ad7203dacf3fd96ac8cd0679961 100755 --- a/variants/pindel.sh +++ b/variants/pindel.sh @@ -9,11 +9,12 @@ usage() { exit 1 } OPTIND=1 # Reset OPTIND -while getopts :r:p:h opt +while getopts :r:l:p:h opt do case $opt in r) index_path=$OPTARG;; p) pair_id=$OPTARG;; + l) idtbed=$OPTARG;; h) usage;; esac done @@ -55,10 +56,10 @@ done pindel -T $SLURM_CPUS_ON_NODE -f ${reffa} -i ${pair_id}.pindel.config -o ${pair_id}.pindel_out --RP pindel2vcf -P ${pair_id}.pindel_out -r ${reffa} -R HG38 -d ${genomefiledate} -v pindel.vcf -cat pindel.vcf | java -jar $SNPEFF_HOME/SnpSift.jar filter " ( GEN[*].AD[1] >= 10 )" | bgzip > pindel.vcf.gz -perl $baseDir/parse_pindel.pl ${pair_id} pindel.vcf.gz -java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.indel.vcf |bgzip > indel.vcf.gz -perl $baseDir/norm_annot.sh -r ${index_path} -p pindel_indel -v indel.vcf.gz -mv pindel_indel.norm.vcf.gz ${pair_id}.pindel_indel.vcf.gz -java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.dup.vcf |bgzip > ${pair_id}.pindel_tandemdup.vcf.gz +cat pindel.vcf | java -jar $SNPEFF_HOME/SnpSift.jar filter "( GEN[*].AD[1] >= 10 )" | bgzip > pindel.vcf.gz +tabix pindel.vcf.gz +bash $baseDir/norm_annot.sh -r ${index_path} -p pindel -v pindel.vcf.gz +perl $baseDir/parse_pindel.pl ${pair_id} pindel.norm.vcf.gz +java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.indel.vcf |bgzip > ${pair_id}.pindel_indel.vcf.gz +java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.dup.vcf | bedtools intersect -header -b ${idtbed} -a stdin | bgzip > ${pair_id}.pindel_tandemdup.vcf.gz java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.sv.vcf | bgzip > ${pair_id}.pindel_sv.vcf.gz diff --git a/variants/uni_norm_annot.sh b/variants/uni_norm_annot.sh index dba02f29f8ffdd1717d49e7f33f8b78caccb15be..6a09f64e64933ac2ce2a6e98a0944dabbce6eab9 100755 --- a/variants/uni_norm_annot.sh +++ b/variants/uni_norm_annot.sh @@ -15,7 +15,6 @@ do case $opt in r) index_path=$OPTARG;; p) pair_id=$OPTARG;; - v) vcf=$OPTARG;; h) usage;; esac @@ -24,6 +23,16 @@ function join_by { local IFS="$1"; shift; echo "$*"; } shift $(($OPTIND -1)) baseDir="`dirname \"$0\"`" +if [[ -a "${index_path}/genome.fa" ]] +then + reffa="${index_path}/genome.fa" + dict="${index_path}/genome.dict" +else + echo "Missing Fasta File: ${index_path}/genome.fa" + usage + +fi + source /etc/profile.d/modules.sh module load bedtools/2.26.0 samtools/1.6 bcftools/1.6 snpeff/4.3q @@ -32,7 +41,7 @@ mv ${vcf} ${pair_id}.ori.vcf.gz bgzip -f ${pair_id}.uniform.vcf j=${pair_id}.uniform.vcf.gz tabix -f $j -bcftools norm -m - -Oz $j -o ${pair_id}.norm.vcf.gz +bcftools norm --fasta-ref $reffa -m - -Oz $j -o ${pair_id}.norm.vcf.gz bash $baseDir/annotvcf.sh -p ${pair_id} -r $index_path -v ${pair_id}.norm.vcf.gz /project/shared/bicf_workflow_ref/seqprg/vt/vt decompose_blocksub ${pair_id}.annot.vcf.gz -p -a -o ${pair_id}.vcf bgzip -f ${pair_id}.vcf