diff --git a/variants/germline_vc.sh b/variants/germline_vc.sh index a2c2c559730f7191aface492b999183a77a37106..d54bd99d85d60adab203bf632b3af0b6e3ee7dbc 100755 --- a/variants/germline_vc.sh +++ b/variants/germline_vc.sh @@ -79,7 +79,7 @@ then java -jar $SNPEFF_HOME/SnpSift.jar annotate ${index_path}/cosmic.vcf.gz ${pair_id}.lowfreq.vcf.gz | java -jar $SNPEFF_HOME/SnpSift.jar filter "(CNT[*] >0)" - |bgzip > ${pair_id}.hotspot.vcf.gz elif [[ $algo == 'speedseq' ]] then - module load speedseq/20160506 + module load speedseq/gcc/0.1.2 speedseq var -t $SLURM_CPUS_ON_NODE -o ssvar ${reffa} *.bam vcf-annotate -n --fill-type ssvar.vcf.gz| bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.ssvar.vcf.gz - elif [[ $algo == 'gatk' ]] diff --git a/variants/parse_pindel.pl b/variants/parse_pindel.pl new file mode 100755 index 0000000000000000000000000000000000000000..4b6c5f8b61ebd93ff52af78e1f47bd0e0eb864b7 --- /dev/null +++ b/variants/parse_pindel.pl @@ -0,0 +1,94 @@ +#!/usr/bin/perl +#parse_pindel + +my $pair_id = shift @ARGV; +my $tid = shift @ARGV; +my $vcf = shift @ARGV; + +open SI, ">$pair_id.indel.vcf" or die $!; +open SV, ">$pair_id.sv.vcf" or die $!; +open DUP, ">$pair_id.dup.vcf" or die $!; + +open VCF, "gunzip -c $vcf|" or die $!; +while (my $line = <VCF>) { + chomp($line); + if ($line =~ m/#/) { + print SI $line,"\n"; + print SV $line,"\n"; + print DUP $line,"\n"; + if ($line =~ m/#CHROM/) { + my @header = split(/\t/,$line); + ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$info,$format,@subjacc) = split(/\t/, $line); + } + next; + } + my ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$annot,$format,@gts) = split(/\t/, $line); + my %hash = (); + foreach $a (split(/;/,$annot)) { + my ($key,$val) = split(/=/,$a); + $hash{$key} = $val; + } + my @deschead = split(/:/,$format); + my $newformat = 'GT:DP:AD:AO:RO'; + my @newgts = (); + my $missingGT = 0; + my %allele; + FG:foreach my $i (0..$#gts) { + my $sid = $subjacc[$i]; + my @gtinfo = split(/:/,$gts[$i]); + my %gtdata; + if ($allele_info eq '.') { + push @newgts, '.:.:.:.:.'; + $missingGT ++; + next FG; + } + foreach my $i (0..$#deschead) { + $gtdata{$deschead[$i]} = $gtinfo[$i]; + } + if ($gtdata{AD}){ + ($gtdata{RO},@alts) = split(/,/,$gtdata{AD}); + $gtdata{AO} = join(",",@alts); + $gtdata{DP} = $gtdata{RO}; + foreach (@alts) { + $gtdata{DP} += $_; + } + } elsif (exists $gtdata{NR} && exists $gtdata{NV}) { + $gtdata{DP} = $gtdata{NR}; + $gtdata{AO} = $gtdata{NV}; + $gtdata{RO} = $gtdata{DP} - $gtdata{AO}; + } elsif (exists $gtdata{AO} && exists $gtdata{RO}) { + $gtdata{AD} = join(',',$gtdata{RO},$gtdata{AO}); + $gtdata{DP} = $gtdata{RO}; + foreach (split(',',$gtdata{AO})) { + $gtdata{DP} += $_; + } + } + if (($gtdata{DP} && $gtdata{DP} < 10) || ()) { + $missingGT ++; + } if ($gtdata{DP} == 0 || $gtdata{GT} eq './.') { + push @newgts, '.:.:.:.:.'; + $missingGT ++; + next FG; + } + $allele{$tid} = [sprintf("%.4f",$gtdata{AO}/$gtdata{DP}),$gtdata{DP}]; + push @newgts, join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO}); + } + next if ($missingGT == scalar(@gts)); + next unless ($allele{$tid}); + my ($taf,$tdp) = @{$allele{$tid}}; + next if ($taf < 0.05 && $tdp < 10); + if ($hash{SVTYPE} eq 'DUP:TANDEM') { + print DUP join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; + }elsif ($hash{SVTYPE} eq 'INS') { + print SV join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; + }elsif ($hash{SVTYPE} eq 'DEL' || $hash{SVTYPE} eq 'INS') { + if (abs($hash{SVLEN}) < 50) { + print SI join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; + }else { + print SV join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; + } + } +} +close VCF; diff --git a/variants/pindel.sh b/variants/pindel.sh new file mode 100644 index 0000000000000000000000000000000000000000..35e271a3216e747a7cc66049b723faec28a3e001 --- /dev/null +++ b/variants/pindel.sh @@ -0,0 +1,63 @@ +#!/bin/bash +#svcalling.sh + +usage() { + echo "-h Help documentation for gatkrunner.sh" + echo "-r --Path to Reference Genome with the file genome.fa" + echo "-p --Prefix for output file name" + echo "Example: bash svcalling.sh -p prefix -r /path/GRCh38 -a gatk" + exit 1 +} +OPTIND=1 # Reset OPTIND +while getopts :r:p:h opt +do + case $opt in + r) index_path=$OPTARG;; + p) pair_id=$OPTARG;; + h) usage;; + esac +done +function join_by { local IFS="$1"; shift; echo "$*"; } + +shift $(($OPTIND -1)) +baseDir="`dirname \"$0\"`" + + +# Check for mandatory options +if [[ -z $pair_id ]] || [[ -z $index_path ]]; then + usage +fi +if [[ -z $SLURM_CPUS_ON_NODE ]] +then + SLURM_CPUS_ON_NODE=1 +fi + +if [[ -a "${index_path}/genome.fa" ]] +then + reffa="${index_path}/genome.fa" + dict="${index_path}/genome.dict" +else + echo "Missing Fasta File: ${index_path}/genome.fa" + usage + +fi + +source /etc/profile.d/modules.sh + +genomefiledate=`find ${reffa} -maxdepth 0 -printf "%TY%Tm%Td\n"` + +module load samtools/1.6 pindel/0.2.5-intel snpeff/4.3q +touch ${pair_id}.pindel.config +for i in *.bam; do + sname="${i%.bam}" + echo -e "${i}\t400\t${sname}" >> ${pair_id}.pindel.config +done + +pindel -T $SLURM_CPUS_ON_NODE -f ${reffa} -i ${pair_id}.pindel.config -o ${pair_id}.pindel_out --RP +pindel2vcf -P ${pair_id}.pindel_out -r ${reffa} -R HG38 -d ${genomefiledate} -v pindel.vcf +cat pindel.vcf | java -jar $SNPEFF_HOME/SnpSift.jar filter " ( GEN[*].AD[1] >= 10 )" | bgzip > pindel.vcf.gz +perl $baseDir/norm_annot.sh -r ${index_path} -p pindel -v pindel.vcf.gz +perl $baseDir/parse_pindel.pl ${pair_id} $tid pindel.norm.vcf.gz +java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.indel.vcf |bgzip > ${pair_id}.pindel_indel.vcf.gz +java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.dup.vcf |bgzip > ${pair_id}.pindel_tandemdup.vcf.gz +java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.sv.vcf |bgzip > ${pair_id}.pindel_sv.vcf.gz diff --git a/variants/svcalling.sh b/variants/svcalling.sh index b0bcaaeb6955fffcee23efe8e2a02187a8191746..17bc77001347e37a528075d1f4da81ba0acf28c4 100755 --- a/variants/svcalling.sh +++ b/variants/svcalling.sh @@ -11,16 +11,15 @@ usage() { exit 1 } OPTIND=1 # Reset OPTIND -while getopts :r:p:b:t:i:k:n:m:h opt +while getopts :r:p:b:i:e:n:a:h opt do case $opt in r) index_path=$OPTARG;; p) pair_id=$OPTARG;; b) sbam=$OPTARG;; - k) tid=$OPTARG;; - i) nid=$OPTARG;; + i) tid=$OPTARG;; n) normal=$OPTARG;; - m) method=$OPTARG;; + a) method=$OPTARG;; h) usage;; esac done @@ -48,25 +47,11 @@ else usage fi + source /etc/profile.d/modules.sh -module load speedseq/20160506 novoBreak/v1.1.3 delly2/v0.7.7-multi samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14 +module load samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14 mkdir temp -genomefiledate=`find ${reffa} -maxdepth 0 -printf "%TY%Tm%Td\n"` - -if [[ $method == 'pindel' ]] -then - module load samtools/1.6 pindel/0.2.5-intel snpeff/4.3q - echo -e "${sbam}\t400\t${tid}" > ${pair_id}.pindel.config - if [[ -n ${normal} ]] - then - echo -e "${normal}\t400\t${nid}" >> ${pair_id}.pindel.config - fi - pindel -T $SLURM_CPUS_ON_NODE -f ${reffa} -i ${pair_id}.pindel.config -o ${pair_id}.pindel_out --RP - pindel2vcf -P ${pair_id}.pindel_out -r ${reffa} -R HG38 -d ${genomefiledate} -v pindel.vcf - java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 pindel.vcf | java -jar $SNPEFF_HOME/SnpSift.jar filter " ( GEN[*].AD[1] >= 20 )" | bgzip > ${pair_id}.pindel.vcf.gz -fi - if [[ $method == 'delly' ]] then module load delly2/v0.7.7-multi samtools/1.6 snpeff/4.3q