diff --git a/variants/annotvcf.sh b/variants/annotvcf.sh new file mode 100644 index 0000000000000000000000000000000000000000..ba25c22e89bb0211ab7d5cd6153f207337bf8828 --- /dev/null +++ b/variants/annotvcf.sh @@ -0,0 +1,42 @@ +#!/bin/bash +#annotvcf.sh + +usage() { + echo "-h Help documentation for gatkrunner.sh" + echo "-r --Reference Genome: GRCh38 or GRCm38" + echo "-p --Prefix for output file name" + echo "-v --VCF File" + echo "Example: bash hisat.sh -p prefix -r /path/GRCh38 -v vcffile" + exit 1 +} +OPTIND=1 # Reset OPTIND +while getopts :r:v:p:h opt +do + case $opt in + r) index_path=$OPTARG;; + p) pair_id=$OPTARG;; + v) unionvcf=$OPTARG;; + h) usage;; + esac +done +function join_by { local IFS="$1"; shift; echo "$*"; } +shift $(($OPTIND -1)) +module load python/2.7.x-anaconda bedtools/2.26.0 snpeff/4.3q +if [[ $index_path == '/project/shared/bicf_workflow_ref/GRCh38' ]] +then + tabix ${unionvcf} + bcftools annotate -Oz -a ${index_path}/ExAC.vcf.gz -o ${pair_id}.exac.vcf.gz --columns CHROM,POS,AC_Het,AC_Hom,AC_Hemi,AC_Adj,AN_Adj,AC_POPMAX,AN_POPMAX,POPMAX ${unionvcf} + tabix ${pair_id}.exac.vcf.gz + java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.exac.vcf.gz | java -jar $SNPEFF_HOME/SnpSift.jar annotate -id ${index_path}/dbSnp.vcf.gz - | java -jar $SNPEFF_HOME/SnpSift.jar annotate -info CLNSIG,CLNDSDB,CLNDSDBID,CLNDBN,CLNREVSTAT,CLNACC ${index_path}/clinvar.vcf.gz - | java -jar $SNPEFF_HOME/SnpSift.jar annotate -info CNT ${index_path}/cosmic.vcf.gz - | java -Xmx10g -jar $SNPEFF_HOME/SnpSift.jar dbnsfp -v -db ${index_path}/dbNSFP.txt.gz - |bgzip > ${pair_id}.annot.vcf.gz + tabix ${pair_id}.annot.vcf.gz +else + if [[ $index_path == '/project/shared/bicf_workflow_ref/GRCm38' ]] + then + snpeffvers='GRCh38.86' + elif [[ $index_path == '/project/shared/bicf_workflow_ref/GRCh37' ]] + then + snpeffvers='GRCh37.75' + fi + java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config ${snpeffvers} ${unionvcf} |bgzip > ${pair_id}.annot.vcf.gz + tabix ${pair_id}.annot.vcf.gz +fi diff --git a/variants/germline_vc.sh b/variants/germline_vc.sh index 6be820cabf929bee2a795ffb6d6f3563aa0c723f..f8ba19916409e2e2bf19f726c13afb2c8b880798 100644 --- a/variants/germline_vc.sh +++ b/variants/germline_vc.sh @@ -3,10 +3,10 @@ usage() { echo "-h Help documentation for gatkrunner.sh" - echo "-r --Reference Genome: GRCh38 or GRCm38" + echo "-r --Path to Reference Genome with the file genome.fa" echo "-p --Prefix for output file name" - echo "-a --Algorithm/Command" - echo "Example: bash hisat.sh -p prefix -r /path/GRCh38" + echo "-a --Algorithm/Command: gatk, mpileup, speedseq, platypus " + echo "Example: bash hisat.sh -p prefix -r /path/GRCh38 -a gatk" exit 1 } OPTIND=1 # Reset OPTIND @@ -75,10 +75,10 @@ then elif [[ $algo == 'gatk' ]] then module load gatk/3.7 - $gvcflist='' + gvcflist='' for i in *.bam; do java -Djava.io.tmpdir=./ -Xmx32g -jar $GATK_JAR -R ${reffa} -D ${dbsnp} -T HaplotypeCaller -stand_call_conf 10 -A FisherStrand -A QualByDepth -A VariantType -A DepthPerAlleleBySample -A HaplotypeScore -A AlleleBalance -variant_index_type LINEAR -variant_index_parameter 128000 --emitRefConfidence GVCF -I $i -o ${i}.gatk.g.vcf -nct 2 & - $gvcflist+="--variant ${i}.gatk.g.vcf " + gvcflist="$gvcflist --variant ${i}.gatk.g.vcf" done wait @@ -92,5 +92,5 @@ then Platypus.py callVariants --minMapQual=10 --mergeClusteredVariants=1 --nCPU=$SLURM_CPUS_ON_NODE --bamFiles=${bamlist} --refFile=${reffa} --output=platypus.vcf vcf-sort platypus.vcf |vcf-annotate -n --fill-type -n |bgzip > platypus.vcf.gz tabix platypus.vcf.gz - bcftools norm -c s -f ${reffa} -w 10 -O z -o ${i}.pl.vcf.gz platypus.vcf.gz + bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.platypus.vcf.gz platypus.vcf.gz fi diff --git a/variants/union.sh b/variants/union.sh index 640ddfc8162e71e70e161ba0a8fd79b491d8d394..a1714df4fb6a5a4ffadad7678dccaa821834103a 100644 --- a/variants/union.sh +++ b/variants/union.sh @@ -5,7 +5,6 @@ usage() { echo "-h Help documentation for gatkrunner.sh" echo "-r --Reference Genome: GRCh38 or GRCm38" echo "-p --Prefix for output file name" - echo "-a --Algorithm/Command" echo "Example: bash hisat.sh -p prefix -r /path/GRCh38" exit 1 } @@ -20,7 +19,7 @@ do done function join_by { local IFS="$1"; shift; echo "$*"; } shift $(($OPTIND -1)) - +baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" module load gatk/3.7 python/2.7.x-anaconda bedtools/2.26.0 snpeff/4.3q samtools/1.6 vcftools/0.1.14 HS=*.hotspot.vcf.gz @@ -30,16 +29,16 @@ calllist='' for i in *.vcf.gz; do EXT="${i#*.}" CALL="${EXT%%.*}" - calllist+=" $CALL" + calllist="$calllist $CALL" tabix $i - if [[ $i eq $HS ]] + if [[ $i == $HS ]] then bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a $i -b stdin |bgzip > hotspot.nooverlap.vcf.gz tabix hotspot.nooverlap.vcf.gz - list2+=" hotspot.nooverlap.vcf.gz" - varlist+=" --variant:$CALL hotspot.nooverlap.vcf.gz" + list2="$list2 hotspot.nooverlap.vcf.gz" + varlist="$varlist --variant:$CALL hotspot.nooverlap.vcf.gz" else - varlist+=" --variant:$CALL $i" + varlist="$varlist --variant:$CALL $i" fi done @@ -48,17 +47,17 @@ bedtools multiinter -i $list2 -names $calllist | cut -f 1,2,3,5 | bedtools sort priority='ssvar' if [[ *.platypus.vcf.gz ]] then - priority+=',platypus' + priority="$priority,platypus" fi -priority+=',sam,gatk' +priority="$priority,sam,gatk" if [[ *.hotspot.vcf.gz ]] then - priority+=',hotspot' + priority="$priority,hotspot" fi java -Xmx32g -jar $GATK_JAR -R ${index_path}/genome.fa -T CombineVariants --filteredrecordsmergetype KEEP_UNCONDITIONAL $varlist -genotypeMergeOptions PRIORITIZE -priority $priority -o ${pair_id}.int.vcf -perl $baseDir/scripts/uniform_integrated_vcf.pl ${pair_id}.int.vcf +perl $baseDir/uniform_integrated_vcf.pl ${pair_id}.int.vcf bgzip ${pair_id}_integrate.bed tabix ${pair_id}_integrate.bed.gz bgzip ${pair_id}.uniform.vcf