diff --git a/alignment/dnaseqalign.sh b/alignment/dnaseqalign.sh index 666ba1e53078ed0d9a838ea28ebb3503959dc7af..af4d9a839d06e54ad10ed9c0608d292d8d89b281 100644 --- a/alignment/dnaseqalign.sh +++ b/alignment/dnaseqalign.sh @@ -51,21 +51,18 @@ module load python/2.7.x-anaconda bwakit/0.7.15 samtools/gcc/1.8 picard/2.10.3 baseDir="`dirname \"$0\"`" diff $fq1 $fq2 > difffile -if [[ $aligner == 'bwa' ]] -then - module load bwa/intel/0.7.17 - if [ -s difffile ] - then - bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${read_group}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${read_group}" ${index_path}/genome.fa ${fq1} ${fq2} > out.sam - else - bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${read_group}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${read_group}" ${index_path}/genome.fa ${fq1} > out.sam - fi -elif [[ $aligner == 'hisat2' ]] +module load bwa/intel/0.7.17 + + +if [ -s difffile ] then - module load hisat2/2.1.0-intel - hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --no-unal -x ${index_path}/hisat_index/genome -1 $fq1 -2 $fq2 -S out.sam --summary-file ${pair_id}.alignerout.txt + file_opt="${fq1} ${fq2}" +else + file_opt="${fq1}" fi +bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${read_group}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${read_group}" ${index_path}/genome.fa $file_opt > out.sam + if [[ $umi == 'umi' ]] && [[ $index_path == '/project/shared/bicf_workflow_ref/human/GRCh38' ]] then k8 ${testexe}/bwa-postalt.js -p tmphla ${index_path}/genome.fa.alt out.sam | python ${baseDir}/add_umi_sam.py -s - -o output.unsort.bam @@ -78,6 +75,7 @@ then else samtools view -1 -o output.unsort.bam out.sam fi + which samtools samtools sort -n --threads $SLURM_CPUS_ON_NODE -o output.dups.bam output.unsort.bam java -Djava.io.tmpdir=./ -Xmx4g -jar $PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.dups.bam O=${pair_id}.bam diff --git a/variants/cnvkit.sh b/variants/cnvkit.sh index b4a3eff2efffc88c188c211d80d6a6e558a180ef..0637a7c70a796df729abd68efcead28fd2b632bb 100755 --- a/variants/cnvkit.sh +++ b/variants/cnvkit.sh @@ -11,7 +11,7 @@ usage() { exit 1 } OPTIND=1 # Reset OPTIND -while getopts :b:p:n:f:t:c:uqh opt +while getopts :b:p:n:t:r:uqh opt do case $opt in b) sbam=$OPTARG;; @@ -19,7 +19,6 @@ do n) normals=$OPTARG;; r) index_path=$OPTARG;; t) targets=$OPTARG;; - c) capture=$OPTARG;; u) umi='umi';; q) idtsnp=1;; h) usage;; @@ -31,7 +30,7 @@ baseDir="`dirname \"$0\"`" if [[ -z $index_path ]] then - index_path='/project/shared/bicf_workflow_ref/human/GRCh38' + index_path='/project/shared/bicf_workflow_ref/human/grch38_cloud/dnaref' fi # Check for mandatory options if [[ -z $pair_id ]] || [[ -z $sbam ]] @@ -46,6 +45,14 @@ if [[ -z $normals ]] || [[ -z $targets ]] then usage fi +if [[ -s "${index_path}/genome.fa" ]] +then + reffa="${index_path}/genome.fa" + dict="${index_path}/genome.dict" +else + echo "Missing Fasta File: ${index_path}/genome.fa" + usage +fi echo "${targets}targets.bed" echo "${targets}antitargets.bed" @@ -58,7 +65,8 @@ unset DISPLAY if [[ $idtsnp == 1 ]] then samtools index ${sbam} - bcftools mpileup --threads 10 -a 'INFO/AD,INFO/ADF,INFO/ADR,FORMAT/DP,FORMAT/SP,FORMAT/AD,FORMAT/ADF,FORMAT/ADR' -Ou -Q20 -d 99999 -C50 -f ${reffa} -t ${index_path}/clinseq_prj/IDT_snps.120bp.bed ${sbam} | bcftools call --threads 10 -vmO v -o common_variants.vcf + bcftools mpileup --threads 10 --gvcf 10 -A -a 'INFO/AD,INFO/ADF,INFO/ADR,FORMAT/DP,FORMAT/SP,FORMAT/AD,FORMAT/ADF,FORMAT/ADR' -Ou -Q20 -d 1000000 -L 1000000 -C50 -f ${reffa} ${sbam} | bcftools call --threads 10 -vmO v -o common_variants.vcf -T ${index_path}/IDT_snps.hg38.bed + $baseDir/formatVcfCNV.pl cnvkit_common common_variants.vcf fi cnvkit.py coverage ${sbam} ${targets}targets.bed -o ${pair_id}.targetcoverage.cnn @@ -68,7 +76,7 @@ cnvkit.py segment ${pair_id}.cnr -o ${pair_id}.cns if [[ $idtsnp == 1 ]] then - cnvkit.py call --filter cn ${pair_id}.cns -v common_variants.vcf -o ${pair_id}.call.cns + cnvkit.py call --filter cn ${pair_id}.cns -v cnvkit_common.vcf -o ${pair_id}.call.cns else cnvkit.py call --filter cn ${pair_id}.cns -o ${pair_id}.call.cns fi diff --git a/variants/filter_cnvkit.pl b/variants/filter_cnvkit.pl index 7bd4bbf73f26c15ed570dea7dfb8d0baef83df74..7695f102e327ea0a45d5d45bc9aaa84807fc286b 100755 --- a/variants/filter_cnvkit.pl +++ b/variants/filter_cnvkit.pl @@ -69,13 +69,11 @@ while (my $line = <CNR>) { $genes{$value} = 1 if $keep{$value}; } } - }elsif ($geneids =~ m/:/) { - my ($gene,$chr,$pos) = split(/:/,$geneids); - $genes{$gene} = 1; - }else { + } else { my @ids = split(/,/,$geneids); foreach my $gid (@ids) { - my ($gene,$trxid,$exonnum,$strand) = split(/\|/,$gid); + next if ($gid =~ /^SNP:rs\d+$/); + my ($gene,@other) = split(/:/,$gid); $genes{$gene} = 1 if $keep{$gene}; } } @@ -101,13 +99,11 @@ while (my $line = <IN>) { $genes{$value} = 1 if $keep{$value}; } } - }elsif ($geneids =~ m/:/) { - my ($gene,$chr,$pos) = split(/:/,$geneids); - $genes{$gene} = 1; - }else { + } else { my @ids = split(/,/,$geneids); foreach my $gid (@ids) { - my ($gene,$trxid,$exonnum,$strand) = split(/\|/,$gid); + next if ($gid =~ /^SNP:rs\d+$/); + my ($gene,@other) = split(/:/,$gid); $genes{$gene} = 1 if $keep{$gene}; } } diff --git a/variants/formatVcfCNV.pl b/variants/formatVcfCNV.pl new file mode 100755 index 0000000000000000000000000000000000000000..4a60d0af01e82d75e260b1c6b76a4d67af031192 --- /dev/null +++ b/variants/formatVcfCNV.pl @@ -0,0 +1,78 @@ +#!/usr/bin/perl +#migrate_db.pl + +my $pair_id = shift @ARGV; +my $vcf = shift @ARGV; +my $outfile = $pair_id.".vcf"; +open OUT, ">$outfile" or die $!; +open VCF, "$vcf" or die $!; +while (my $line = <VCF>) { + chomp($line); + if ($line =~ m/#/) { + if ($line =~ m/#CHROM/) { + print OUT "##FORMAT=<ID=AO,Number=A,Type=Integer,Description=\"Alternate allele observation count\">\n"; + print OUT "##FORMAT=<ID=RO,Number=1,Type=Integer,Description=\"Reference allele observation count\">\n"; + print OUT "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed\">\n"; + print OUT "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Approximate read depth (reads with MQ=255 or with bad mates are filtered)\">\n"; + } + unless ($line =~ m/FORMAT=<ID=AO/ || $line =~ m/FORMAT=<ID=AD/ || $line =~ m/FORMAT=<ID=RO/ || $line =~ m/FORMAT=<ID=DP/) { + print OUT $line,"\n"; + } + next; + } + my ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$annot,$format,@gts) = split(/\t/, $line); + next if ($chrom =~ m/alt/); + my %hash = (); + foreach $a (split(/;/,$annot)) { + my ($key,$val) = split(/=/,$a); + $hash{$key} = $val; + } + my @deschead = split(/:/,$format); + my $newformat = 'GT:DP:AD:AO:RO'; + my @newgts = (); + my $missingGT = 0; + FG:foreach my $allele_info (@gts) { + my @gtinfo = split(/:/,$allele_info); + my %gtdata; + if ($allele_info eq '.') { + push @newgts, '.:.:.:.:.'; + $missingGT ++; + next FG; + } + foreach my $i (0..$#deschead) { + $gtdata{$deschead[$i]} = $gtinfo[$i]; + } + if ($alt eq '.') { + $gtdata{AO} = 0; + $gtdata{RO} = $gtdata{DP}; + $gtdata{AD} = join(",", $gtdata{RO},$gtdata{AO}); + } elsif ($gtdata{AD}){ + ($gtdata{RO},@alts) = split(/,/,$gtdata{AD}); + $gtdata{AO} = join(",",@alts); + $gtdata{DP} = $gtdata{RO}; + foreach (@alts) { + $gtdata{DP} += $_; + } + + } + if ($gtdata{DP} && $gtdata{DP} < 5) { + $missingGT ++; + } + if ($gtdata{DP} == 0 || $gtdata{GT} eq './.') { + push @newgts, '.:.:.:.:.'; + $missingGT ++; + next FG; + } + push @newgts, join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO}); + } + next if ($missingGT == scalar(@gts)); + if ($hash{END}) { + foreach $i ($pos..$hash{END}) { + print OUT join("\t",$chrom,$i,$id,'N','N',$score,$filter,$annot,$newformat,@newgts),"\n"; + } + }else { + print OUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; + } +} +close VCF; diff --git a/variants/gatkrunner.sh b/variants/gatkrunner.sh index 30029aee9dbf719650c4d46edeb0a20c240ad344..41436e5685ac938f62ec9a4698973a3b1b99f708 100755 --- a/variants/gatkrunner.sh +++ b/variants/gatkrunner.sh @@ -34,11 +34,11 @@ if [[ -z $SLURM_CPUS_ON_NODE ]] then SLURM_CPUS_ON_NODE=1 fi -if [[ -a "${index_path}/dbSnp.vcf.gz" ]] +if [[ -a "${index_path}/dbSnp.gatk4.vcf.gz" ]] then - dbsnp="${index_path}/dbSnp.vcf.gz" + dbsnp="${index_path}/dbSnp.gatk4.vcf.gz" else - echo "Missing dbSNP File: ${index_path}/dbSnp.vcf.gz" + echo "Missing dbSNP File: ${index_path}/dbSnp.gatk4.vcf.gz" usage fi if [[ -a "${index_path}/GoldIndels.vcf.gz" ]] diff --git a/variants/germline_vc.sh b/variants/germline_vc.sh index 4550ad90a63dcfd6f67f020bd48744418a8356a8..a8f6ee49c08c4a742943411c683292aeb4f1dbcc 100755 --- a/variants/germline_vc.sh +++ b/variants/germline_vc.sh @@ -76,7 +76,7 @@ done if [[ $algo == 'mpileup' ]] then threads=`expr $SLURM_CPUS_ON_NODE - 10` - bcftools mpileup --threads $threads -a 'INFO/AD,INFO/ADF,INFO/ADR,FORMAT/DP,FORMAT/SP,FORMAT/AD,FORMAT/ADF,FORMAT/ADR' -Ou -Q20 -d 99999 -C50 -f ${reffa} *.bam | bcftools call --threads 10 -vmO z -o ${pair_id}.vcf.gz + bcftools mpileup --threads $threads -a 'INFO/AD,INFO/ADF,INFO/ADR,FORMAT/DP,FORMAT/SP,FORMAT/AD,FORMAT/ADF,FORMAT/ADR' -Ou -A -d 1000000 -C50 -f ${reffa} *.bam | bcftools call -A --threads 10 -vmO z -o ${pair_id}.vcf.gz vcf-annotate -n --fill-type ${pair_id}.vcf.gz | bcftools norm -c s -f ${reffa} -w 10 -O v -o sam.vcf java -jar $PICARD/picard.jar SortVcf I=sam.vcf O=${pair_id}.sam.vcf R=${reffa} CREATE_INDEX=TRUE bgzip ${pair_id}.sam.vcf diff --git a/variants/itdseek.sh b/variants/itdseek.sh index 6d47a7aed25fe659c87ed284079ff6c286827b16..d867bb70d5f0c340fd152192ccd4f56d6243d373 100755 --- a/variants/itdseek.sh +++ b/variants/itdseek.sh @@ -52,4 +52,4 @@ stexe=`which samtools` samtools view -@ $SLURM_CPUS_ON_NODE -L ${itdbed} ${sbam} | /project/shared/bicf_workflow_ref/seqprg/itdseek-1.2/itdseek.pl --refseq ${reffa} --samtools ${stexe} --bam ${sbam} | vcf-sort | bedtools intersect -header -b ${itdbed} -a stdin | bgzip > ${pair_id}.itdseek.vcf.gz tabix ${pair_id}.itdseek.vcf.gz -bcftools norm --fasta-ref $reffa -m - -Ov ${pair_id}.itdseek.vcf.gz | java -Xmx30g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 - |bgzip > ${pair_id}.itdseek_tandemdup.vcf.gz +bcftools norm --fasta-ref $reffa -c w -m - -Ov ${pair_id}.itdseek.vcf.gz | java -Xmx30g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 - |bgzip > ${pair_id}.itdseek_tandemdup.vcf.gz diff --git a/variants/norm_annot.sh b/variants/norm_annot.sh index 536aa9c5d41691e9d428637151e2ddbe2392b799..a9e869a2e91317ba19c817761da88341bbfecba8 100755 --- a/variants/norm_annot.sh +++ b/variants/norm_annot.sh @@ -13,7 +13,6 @@ OPTIND=1 # Reset OPTIND while getopts :r:p:v:h opt do case $opt in - r) index_path=$OPTARG;; p) pair_id=$OPTARG;; v) vcf=$OPTARG;; h) usage;; @@ -24,17 +23,8 @@ shift $(($OPTIND -1)) baseDir="`dirname \"$0\"`" source /etc/profile.d/modules.sh -module load bedtools/2.26.0 samtools/1.6 bcftools/1.6 snpeff/4.3q +module load bedtools/2.26.0 samtools/gcc/1.8 bcftools/gcc/1.8 snpeff/4.3q -if [[ -a "${index_path}/genome.fa" ]] -then - reffa="${index_path}/genome.fa" - dict="${index_path}/genome.dict" -else - echo "Missing Fasta File: ${index_path}/genome.fa" - usage - -fi perl $baseDir\/uniform_vcf_gt.pl $pair_id $vcf bgzip -f ${pair_id}.uniform.vcf diff --git a/variants/somatic_vc.sh b/variants/somatic_vc.sh index 0d336ef121fe0d04b5fddff7d7e4eb5fad86062b..e11da3fb777692204d2a96c13a41da204aaa311d 100755 --- a/variants/somatic_vc.sh +++ b/variants/somatic_vc.sh @@ -119,7 +119,7 @@ then vcf-sort ${tid}.mutect.filt.vcf | vcf-annotate -n --fill-type | java -jar $SNPEFF_HOME/SnpSift.jar filter -p '(GEN[*].DP >= 10)' | bgzip > ${pair_id}.mutect.vcf.gz elif [ $algo == 'varscan' ] then - module load samtools/gcc/1.8 VarScan/2.4.2 vcftools/0.1.14 + module load bcftools/gcc/1.8 samtools/gcc/1.8 VarScan/2.4.2 vcftools/0.1.14 module rm java/oracle/jdk1.7.0_51 module load snpeff/4.3q samtools mpileup -C 50 -f ${reffa} $tumor > t.mpileup diff --git a/variants/svcalling.sh b/variants/svcalling.sh index bbd1eaad0f279b0c9fe19d08fd686329944fcea8..7579fa4cd4b03a8a9adff123c154a01a2ddc9eee 100755 --- a/variants/svcalling.sh +++ b/variants/svcalling.sh @@ -136,7 +136,7 @@ then pindel2vcf -P ${pair_id}.pindel_out -r ${reffa} -R HG38 -d ${genomefiledate} -v pindel.vcf cat pindel.vcf | java -jar $SNPEFF_HOME/SnpSift.jar filter "( GEN[*].AD[1] >= 10 )" | bgzip > pindel.vcf.gz tabix pindel.vcf.gz - bash $baseDir/norm_annot.sh -r ${index_path} -p pindel -v pindel.vcf.gz + bash $baseDir/norm_annot.sh -p pindel -v pindel.vcf.gz perl $baseDir/parse_pindel.pl ${pair_id} pindel.norm.vcf.gz java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.indel.vcf |bgzip > ${pair_id}.pindel_indel.vcf.gz java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.dup.vcf | bedtools intersect -header -b ${bed} -a stdin | bgzip > ${pair_id}.pindel_tandemdup.vcf.gz