diff --git a/alignment/bam2tdf.sh b/alignment/bam2tdf.sh index db7faefcdd1b5a3bd900a0759961730302503a9a..e6299d58137337fa2ded981c5239809c2ecf8e43 100644 --- a/alignment/bam2tdf.sh +++ b/alignment/bam2tdf.sh @@ -23,13 +23,15 @@ shift $(($OPTIND -1)) # Check for mandatory options -if [[ -z $SLURM_CPUS_ON_NODE ]] +NPROC=$SLURM_CPUS_ON_NODE +if [[ -z $NPROC ]] then - SLURM_CPUS_ON_NODE=1 + NPROC=`nproc` fi + baseDir="`dirname \"$0\"`" source /etc/profile.d/modules.sh module load igvtools/2.3.71 samtools/1.6 -samtools index -@ $SLURM_CPUS_ON_NODE $bam +samtools index -@ $NPROC $bam igvtools count -z 5 $bam ${pair_id}.tdf ${index_path}/igv/human.genome diff --git a/alignment/bamqc.sh b/alignment/bamqc.sh index 1656e5bc6f7eb7b320eb8f72f24d4f24699bdc88..cee7b80555bb0b6aa724dda97bcc33b13f0151f7 100644 --- a/alignment/bamqc.sh +++ b/alignment/bamqc.sh @@ -35,35 +35,36 @@ shift $(($OPTIND -1)) #fi source /etc/profile.d/modules.sh -module load samtools/1.6 fastqc/0.11.5 +module load samtools/gcc/1.8 fastqc/0.11.5 samtools flagstat ${sbam} > ${pair_id}.flagstat.txt fastqc -f bam ${sbam} baseDir="`dirname \"$0\"`" -if [[ -z $SLURM_CPUS_ON_NODE ]] +NPROC=$SLURM_CPUS_ON_NODE +if [[ -z $NPROC ]] then - SLURM_CPUS_ON_NODE=1 + NPROC=`nproc` fi if [[ $dedup == 1 ]] then mv $sbam ori.bam - samtools view -@ $SLURM_CPUS_ON_NODE -F 1024 -b -o ${sbam} ori.bam + samtools view -@ $NPROC -F 1024 -b -o ${sbam} ori.bam fi - +tmpdir=`pwd` if [[ $nuctype == 'dna' ]]; then module load bedtools/2.26.0 picard/2.10.3 if [[ -z $skiplc ]] then - samtools view -@ $SLURM_CPUS_ON_NODE -b -L ${bed} -o ${pair_id}.ontarget.bam ${sbam} - samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.ontarget.bam + samtools view -@ $NPROC -b -L ${bed} -o ${pair_id}.ontarget.bam ${sbam} + samtools index -@ $NPROC ${pair_id}.ontarget.bam samtools flagstat ${pair_id}.ontarget.bam > ${pair_id}.ontarget.flagstat.txt - java -Xmx64g -jar $PICARD/picard.jar CollectAlignmentSummaryMetrics R=${index_path}/genome.fa I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt - java -Xmx64g -XX:ParallelGCThreads=$SLURM_CPUS_ON_NODE -jar $PICARD/picard.jar EstimateLibraryComplexity I=${sbam} OUTPUT=${pair_id}.libcomplex.txt - samtools view -@ $SLURM_CPUS_ON_NODE -b -q 1 ${sbam} | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt - samtools view -@ $SLURM_CPUS_ON_NODE ${sbam} | awk '{sum+=$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt + java -Xmx64g -Djava.io.tmpdir=${tmpdir} -jar $PICARD/picard.jar CollectAlignmentSummaryMetrics R=${index_path}/genome.fa I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt TMP_DIR=${tmpdir} + java -Xmx64g -Djava.io.tmpdir=${tmpdir} -XX:ParallelGCThreads=$NPROC -jar $PICARD/picard.jar EstimateLibraryComplexity I=${sbam} OUTPUT=${pair_id}.libcomplex.txt TMP_DIR=${tmpdir} + samtools view -@ $NPROC -b -q 1 ${sbam} | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt + samtools view -@ $NPROC ${sbam} | awk '{sum+=$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt fi - java -Xmx64g -jar $PICARD/picard.jar CollectInsertSizeMetrics INPUT=${sbam} HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${index_path}/genome.fa OUTPUT=${pair_id}.hist.txt + java -Xmx64g -Djava.io.tmpdir=${tmpdir} -jar $PICARD/picard.jar CollectInsertSizeMetrics INPUT=${sbam} HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${index_path}/genome.fa OUTPUT=${pair_id}.hist.txt TMP_DIR=${tmpdir} bedtools coverage -sorted -g ${index_path}/genomefile.txt -a ${bed} -b ${sbam} -hist > ${pair_id}.covhist.txt grep ^all ${pair_id}.covhist.txt > ${pair_id}.genomecov.txt perl $baseDir/calculate_depthcov.pl ${pair_id}.covhist.txt diff --git a/alignment/combine_idxstats.pl b/alignment/combine_idxstats.pl new file mode 100644 index 0000000000000000000000000000000000000000..e4bc42ea8bdc0788358c2c324c7f3cc8de8fd261 --- /dev/null +++ b/alignment/combine_idxstats.pl @@ -0,0 +1,61 @@ +#!/usr/bin/perl + +use warnings; +use strict; +use diagnostics; +use Getopt::Long; +#use Cwd; + +#my $cwd = getcwd(); + +my @allFiles = @ARGV; +chomp(@allFiles); +my %data; +my @samples; +open OUT, ">viral.info" or die $!; + +my %virus; +$virus{'NC_001538.1'}="BK polyomavirus"; +$virus{'NC_007605.1'}="Human gammaherpesvirus 4"; +$virus{'NC_009334.1'}="Human herpesvirus 4"; +$virus{'NC_001488.1'}="Human T-lymphotropic virus 2"; +$virus{'D90400.1'}="Human papillomavirus type 58"; +$virus{'M14119.1'}="Human papillomavirus type 11 (HPV-11)"; +$virus{'J04353.1'}="Human papillomavirus type 31 (HPV-31)"; +$virus{'M12732.1'}="Human papillomavirus type 33"; +$virus{'M74117.1'}="Human papillomavirus type 35"; +$virus{'M62849.1'}="Human papillomavirus ORFs"; +$virus{'X74481.1'}="Human papillomavirus type 52"; +$virus{'X77858.1'}="Human papilloma virus type 59"; +$virus{'NC_001355.1'}="Human papillomavirus type 6b"; +$virus{'NC_001357.1'}="Human papillomavirus - 18"; +$virus{'NC_001436.1'}="Human T-lymphotropic virus 1"; +$virus{'NC_001699.1'}="JC polyomavirus"; +$virus{'EF177177.1'}="Human papillomavirus type 56 clone Qv26342"; +$virus{'NC_009333.1'}="Human herpesvirus 8"; +$virus{'EF202167.1'}="Human papillomavirus type 45 isolate Qv31748"; +$virus{'NC_014407.1'}="Human polyomavirus 7"; +$virus{'HM355825.1'}="Merkel cell polyomavirus isolate MCVw156"; +$virus{'NC_001526.4'}="Human papillomavirus type 16"; + + +print OUT join("\t","SampleName","VirusAcc","VirusName","Mapped","Unmapped"),"\n"; +foreach my $file_idx(@allFiles){ + print "File Included: ".$file_idx."\n"; + #my @filePath = split("/", $file_idx); + my $fileName = $file_idx; + $fileName =~ s/\.idxstats\.txt//g; + push @samples, $fileName; + + open INFILE, "<$file_idx" or die $!; + + foreach my $line(<INFILE>){ + chomp $line; + my @data_array = split("\t",$line); + if($data_array[2]!=0 and $data_array[0] ne '*'){ + $data{$data_array[0]}{$fileName} = $data_array[2]."\t".$data_array[3]; + print OUT join("\t",$fileName,$data_array[0],$virus{$data_array[0]},$data_array[2],$data_array[3]),"\n"; + } + } + close INFILE; +} diff --git a/alignment/dnaseqalign.sh b/alignment/dnaseqalign.sh index 666ba1e53078ed0d9a838ea28ebb3503959dc7af..14b2dc13f508d3c6bd6fb970c7760594eb7e363b 100644 --- a/alignment/dnaseqalign.sh +++ b/alignment/dnaseqalign.sh @@ -33,9 +33,10 @@ if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then usage fi -if [[ -z $SLURM_CPUS_ON_NODE ]] +NPROC=$SLURM_CPUS_ON_NODE +if [[ -z $NPROC ]] then - SLURM_CPUS_ON_NODE=1 + NPROC=`nproc` fi if [[ -z $read_group ]] @@ -43,7 +44,12 @@ then read_group=$pair_id fi -testexe='/project/shared/bicf_workflow_ref/seqprg/bin' +if [[ $index_path == *project* ]] +then + testexe='/project/shared/bicf_workflow_ref/seqprg/bin' +else + testexe='/usr/local/bin' +fi source /etc/profile.d/modules.sh module load python/2.7.x-anaconda bwakit/0.7.15 samtools/gcc/1.8 picard/2.10.3 @@ -51,25 +57,22 @@ module load python/2.7.x-anaconda bwakit/0.7.15 samtools/gcc/1.8 picard/2.10.3 baseDir="`dirname \"$0\"`" diff $fq1 $fq2 > difffile -if [[ $aligner == 'bwa' ]] -then - module load bwa/intel/0.7.17 - if [ -s difffile ] - then - bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${read_group}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${read_group}" ${index_path}/genome.fa ${fq1} ${fq2} > out.sam - else - bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${read_group}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${read_group}" ${index_path}/genome.fa ${fq1} > out.sam - fi -elif [[ $aligner == 'hisat2' ]] +module load bwa/intel/0.7.17 + + +if [ -s difffile ] then - module load hisat2/2.1.0-intel - hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --no-unal -x ${index_path}/hisat_index/genome -1 $fq1 -2 $fq2 -S out.sam --summary-file ${pair_id}.alignerout.txt + file_opt="${fq1} ${fq2}" +else + file_opt="${fq1}" fi -if [[ $umi == 'umi' ]] && [[ $index_path == '/project/shared/bicf_workflow_ref/human/GRCh38' ]] +bwa mem -M -t $NPROC -R "@RG\tID:${read_group}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${read_group}" ${index_path}/genome.fa $file_opt > out.sam + +if [[ $umi == 'umi' ]] && [[ -f "${index_path}/genome.fa.alt" ]] then k8 ${testexe}/bwa-postalt.js -p tmphla ${index_path}/genome.fa.alt out.sam | python ${baseDir}/add_umi_sam.py -s - -o output.unsort.bam -elif [[ $index_path == '/project/shared/bicf_workflow_ref/human/GRCh38' ]] +elif [[ -f "${index_path}/genome.fa.alt" ]] then k8 ${testexe}/bwa-postalt.js -p tmphla ${index_path}/genome.fa.alt out.sam| samtools view -1 - > output.unsort.bam elif [[ $umi == 'umi' ]] @@ -78,7 +81,8 @@ then else samtools view -1 -o output.unsort.bam out.sam fi + which samtools -samtools sort -n --threads $SLURM_CPUS_ON_NODE -o output.dups.bam output.unsort.bam +samtools sort -n --threads $NPROC -o output.dups.bam output.unsort.bam java -Djava.io.tmpdir=./ -Xmx4g -jar $PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.dups.bam O=${pair_id}.bam samtools index ${pair_id}.bam diff --git a/alignment/filter_genefusions.pl b/alignment/filter_genefusions.pl index c7097db2ec82487ef53d66b3e82647673b1ee592..139989f1033bf3fcb0b68d041802347da5914cd4 100755 --- a/alignment/filter_genefusions.pl +++ b/alignment/filter_genefusions.pl @@ -4,25 +4,16 @@ use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); my %opt = (); -my $results = GetOptions (\%opt,'fusion|f=s','prefix|p=s','help|h'); -my %entrez; -open ENT, "</project/shared/bicf_workflow_ref/human/gene_info.human.txt" or die $!; -my $headline = <ENT>; -while (my $line = <ENT>) { - chomp($line); - my @row = split(/\t/,$line);{ - $entrez{$row[2]} = $row[1]; - } -} +my $results = GetOptions (\%opt,'fusion|f=s','prefix|p=s','help|h','datadir|r=s'); -open OM, "</project/shared/bicf_workflow_ref/human/GRCh38/clinseq_prj/utswv2_known_genefusions.txt" or die $!; +open OM, "<$opt{datadir}/known_genefusions.txt" or die $!; while (my $line = <OM>) { chomp($line); $known{$line} = 1; } close OM; -open OM, "</project/shared/bicf_workflow_ref/human/GRCh38/clinseq_prj/panel1410.genelist.txt" or die $!; +open OM, "<$opt{datadir}/panelgenes.txt" or die $!; while (my $line = <OM>) { chomp($line); $keep{$line} = 1; @@ -58,15 +49,11 @@ foreach $efile (@exonfiles) { } open OAS, ">$opt{prefix}\.translocations.answer.txt" or die $!; -open OUTIR, ">$opt{prefix}\.cbioportal.genefusions.txt" or die $!; -print OAS join("\t","FusionName","LeftGene","LefttBreakpoint","LeftGeneExons","LeftStrand", +print OAS join("\t","FusionName","LeftGene","LeftBreakpoint","LeftGeneExons","LeftStrand", "RightGene","RightBreakpoint","RightGeneExons","RightStrand", "RNAReads","DNAReads","FusionType","Annot",'Filter','ChrType','ChrDistance'),"\n"; -print OUTIR join("\t","Hugo_Symbol","Entrez_Gene_Id","Center","Tumor_Sample_Barcode", - "Fusion","DNA_support","RNA_support","Method","Frame"),"\n"; - my $sname = $opt{prefix}; open FUSION, "<$opt{fusion}" or die $!; @@ -132,12 +119,7 @@ while (my $line = <FUSION>) { print OAS join("\t",$fname,$hash{LeftGene},$hash{LeftBreakpoint},$leftexon,$hash{LeftStrand}, $hash{RightGene},$hash{RightBreakpoint},$rightexon,$hash{RightStrand}, $hash{SumRNAReads},0,lc($hash{PROT_FUSION_TYPE}),$fusion_annot,$qc,$chrtype,$chrdist),"\n"; - print OUTIR join("\t",$hash{LeftGene},$entrez{$hash{LeftGene}},"UTSW",$sname,$fname." fusion", - $dna_support,$rna_support,"STAR Fusion",lc($hash{PROT_FUSION_TYPE})),"\n"; - print OUTIR join("\t",$hash{RightGene},$entrez{$hash{RightGene}},"UTSW",$sname,$fname." fusion", - $dna_support,$rna_support,"STAR Fusion",lc($hash{PROT_FUSION_TYPE})),"\n"; } close OAS; -close OUTIR; diff --git a/alignment/hisat_genotype.sh b/alignment/hisat_genotype.sh index 7e0655ecd393c73483cb63201d266c3c554b687c..4ee37c672dea6910e9f0a8e9b8516e55ed4d35b6 100644 --- a/alignment/hisat_genotype.sh +++ b/alignment/hisat_genotype.sh @@ -29,9 +29,10 @@ if [[ -z $pair_id ]]; then usage fi -if [[ -z $SLURM_CPUS_ON_NODE ]] +NPROC=$SLURM_CPUS_ON_NODE +if [[ -z $NPROC ]] then - SLURM_CPUS_ON_NODE=1 + NPROC=`nproc` fi source /etc/profile.d/modules.sh diff --git a/alignment/indexbams.sh b/alignment/indexbams.sh index 38238fb1b337c8051e225e9e0f410ccd3fe2bd8a..2af125dd027eac1ad1ae2cefd1f776e0a9bc17a5 100644 --- a/alignment/indexbams.sh +++ b/alignment/indexbams.sh @@ -18,14 +18,16 @@ shift $(($OPTIND -1)) # Check for mandatory options -if [[ -z $SLURM_CPUS_ON_NODE ]] +NPROC=$SLURM_CPUS_ON_NODE +if [[ -z $NPROC ]] then - SLURM_CPUS_ON_NODE=1 + NPROC=`nproc` fi + baseDir="`dirname \"$0\"`" source /etc/profile.d/modules.sh module load samtools/1.6 for i in *.bam; do - samtools index -@ $SLURM_CPUS_ON_NODE ${i} + samtools index -@ $NPROC ${i} done diff --git a/alignment/markdups.sh b/alignment/markdups.sh index 5687a5d19c43811861e59b7b0c3552121046df23..dbe1b8ef0c81e4088621d8007a6513f306771943 100644 --- a/alignment/markdups.sh +++ b/alignment/markdups.sh @@ -10,12 +10,13 @@ usage() { exit 1 } OPTIND=1 # Reset OPTIND -while getopts :a:b:p:h opt +while getopts :a:b:r:p:h opt do case $opt in a) algo=$OPTARG;; b) sbam=$OPTARG;; p) pair_id=$OPTARG;; + r) index_path=$OPTARG;; h) usage;; esac done @@ -27,41 +28,67 @@ if [[ -z $pair_id ]] || [[ -z $sbam ]]; then usage fi -if [[ -z $SLURM_CPUS_ON_NODE ]] +NPROC=$SLURM_CPUS_ON_NODE +if [[ -z $NPROC ]] then - SLURM_CPUS_ON_NODE=1 + NPROC=`nproc` fi + +if [[ -z $index_path ]] +then + index_path="/project/shared/bicf_workflow_ref/human/grch38_cloud/dnaref" +fi + +if [[ $index_path == *project* ]] +then + testexe='/project/shared/bicf_workflow_ref/seqprg/bin' +else + testexe='/usr/local/bin' +fi + baseDir="`dirname \"$0\"`" source /etc/profile.d/modules.sh -module load picard/2.10.3 samtools/gcc/1.8 +module load picard/2.10.3 if [ $algo == 'sambamba' ] then module load speedseq/20160506 - sambamba markdup -t $SLURM_CPUS_ON_NODE ${sbam} ${pair_id}.dedup.bam + sambamba markdup -t $NPROC ${sbam} ${pair_id}.dedup.bam + touch ${pair_id}.dedup.stat.txt elif [ $algo == 'samtools' ] then - samtools markdup -s --output-fmt BAM -@ $SLURM_CPUS_ON_NODE sort.bam ${pair_id}.dedup.bam + module load samtools/gcc/1.8 + samtools markdup -s --output-fmt BAM -@ $NPROC sort.bam ${pair_id}.dedup.bam + touch ${pair_id}.dedup.stat.txt elif [ $algo == 'picard' ] then - java -XX:ParallelGCThreads=$SLURM_CPUS_ON_NODE -Djava.io.tmpdir=./ -Xmx16g -jar $PICARD/picard.jar MarkDuplicates I=${sbam} O=${pair_id}.dedup.bam M=${pair_id}.dedup.stat.txt + java -XX:ParallelGCThreads=$NPROC -Djava.io.tmpdir=./ -Xmx16g -jar $PICARD/picard.jar MarkDuplicates I=${sbam} O=${pair_id}.dedup.bam M=${pair_id}.dedup.stat.txt elif [ $algo == 'picard_umi' ] then - java -XX:ParallelGCThreads=$SLURM_CPUS_ON_NODE -Djava.io.tmpdir=./ -Xmx16g -jar $PICARD/picard.jar MarkDuplicates BARCODE_TAG=RX I=${sbam} O=${pair_id}.dedup.bam M=${pair_id}.dedup.stat.txt + java -XX:ParallelGCThreads=$NPROC -Djava.io.tmpdir=./ -Xmx16g -jar $PICARD/picard.jar MarkDuplicates BARCODE_TAG=RX I=${sbam} O=${pair_id}.dedup.bam M=${pair_id}.dedup.stat.txt elif [ $algo == 'fgbio_umi' ] then - module load fgbio bwa/intel/0.7.15 - samtools index -@ $SLURM_CPUS_ON_NODE ${sbam} - fgbio GroupReadsByUmi -s identity -i ${sbam} -o ${pair_id}.group.bam --family-size-histogram ${pair_id}.umihist.txt -e 0 -m 0 - fgbio CallMolecularConsensusReads -i ${pair_id}.group.bam -p consensus -M 1 -o ${pair_id}.consensus.bam -S ':none:' - samtools index ${pair_id}.consensus.bam + module load fgbio bwakit/0.7.15 bwa/intel/0.7.17 samtools/gcc/1.8 + samtools index -@ $NPROC ${sbam} + fgbio --tmp-dir ./ GroupReadsByUmi -s identity -i ${sbam} -o group.bam --family-size-histogram ${pair_id}.umihist.txt -e 0 -m 0 + fgbio --tmp-dir ./ CallMolecularConsensusReads -i group.bam -p consensus -M 1 -o ${pair_id}.consensus.bam -S ':none:' + samtools index -@ $NPROC ${pair_id}.consensus.bam samtools fastq -1 ${pair_id}.consensus.R1.fastq -2 ${pair_id}.consensus.R2.fastq ${pair_id}.consensus.bam gzip ${pair_id}.consensus.R1.fastq gzip ${pair_id}.consensus.R2.fastq - bwa mem -M -C -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" /project/shared/bicf_workflow_ref/human/GRCh38/genome.fa ${pair_id}.consensus.R1.fastq.gz ${pair_id}.consensus.R2.fastq.gz | samtools view -1 - > ${pair_id}.consensus.bam - samtools sort --threads $SLURM_CPUS_ON_NODE -o ${pair_id}.dedup.bam ${pair_id}.consensus.bam + bwa mem -M -C -t $NPROC -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${pair_id}.consensus.R1.fastq.gz ${pair_id}.consensus.R2.fastq.gz > out.sam + if [[ ${index_path}/genome.fa.alt ]] + then + k8 ${testexe}/bwa-postalt.js -p tmphla ${index_path}/genome.fa.alt out.sam | samtools view -1 - > ${pair_id}.consensus.bam + else + samtools view -1 out.sam > ${pair_id}.consensus.bam + fi + samtools sort --threads $NPROC -o ${pair_id}.dedup.bam ${pair_id}.consensus.bam + samtools sort --threads $NPROC -o ${pair_id}.group.bam group.bam + samtools index -@ $NPROC ${pair_id}.group.bam else cp ${sbam} ${pair_id}.dedup.bam fi -samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.dedup.bam +module load samtools/gcc/1.8 +samtools index -@ $NPROC ${pair_id}.dedup.bam diff --git a/alignment/rnaseqalign.sh b/alignment/rnaseqalign.sh index 963143fa2682ddb77e1d326148d36f906a9323f2..a46a1f75af784aa1d6ef7df386b087e98fe5d39f 100644 --- a/alignment/rnaseqalign.sh +++ b/alignment/rnaseqalign.sh @@ -36,9 +36,10 @@ fi source /etc/profile.d/modules.sh module load samtools/1.6 picard/2.10.3 baseDir="`dirname \"$0\"`" -if [[ -z $SLURM_CPUS_ON_NODE ]] +NPROC=$SLURM_CPUS_ON_NODE +if [[ -z $NPROC ]] then - SLURM_CPUS_ON_NODE=1 + NPROC=`nproc` fi diff $fq1 $fq2 > difffile @@ -59,17 +60,17 @@ else module load hisat2/2.1.0-intel if [ -s difffile ] then - hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --no-unal --dta -x ${index_path}/hisat_index/genome -1 $fq1 -2 $fq2 -S out.sam --summary-file ${pair_id}.alignerout.txt + hisat2 -p $NPROC --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --no-unal --dta -x ${index_path}/genome -1 $fq1 -2 $fq2 -S out.sam --summary-file ${pair_id}.alignerout.txt else - hisat2 -p $SLURM_CPUS_ON_NODE --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --no-unal --dta -x ${index_path}/hisat_index/genome -U $fq1 -S out.sam --summary-file ${pair_id}.alignerout.txt + hisat2 -p $NPROC --rg-id ${pair_id} --rg LB:tx --rg PL:illumina --rg PU:barcode --rg SM:${pair_id} --no-unal --dta -x ${index_path}/genome -U $fq1 -S out.sam --summary-file ${pair_id}.alignerout.txt fi if [[ $umi == 1 ]] then python ${baseDir}/add_umi_sam.py -s out.sam -o output.bam else - samtools view -1 --threads $SLURM_CPUS_ON_NODE -o output.bam out.sam + samtools view -1 --threads $NPROC -o output.bam out.sam fi - samtools sort -@ $SLURM_CPUS_ON_NODE -O BAM -o ${pair_id}.bam output.bam + samtools sort -@ $NPROC -O BAM -o ${pair_id}.bam output.bam fi -samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam +samtools index -@ $NPROC ${pair_id}.bam diff --git a/alignment/starfusion.sh b/alignment/starfusion.sh index cc0d8dfda4ef497503d6853906da14eef945f845..a9f984c59e288df27ea45277fb2c4848449564c5 100644 --- a/alignment/starfusion.sh +++ b/alignment/starfusion.sh @@ -14,7 +14,7 @@ OPTIND=1 # Reset OPTIND while getopts :r:a:b:p:m:fh opt do case $opt in - r) refgeno=$OPTARG;; + r) index_path=$OPTARG;; a) fq1=$OPTARG;; b) fq2=$OPTARG;; p) pair_id=$OPTARG;; @@ -31,9 +31,10 @@ if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then usage fi -if [[ -z $SLURM_CPUS_ON_NODE ]] +NPROC=$SLURM_CPUS_ON_NODE +if [[ -z $NPROC ]] then - SLURM_CPUS_ON_NODE=1 + NPROC=`nproc` fi baseDir="`dirname \"$0\"`" @@ -49,14 +50,13 @@ then mkdir $tmphome fi export TMP_HOME=$tmphome - index_path=${refgeno}/CTAT_lib_trinity1.6 - trinity /usr/local/src/STAR-Fusion/STAR-Fusion --min_sum_frags 3 --CPU $SLURM_CPUS_ON_NODE --genome_lib_dir ${index_path} --left_fq ${fq1} --right_fq ${fq2} --examine_coding_effect --output_dir ${pair_id}_star_fusion - #cp ${pair_id}_star_fusion/star-fusion.fusion_predictions.abridged.tsv ${pair_id}.starfusion.txt + refgeno=${index_path}/CTAT_lib_trinity1.6 + trinity /usr/local/src/STAR-Fusion/STAR-Fusion --min_sum_frags 3 --CPU $NPROC --genome_lib_dir ${refgeno} --left_fq ${fq1} --right_fq ${fq2} --examine_coding_effect --output_dir ${pair_id}_star_fusion cp ${pair_id}_star_fusion/star-fusion.fusion_predictions.abridged.coding_effect.tsv ${pair_id}.starfusion.txt else module add star/2.5.2b - index_path=${refgeno}/CTAT_lib/ - STAR-Fusion --genome_lib_dir ${index_path} --min_sum_frags 3 --left_fq ${fq1} --right_fq ${fq2} --output_dir ${pair_id}_star_fusion &> star_fusion.err + refgeno=${index_path}/CTAT_lib/ + STAR-Fusion --genome_lib_dir ${refgeno} --min_sum_frags 3 --left_fq ${fq1} --right_fq ${fq2} --output_dir ${pair_id}_star_fusion &> star_fusion.err cp ${pair_id}_star_fusion/star-fusion.fusion_candidates.final.abridged ${pair_id}.starfusion.txt fi @@ -67,8 +67,8 @@ cut -f 5-8 ${pair_id}.starfusion.txt |perl -pe 's/\^|:/\t/g' | awk '{print "sing if [[ $filter == 1 ]] then cut -f 6,8 ${pair_id}.starfusion.txt |grep -v Breakpoint |perl -pe 's/\t/\n/g' |awk -F ':' '{print $1"\t"$2-1"\t"$2}' > temp.bed - bedtools intersect -wao -a temp.bed -b /project/shared/bicf_workflow_ref/human/GRCh38/cytoBand.txt |cut -f 1,2,7 > cytoband_pos.txt - perl $baseDir/filter_genefusions.pl -p ${pair_id} -f ${pair_id}.starfusion.txt + bedtools intersect -wao -a temp.bed -b ${index_path}/cytoBand.txt |cut -f 1,2,7 > cytoband_pos.txt + perl $baseDir/filter_genefusions.pl -p ${pair_id} -r ${index_path} -f ${pair_id}.starfusion.txt fi diff --git a/alignment/viralalign.sh b/alignment/viralalign.sh new file mode 100644 index 0000000000000000000000000000000000000000..d3608a59c349e98f5c382d1a8a9430388116d933 --- /dev/null +++ b/alignment/viralalign.sh @@ -0,0 +1,48 @@ +#!/bin/bash +#abra.sh + +usage(){ + echo "-h Help documentation for gatk4runner.sh" + echo "-b --BAM File" + echo "-r --Reference path e.g. GRCh38" + echo "-p --Sample/Project ID" +} + +OPTIND=1 #Reset OPTIN +while getopts :b:r:p:h opt +do + case $opt in + b) bam=$OPTARG;; + r) ref=$OPTARG;; + p) pairid=$OPTARG;; + h) usage;; + esac +done + +shift $(($OPTIND -1)) + +#Check for mandatory options +if [[ -z $bam ]] || [[ -z $pairid ]] +then + usage +fi + +if [[ -z $SLURM_CPUS_ON_NODE ]] +then + SLURM_CPUS_ON_NODE=1 +fi + +reffa=${ref}/idt_virus_reference.fa +source /etc/profile.d/modules.sh +module load bwa/intel/0.7.17 picard/2.10.3 samtools/1.6 + +samtools view -@ 8 -b -u -F 2 ${bam} |samtools sort -n - >unmapped.bam +java -Djava.io.tmpdir=./ -Xmx4g -jar $PICARD/picard.jar SamToFastq I=unmapped.bam FASTQ=unmapped.R1.fastq SECOND_END_FASTQ=unmapped.R2.fastq UNPAIRED_FASTQ=unmapped.unpaired.fastq + +bwa mem -M -t 4 -R "@RG\tID:${pairid}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pairid}" ${reffa} unmapped.R1.fastq unmapped.R2.fastq > out.sam + +samtools view -h -F 256 -b out.sam -o out.bam +samtools sort out.bam -o ${pairid}.viral.bam +samtools index ${pairid}.viral.bam +samtools idxstats ${pairid}.viral.bam >${pairid}.viral.idxstats.txt +samtools flagstat ${pairid}.viral.bam >${pairid}.viral.flagstat.txt diff --git a/genect_rnaseq/geneabundance.sh b/genect_rnaseq/geneabundance.sh index 29efd1587195b4cd50f5c9b25edc5efa02e0be0d..5d1a362da7f790c8dafb40218f83f5d78f3463aa 100644 --- a/genect_rnaseq/geneabundance.sh +++ b/genect_rnaseq/geneabundance.sh @@ -29,20 +29,23 @@ if [[ -z $pair_id ]] || [[ -z $sbam ]] then usage fi -if [[ -z $SLURM_CPUS_ON_NODE ]] +NPROC=$SLURM_CPUS_ON_NODE +if [[ -z $NPROC ]] then - SLURM_CPUS_ON_NODE=1 + NPROC=`nproc` fi -if [[ $SLURM_CPUS_ON_NODE > 64 ]] +if [[ $NPROC > 64 ]] then - SLURM_CPUS_ON_NODE=64 + NPROC=64 fi source /etc/profile.d/modules.sh module load subread/1.6.1 + export PATH=/project/shared/bicf_workflow_ref/seqprg/bin:$PATH -featureCounts -s $stranded -M --fraction -J --ignoreDup -T $SLURM_CPUS_ON_NODE -t exon -g gene_name -a ${gtf} -o ${pair_id}.cts ${sbam} +featureCounts -s $stranded -M --fraction -J --ignoreDup -T $NPROC -p -g gene_name -a ${gtf} -o ${pair_id}.cts ${sbam} + mkdir ${pair_id}_stringtie cd ${pair_id}_stringtie -stringtie ../${sbam} -p $SLURM_CPUS_ON_NODE -G ${gtf} -B -e -o denovo.gtf -A ../${pair_id}.fpkm.txt +stringtie ../${sbam} -p $NPROC -G ${gtf} -B -e -o denovo.gtf -A ../${pair_id}.fpkm.txt diff --git a/genect_rnaseq/statanal.sh b/genect_rnaseq/statanal.sh index ad5cab388e0f1edbe81cb8af30c4515dc6b6d14a..9cba1e37976c079754bd830ebf1b38acf12dec03 100644 --- a/genect_rnaseq/statanal.sh +++ b/genect_rnaseq/statanal.sh @@ -20,9 +20,10 @@ shift $(($OPTIND -1)) baseDir="`dirname \"$0\"`" # Check for mandatory options -if [[ -z $SLURM_CPUS_ON_NODE ]] +NPROC=$SLURM_CPUS_ON_NODE +if [[ -z $NPROC ]] then - SLURM_CPUS_ON_NODE=1 + NPROC=`nproc` fi source /etc/profile.d/modules.sh diff --git a/variants/annot_sv.pl b/obsolete/annot_sv.pl similarity index 100% rename from variants/annot_sv.pl rename to obsolete/annot_sv.pl diff --git a/genect_rnaseq/cBioPortal_documents.pl b/obsolete/cBioPortal_documents.pl similarity index 89% rename from genect_rnaseq/cBioPortal_documents.pl rename to obsolete/cBioPortal_documents.pl index 31f6803d2e6dd0bde08ced0313be1b4f62910801..79b4f0a4ab2c08ddd9d7a57c38e9402755e8032b 100644 --- a/genect_rnaseq/cBioPortal_documents.pl +++ b/obsolete/cBioPortal_documents.pl @@ -4,9 +4,9 @@ use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); use File::Basename; -my $results= GetOptions (\%opt,'fpkm|f=s','logcpm|l=s','cnv|c=s','prefix|p=s','help|h'); +my $results= GetOptions (\%opt,'fpkm|f=s','logcpm|l=s','cnv|c=s','prefix|p=s','help|h','datadir|d=s'); -open ENT_GENE, "</project/shared/bicf_workflow_ref/human/gene_info.human.txt" or die $!; +open ENT_GENE, "<$datadir\/gene_info.human.txt" or die $!; my %entrez; my %entgene; my $ent_header = <ENT_GENE>; @@ -17,7 +17,7 @@ while (my $line = <ENT_GENE>){ #$entrez{$row[2]}=$row[1]; } close ENT_GENE; -open ENT_ENS, "</project/shared/bicf_workflow_ref/human/GRCh38/genenames.txt" or die $!; +open ENT_ENS, "<$datadir\/genenames.txt" or die $!; my $gn_header = <ENT_ENS>; my %ensym; while (my $line = <ENT_ENS>){ @@ -26,7 +26,7 @@ while (my $line = <ENT_ENS>){ $entrez{$row[3]}=$entrez{$row[4]}; } close ENT_ENS; -open ENT_ENS, "</project/shared/bicf_workflow_ref/human/gene2ensembl.human.txt" or die $!; +open ENT_ENS, "<$datadir\/gene2ensembl.human.txt" or die $!; my $ens_header = <ENT_ENS>; while (my $line = <ENT_ENS>){ chomp $line; diff --git a/variants/parse_svresults.pl b/obsolete/parse_svresults.pl similarity index 100% rename from variants/parse_svresults.pl rename to obsolete/parse_svresults.pl diff --git a/variants/somatic_callers.sh b/obsolete/somatic_callers.sh similarity index 100% rename from variants/somatic_callers.sh rename to obsolete/somatic_callers.sh diff --git a/variants/svannot.pl b/obsolete/svannot.pl similarity index 100% rename from variants/svannot.pl rename to obsolete/svannot.pl diff --git a/variants/uniform_integrated_vcf.pl b/obsolete/uniform_integrated_vcf.pl similarity index 100% rename from variants/uniform_integrated_vcf.pl rename to obsolete/uniform_integrated_vcf.pl diff --git a/preproc_fastq/trimgalore.sh b/preproc_fastq/trimgalore.sh index 1ae8ba12606bc808337fcea4e3fbe6fe7b28fea1..c9f00d14d3d7ad936056ec5abdc9eaba779d0d9f 100644 --- a/preproc_fastq/trimgalore.sh +++ b/preproc_fastq/trimgalore.sh @@ -10,17 +10,19 @@ usage() { exit 1 } OPTIND=1 # Reset OPTIND -while getopts :a:b:p:h opt +while getopts :a:b:p:fh opt do case $opt in a) fq1=$OPTARG;; b) fq2=$OPTARG;; p) pair_id=$OPTARG;; + f) filter=1;; h) usage;; esac done shift $(($OPTIND -1)) +baseDir="`dirname \"$0\"`" # Check for mandatory options if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then @@ -42,3 +44,8 @@ else mv ${r1base}_trimmed.fq.gz ${pair_id}.trim.R1.fastq.gz cp ${pair_id}.trim.R1.fastq.gz ${pair_id}.trim.R2.fastq.gz fi + +if [[ $filter == 1 ]] +then + perl $baseDir/parse_trimreport.pl ${pair_id}.trimreport.txt *trimming_report.txt +fi diff --git a/variants/annotvcf.sh b/variants/annotvcf.sh index c493fc17e600be9985b0b6971b3335bf7eb3e1b5..c538256f148a8b31c92b8513dd670caeb27ad33c 100755 --- a/variants/annotvcf.sh +++ b/variants/annotvcf.sh @@ -10,12 +10,13 @@ usage() { exit 1 } OPTIND=1 # Reset OPTIND -while getopts :r:v:p:h opt +while getopts :r:v:g:p:h opt do case $opt in r) index_path=$OPTARG;; p) pair_id=$OPTARG;; v) unionvcf=$OPTARG;; + g) snpeffgeno=$OPTARG;; h) usage;; esac done @@ -23,35 +24,57 @@ done shift $(($OPTIND -1)) source /etc/profile.d/modules.sh -module load bedtools/2.26.0 samtools/1.6 snpeff/4.3q +module load bedtools/2.26.0 snpeff/4.3q -if [[ $index_path == '/project/shared/bicf_workflow_ref/human/GRCh38/hisat_index' ]] +if [[ $index_path == '/project/shared/bicf_workflow_ref/human/grch38_cloud/rnaref' ]] then - index_path='/project/shared/bicf_workflow_ref/human/GRCh38' -elif [[ $index_path == '/project/shared/bicf_workflow_ref/GRCh38' ]] + index_path='/project/shared/bicf_workflow_ref/human/grch38_cloud/dnaref' +fi +if [[ -z $snpeffgeno ]] then - index_path='/project/shared/bicf_workflow_ref/human/GRCh38' + snpeffgeno='GRCh38.86' fi - -if [[ $index_path == '/project/shared/bicf_workflow_ref/human/GRCh38' ]] +if [[ -f "${index_path}/gnomad.txt.gz" ]] then tabix -f ${unionvcf} bcftools annotate -Oz -a ${index_path}/gnomad.txt.gz -h ${index_path}/gnomad.header -c CHROM,POS,REF,ALT,GNOMAD_HOM,GNOMAD_AF,AF_POPMAX,GNOMAD_HG19_VARIANT -o ${pair_id}.gnomad.vcf.gz ${unionvcf} - tabix ${pair_id}.gnomad.vcf.gz - bcftools annotate -Oz -a ${index_path}/oncokb_hotspot.txt.gz -o ${pair_id}.oncohotspot.vcf.gz -h ${index_path}/oncokb_hotspot.header -c CHROM,FROM,TO,OncoKB_REF,OncoKB_ALT,Gene,OncoKB_ProteinChange,OncoKB_AF,OncoTree_Tissue,OncoTree_MainType,OncoTree_Code,OncoKBHotspot ${pair_id}.gnomad.vcf.gz - tabix ${pair_id}.oncohotspot.vcf.gz - bcftools annotate -Oz -a ${index_path}/repeat_regions.bed.gz -o ${pair_id}.repeat.vcf.gz --columns CHROM,FROM,TO,RepeatType -h /project/shared/bicf_workflow_ref/RepeatType.header ${pair_id}.oncohotspot.vcf.gz - java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-downstream -no-upstream -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.repeat.vcf.gz | java -jar $SNPEFF_HOME/SnpSift.jar annotate -id ${index_path}/dbSnp.vcf.gz - | java -jar $SNPEFF_HOME/SnpSift.jar annotate -info CLNSIG,CLNDSDB,CLNDSDBID,CLNDBN,CLNREVSTAT,CLNACC ${index_path}/clinvar.vcf.gz - | java -jar $SNPEFF_HOME/SnpSift.jar annotate -info LEGACY_ID,CNT ${index_path}/cosmic.vcf.gz - | java -Xmx10g -jar $SNPEFF_HOME/SnpSift.jar dbnsfp -v -db ${index_path}/dbNSFP.txt.gz - | bgzip > ${pair_id}.annot.vcf.gz - tabix ${pair_id}.annot.vcf.gz -else - if [[ $index_path == '/project/shared/bicf_workflow_ref/mouse/GRCm38' ]] - then - snpeffvers='GRCm38.86' - elif [[ $index_path == '/project/shared/bicf_workflow_ref/human/GRCh37' ]] - then - snpeffvers='GRCh37.75' - fi - java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config ${snpeffvers} ${unionvcf} |bgzip > ${pair_id}.annot.vcf.gz - tabix ${pair_id}.annot.vcf.gz + tabix -f ${pair_id}.gnomad.vcf.gz + unionvcf=${pair_id}.gnomad.vcf.gz +fi +if [[ -f "${index_path}/oncokb_hotspot.txt.gz" ]] +then + bcftools annotate -Oz -a ${index_path}/oncokb_hotspot.txt.gz -o ${pair_id}.oncohotspot.vcf.gz -h ${index_path}/oncokb_hotspot.header -c CHROM,FROM,TO,OncoKB_REF,OncoKB_ALT,Gene,OncoKB_ProteinChange,OncoKB_AF,OncoTree_Tissue,OncoTree_MainType,OncoTree_Code,OncoKBHotspot $unionvcf + tabix -f ${pair_id}.oncohotspot.vcf.gz + unionvcf=${pair_id}.oncohotspot.vcf.gz +fi +if [[ -f "${index_path}/repeat_regions.bed.gz" ]] +then + bcftools annotate -Oz -a ${index_path}/repeat_regions.bed.gz -o ${pair_id}.repeat.vcf.gz --columns CHROM,FROM,TO,RepeatType -h ${index_path}/RepeatType.header ${unionvcf} + unionvcf=${pair_id}.repeat.vcf.gz +fi + +acom="java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-downstream -no-upstream -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config $snpeffgeno ${unionvcf}" + +if [[ -f ${index_path}/dbSnp.vcf.gz ]] +then + acom+=" | java -jar $SNPEFF_HOME/SnpSift.jar annotate -id ${index_path}/dbSnp.vcf.gz -" fi +if [[ -f ${index_path}/clinvar.vcf.gz ]] +then + acom+=" | java -jar $SNPEFF_HOME/SnpSift.jar annotate -info CLNSIG,CLNDSDB,CLNDSDBID,CLNDBN,CLNREVSTAT,CLNACC ${index_path}/clinvar.vcf.gz -" +fi +if [[ -f ${index_path}/cosmic.vcf.gz ]] +then + acom+=" | java -jar $SNPEFF_HOME/SnpSift.jar annotate -info LEGACY_ID,CNT ${index_path}/cosmic.vcf.gz -" +fi +if [[ -f ${index_path}/dbNSFP.txt.gz ]] +then + acom+=" | java -Xmx10g -jar $SNPEFF_HOME/SnpSift.jar dbnsfp -v -db ${index_path}/dbNSFP.txt.gz -" +fi + +acom+=" | bgzip > ${pair_id}.annot.vcf.gz " + +eval $acom + +tabix ${pair_id}.annot.vcf.gz diff --git a/variants/cnvkit.sh b/variants/cnvkit.sh index 1031ec3d77f414c91aa09ce56a5470474797ae17..bb9ea279be9b84e3466415fa82f1cb7e00feacc1 100755 --- a/variants/cnvkit.sh +++ b/variants/cnvkit.sh @@ -11,15 +11,13 @@ usage() { exit 1 } OPTIND=1 # Reset OPTIND -while getopts :b:p:n:f:t:c:uh opt +while getopts :b:p:n:d:t:r:uqh opt do case $opt in b) sbam=$OPTARG;; + d) paneldir=$OPTARG;; p) pair_id=$OPTARG;; - n) normals=$OPTARG;; r) index_path=$OPTARG;; - t) targets=$OPTARG;; - c) capture=$OPTARG;; u) umi='umi';; h) usage;; esac @@ -30,39 +28,92 @@ baseDir="`dirname \"$0\"`" if [[ -z $index_path ]] then - index_path='/project/shared/bicf_workflow_ref/human/GRCh38' + index_path='/project/shared/bicf_workflow_ref/human/grch38_cloud/dnaref' fi # Check for mandatory options if [[ -z $pair_id ]] || [[ -z $sbam ]] then + "missing pair_id or bam" usage fi -if [[ -z $SLURM_CPUS_ON_NODE ]] +NPROC=$SLURM_CPUS_ON_NODE +if [[ -z $NPROC ]] then - SLURM_CPUS_ON_NODE=1 + NPROC=`nproc` fi -if [[ -z $normals ]] || [[ -z $targets ]] +if [[ -z $paneldir ]] then + echo "missing panel dir" usage fi +if [[ -s "${index_path}/genome.fa" ]] +then + reffa="${index_path}/genome.fa" + dict="${index_path}/genome.dict" +else + echo "Missing Fasta File: ${index_path}/genome.fa" + usage +fi +if [[ -z $paneldir ]] +then + paneldir="UTSW_V3_pancancer" +fi + +capture="$paneldir/targetpanel.bed" +targets="$paneldir/cnvkit." +normals="$paneldir/pon.cnn" + +source /etc/profile.d/modules.sh +module load cnvkit/0.9.5 bedtools/2.26.0 samtools/gcc/1.8 bcftools/gcc/1.8 java/oracle/jdk1.8.0_171 snpeff/4.3q + +if [[ -f "${paneldir}/pon.downsample.cnn" ]] +then + bedtools coverage -sorted -g ${index_path}/genomefile.txt -a ${capture} -b ${sbam} -hist > covhist.txt + grep ^all covhist.txt > genomecov.txt + sumdepth=`awk '{ sum+= $2*$3;} END {print sum;}' genomecov.txt` + total=`head -n 1 genomecov.txt |cut -f 4` + avgdepth=$((${sumdepth}/${total})) + if [[ "$avgdepth" -lt 1000 ]] + then + normals="${paneldir}/pon.downsample.cnn" + fi +fi echo "${targets}targets.bed" echo "${targets}antitargets.bed" echo "${normals}" -source /etc/profile.d/modules.sh -module load cnvkit/0.9.5 bedtools/2.26.0 samtools/gcc/1.8 bcftools/gcc/1.8 -unset DISPLAY +numsnps=`grep -c -P "rs[0-9]+" ${targets}targets.bed` +panelsize=`awk '{ sum+=$3-$2} END {print sum}' ${targets}targets.bed` -#samtools index ${sbam} -#bcftools mpileup --threads 10 -a 'INFO/AD,INFO/ADF,INFO/ADR,FORMAT/DP,FORMAT/SP,FORMAT/AD,FORMAT/ADF,FORMAT/ADR' -Ou -Q20 -d 99999 -C50 -f ${reffa} -t ${index_path}/NGSCheckMate.bed ${sbam} | bcftools call --threads 10 -vmO v -o common_variants.vcf +unset DISPLAY cnvkit.py coverage ${sbam} ${targets}targets.bed -o ${pair_id}.targetcoverage.cnn cnvkit.py coverage ${sbam} ${targets}antitargets.bed -o ${pair_id}.antitargetcoverage.cnn cnvkit.py fix ${pair_id}.targetcoverage.cnn ${pair_id}.antitargetcoverage.cnn ${normals} -o ${pair_id}.cnr -cnvkit.py segment ${pair_id}.cnr -o ${pair_id}.cns -cnvkit.py call --filter cn ${pair_id}.cns -o ${pair_id}.call.cns -#cnvkit.py call --filter cn ${pair_id}.cns -v common_variants.vcf -o ${pair_id}.ballelecall.cns -cnvkit.py scatter ${pair_id}.cnr -s ${pair_id}.call.cns -t --segment-color "blue" -o ${pair_id}.cnv.scatter.pdf -cut -f 1,2,3 ${pair_id}.call.cns | grep -v chrom | bedtools intersect -wao -b /project/shared/bicf_workflow_ref/human/GRCh38/cytoBand.txt -a stdin |cut -f 1,2,3,7 > ${pair_id}.cytoband.bed -perl $baseDir/filter_cnvkit.pl ${pair_id}.call.cns +if [[ $panelsize -gt 4000000 ]] +then + cnvkit.py segment ${pair_id}.cnr -o ${pair_id}.cns +else + cnvkit.py segment -m haar ${pair_id}.cnr -o ${pair_id}.cns +fi + +if [[ $numsnps -gt 100 ]] +then + #samtools index ${sbam} + #java -jar /cm/shared/apps/gatk/3.8/target/package/GenomeAnalysisTK.jar -T UnifiedGenotyper -R ${reffa} --output_mode EMIT_ALL_SITES -L ${index_path}/IDT_snps.hg38.bed -o common_variants.vcf -glm BOTH -dcov 10000 -I ${sbam} + bcftools mpileup -A -d 1000000 -C50 -Ou --gvcf 0 -f ${reffa} -a INFO/AD,INFO/ADF,INFO/ADR,FORMAT/DP,FORMAT/SP,FORMAT/AD,FORMAT/ADF,FORMAT/ADR -T ${index_path}/IDT_snps.hg38.bed ${sbam} | bcftools call -m --gvcf 0 -Ov | bcftools convert --gvcf2vcf -f ${reffa} -Ov -o common_variants.vcf + $baseDir/formatVcfCNV.pl cnvkit_common common_variants.vcf + echo -e "CHROM\tPOS\tAO\tRO\tDP\tMAF" > ${pair_id}.ballelefreq.txt + java -jar $SNPEFF_HOME/SnpSift.jar extractFields cnvkit_common.vcf CHROM POS GEN[0].AO GEN[0].RO GEN[0].DP |grep -v CHROM | awk '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$3/$5}' >> ${pair_id}.ballelefreq.txt + + cnvkit.py call --filter cn ${pair_id}.cns -v cnvkit_common.vcf -o ${pair_id}.call.cns + cnvkit.py scatter ${pair_id}.cnr -s ${pair_id}.call.cns -t --segment-color "blue" -o ${pair_id}.cnv.scatter.pdf -v cnvkit_common.vcf + +else + cnvkit.py call --filter cn ${pair_id}.cns -o ${pair_id}.call.cns + cnvkit.py scatter ${pair_id}.cnr -s ${pair_id}.call.cns -t --segment-color "blue" -o ${pair_id}.cnv.scatter.pdf +fi + +cut -f 1,2,3 ${pair_id}.call.cns | grep -v chrom | bedtools intersect -wao -b ${index_path}/cytoBand.txt -a stdin |cut -f 1,2,3,7 > ${pair_id}.cytoband.bed +perl $baseDir/filter_cnvkit.pl -s ${pair_id}.call.cns diff --git a/variants/filter_cnvkit.pl b/variants/filter_cnvkit.pl index e86ee768a88b5633cddc4ee05653e490b030df12..86545f5e029436d89e241c5b124a177aa950e35f 100755 --- a/variants/filter_cnvkit.pl +++ b/variants/filter_cnvkit.pl @@ -1,34 +1,14 @@ #!/usr/bin/perl -w #parse_cnvkit_table.pl -my $refdir = '/project/shared/bicf_workflow_ref/human/GRCh38/'; -open OM, "<$refdir\/clinseq_prj/panel1410.genelist.txt" or die $!; -while (my $line = <OM>) { - chomp($line); - $keep{$line} = 1; -} - -open ENT_ENS, "<$refdir/../gene2ensembl.human.txt" or die $!; -my %entrez; -my $ent_header = <ENT_ENS>; -while (my $line = <ENT_ENS>){ - chomp $line; - my @row = split(/\t/, $line); - $entrez{$row[2]}=$row[1]; -} -close ENT_ENS; -open ENT_SYM, "<$refdir/../gene_info.human.txt" or die $!; -$ent_header = <ENT_SYM>; -while (my $line = <ENT_SYM>){ - chomp $line; - my @row = split(/\t/, $line); - $entrez{$row[2]}=$row[1]; -} -close ENT_SYM; +use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); +my %opt = (); +my $results = GetOptions (\%opt,'input|s=s','help|h'); -my $file = shift @ARGV; +my $file = $opt{input}; my $sname = (split(/\./,(split(/\//,$file))[-1]))[0]; my $prefix = (split(/\./,$file))[0]; + my %cyto; open CYTO, "<$prefix\.cytoband.bed" or die $!; while (my $line = <CYTO>) { @@ -40,93 +20,99 @@ while (my $line = <CYTO>) { push @{$cyto{$key}{$1}},$2; } -open OUT, ">$prefix\.cnvcalls.txt" or die $!; -open OUT2, ">$prefix\.cnv.answer.txt" or die $!; -open OUT3, ">$prefix\.answerplot.cns" or die $!; -open BIO, ">$prefix\.data_cna_discrete.cbioportal.txt" or die $!; -open BIO2, ">$prefix\.data_cna_continuous.cbioportal.txt" or die $!; +open OUT, ">$prefix\.cnv.answer.txt" or die $!; +open CNSO, ">$prefix\.answerplot.cns" or die $!; -print OUT join("\t","Gene","Chromosome","Start","End","Abberation Type","CN","Score"),"\n"; -print OUT3 join("\t","Chromosome","Start","End","Log2","CN"),"\n"; -print OUT2 join("\t","Gene","Chromosome","Start","End","Abberation Type","CN","Score","CytoBand"),"\n"; -print BIO join("\t","Hugo_Symbol","Entrez_Gene_Id",$sname),"\n"; -print BIO2 join("\t","Hugo_Symbol","Entrez_Gene_Id",$sname),"\n"; +print CNSO join("\t","Chromosome","Start","End","Log2","CN"),"\n"; +print OUT join("\t","Gene","Chromosome","Start","End","Abberation Type","CN","Score","CytoBand"),"\n"; open CNR, "<$prefix\.cnr" or die $!; open CNRO, ">$prefix\.answerplot.cnr" or die $!; print CNRO join("\t","Gene","Chromosome","Start","End","Log2","Depth","Weight"),"\n"; my $header = <CNR>; +chomp($header); +my @colnames = split(/\t/,$header); + while (my $line = <CNR>) { chomp($line); - my ($chr,$start,$end,$geneids,$log2,$depth,$weight) = split(/\t/,$line); - my $key = $chr.":".$start."-".$end; + my @row = split(/\t/,$line); + my %hash = (); + foreach my $j (0..$#row) { + $hash{$colnames[$j]} = $row[$j]; + } + my $key = $hash{chromosome}.":".$hash{start}."-".$hash{end}; + my $geneids = $hash{gene}; my %genes; if ($geneids =~ m/ensembl_gn/g) { my @ids = split(/;|,/,$geneids); foreach my $gid (@ids) { my ($key,$value) = split(/=/,$gid); if ($key eq 'ensembl_gn' || $key eq 'identifier') { - $genes{$value} = 1 if $keep{$value}; + $genes{$value} = 1; } } - }elsif ($geneids =~ m/:/) { - my ($gene,$chr,$pos) = split(/:/,$geneids); - $genes{$gene} = 1; - }else { + } else { my @ids = split(/,/,$geneids); foreach my $gid (@ids) { - my ($gene,$trxid,$exonnum,$strand) = split(/\|/,$gid); - $genes{$gene} = 1 if $keep{$gene}; + next if ($gid =~ /^rs\d+$|^SNP:rs\d+$|^-$|Fusion/); + my ($gene,@other) = split(/:/,$gid); + my ($gname,@loc) = split(/_/,$gene); + $genes{$gname} = 1; } } foreach $gene (keys %genes) { - print CNRO join("\t",$gene,$chr,$start,$end,$log2,$depth,$weight),"\n"; + print CNRO join("\t",$gene,$hash{chromosome},$hash{start},$hash{end}, + $hash{log2},$hash{depth},$hash{weight}),"\n"; } } open IN, "<$file" or die $!; $header = <IN>; +chomp($header); +@colnames = split(/\t/,$header); + while (my $line = <IN>) { chomp($line); - my ($chr,$start,$end,$geneids,$log2,$cn,$depth, - $probes,$weight) = split(/\t/,$line); - next if ($chr eq 'chrX' && $cn == 1); - my $key = $chr.":".$start."-".$end; + my @row = split(/\t/,$line); + my %hash = (); + foreach my $j (0..$#row) { + $hash{$colnames[$j]} = $row[$j]; + } + next if ($hash{chromosome} eq 'chrX' && $hash{cn} == 1); + my $key = $hash{chromosome}.":".$hash{start}."-".$hash{end}; + my $geneids = $hash{gene}; my %genes; if ($geneids =~ m/ensembl_gn/g) { my @ids = split(/;|,/,$geneids); foreach my $gid (@ids) { my ($key,$value) = split(/=/,$gid); if ($key eq 'ensembl_gn' || $key eq 'identifier') { - $genes{$value} = 1 if $keep{$value}; + $genes{$value} = 1; } } - }elsif ($geneids =~ m/:/) { - my ($gene,$chr,$pos) = split(/:/,$geneids); - $genes{$gene} = 1; - }else { + } else { my @ids = split(/,/,$geneids); foreach my $gid (@ids) { - my ($gene,$trxid,$exonnum,$strand) = split(/\|/,$gid); - $genes{$gene} = 1 if $keep{$gene}; + next if ($gid =~ /^rs\d+$|^SNP:rs\d+$|^-$|Fusion/); + my ($gene,@other) = split(/:/,$gid); + my ($gname,@loc) = split(/_/,$gene); + $genes{$gname} = 1; } } - my $len = sprintf("%.1f",($end-$start)/1000); - print OUT3 join("\t",$chr,$start,$end,$log2,$cn),"\n"; - next if ($cn == 2) || scalar(keys %genes) < 1; + my $len = sprintf("%.1f",($hash{end}-$hash{start})/1000); + print CNSO join("\t",$hash{chromosome},$hash{start},$hash{end}, + $hash{log2},$hash{cn}),"\n"; + + next if ($hash{cn} == 2) || scalar(keys %genes) < 1; my $abtype = 'amplification'; - $abtype = 'loss' if ($cn < 2); - $abtype = 'gain' if ($cn > 2 && $cn < 6); + $abtype = 'loss' if ($hash{cn} < 2); + $abtype = 'gain' if ($hash{cn} > 2 && $hash{cn} < 6); foreach $gene (keys %genes) { - $cn_cbio = $cn -2; - $cn_cbio = 2 if ($cn > 4); - print BIO join("\t",$gene,$entrez{$gene},$cn_cbio),"\n"; - print BIO2 join("\t",$gene,$entrez{$gene},$log2),"\n"; my @cytoband; - if (@{$cyto{$key}{'p'}}) { + if ($cyto{$key}{'p'}) { @nums = sort {$b <=> $a} @{$cyto{$key}{'p'}}; push @cytoband, 'p'.$nums[0],'p'.$nums[-1]; - } if (@{$cyto{$key}{'q'}}) { + } if ($cyto{$key}{'q'}) { @nums = sort {$a <=> $b} @{$cyto{$key}{'q'}}; push @cytoband, 'q'.$nums[0],'q'.$nums[-1]; } @@ -135,11 +121,8 @@ while (my $line = <IN>) { }else { $cband = join("-",$cytoband[0],$cytoband[-1]); } - print OUT2 join("\t",$gene,$chr,$start,$end,$abtype,$cn,$weight,$cband),"\n"; - print OUT join("\t",$gene,$chr,$start,$end,$abtype,$cn,$weight),"\n"; - } - } + print OUT join("\t",$gene,$hash{chromosome},$hash{start},$hash{end}, + $abtype,$hash{cn},$hash{weight},$cband),"\n"; + } +} close IN; -close OUT; -close BIO; -close BIO2; diff --git a/variants/filter_delly.pl b/variants/filter_delly.pl new file mode 100755 index 0000000000000000000000000000000000000000..859a7ba876f1a5a399c56464b2026ebd0c0bb261 --- /dev/null +++ b/variants/filter_delly.pl @@ -0,0 +1,123 @@ +#!/usr/bin/perl -w +#integrate_datasets.pl + +#module load vcftools/0.1.14 samtools/1.6 bedtools/2.26.0 +use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); +my %opt = (); +my $results = GetOptions (\%opt,'in|i=s','pid|p=s','tumor|t=s','normal|n'); + +open VCFOUT, ">$opt{pid}\.delly.vcf" or die $!; +open DELFUS, ">$opt{pid}\.delly.potentialfusion.txt" or die $!; + +open IN, "gunzip -c $opt{in} |" or die $!; + +W1:while (my $line = <IN>) { + chomp($line); + if ($line =~ m/^#/) { + if ($line =~ m/^#CHROM/) { + print VCFOUT "##INFO=<ID=AF,Number=A,Type=Integer,Description=\"Alternate allele observation frequency\">\n"; + my @header = split(/\t/,$line); + ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$info,$format,@gtheader) = split(/\t/, $line); + unless ($opt{tumor}) { + if (grep(/T_DNA/,@gtheader)) { + my @tsamps = grep(/T_DNA/,@gtheader); + $opt{tumor} = $tsamps[0]; + }else { + $opt{tumor} = $gtheader[0]; + } + } + unless ($opt{normal}) { + if (grep(/N_DNA/,@gtheader)) { + my @tsamps = grep(/N_DNA/,@gtheader); + $opt{tumor} = $tsamps[0]; + } + } + } + print VCFOUT $line,"\n"; + next; + } + my ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$annot,$format,@gts) = split(/\t/, $line); + next if ($ref =~ m/\./ || $alt =~ m/\./ || $alt=~ m/,X/); + next unless ($chrom =~ m/^chr\d+$/ || $chrom eq 'chrX' || $chrom eq 'chrY'); + my %hash = (); + foreach $a (split(/;/,$annot)) { + my ($key,$val) = split(/=/,$a); + $hash{$key} = $val unless ($hash{$key}); + } + next unless ($hash{ANN}); + next unless ($hash{ANN} =~ m/HIGH|MODERATE|LOW/); + my %gtinfo = (); + my @deschead = split(/:/,$format); + F1:foreach my $k (0..$#gtheader) { + my $subjid = $gtheader[$k]; + my $allele_info = $gts[$k]; + my @ainfo = split(/:/, $allele_info); + my @mutallfreq = (); + foreach my $k (0..$#ainfo) { + $gtinfo{$subjid}{$deschead[$k]} = $ainfo[$k]; + } + $gtinfo{$subjid}{DP} = (split(/,/,$gtinfo{$subjid}{DP}))[0] if ($gtinfo{$subjid}{DP}); + next F1 unless ($gtinfo{$subjid}{DP} && $gtinfo{$subjid}{DP} ne '.' && $gtinfo{$subjid}{DP} >= 1); + my @altct = split(/,/,$gtinfo{$subjid}{AD}); + my $refct = shift @altct; + @altct2 = split(/,/,$gtinfo{$subjid}{AO}); + if (scalar(@altct) ne scalar(@altct2)) { + warn "Inconsistent Allele counts @ $chrom,$pos,$alt,$gtinfo{$subjid}{AD},$gtinfo{$subjid}{AO}\n"; + } + my $total = $refct; + foreach my $act (@altct) { + next if ($act eq '.'); + $total += $act; + push @mutallfreq, sprintf("%.4f",$act/$gtinfo{$subjid}{DP}); + } + $gtinfo{$subjid}{MAF} = \@mutallfreq; + } + next unless ($gtinfo{$opt{tumor}}{DP} && $gtinfo{$opt{tumor}}{DP} ne '.' && $gtinfo{$opt{tumor}}{DP} >= 20); + unless ($gtinfo{$opt{tumor}}{AO} =~ m/\d+/ && $gtinfo{$opt{tumor}}{AD} =~ m/,/) { + warn "Missing Alt:$line\n"; + } + @tumormaf = @{$gtinfo{$opt{tumor}}{MAF}}; + @tumoraltct = split(/,/,$gtinfo{$opt{tumor}}{AO}); + next if ($tumoraltct[0] eq '.'); + $hash{AF} = join(",",@tumormaf); + next if ($tumoraltct[0] < 20); + next if ($tumormaf[0] < 0.01); + my $keepforvcf = 0; + my $keeptrx; + F1:foreach $trx (split(/,/,$hash{ANN})) { + my ($allele,$effect,$impact,$gene,$geneid,$feature, + $featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna, + $cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$trx); + next unless ($impact =~ m/HIGH|MODERATE/ || $effect =~ /splice/i); + next if($effect eq 'sequence_feature'); + $keeptrx = $trx; + $keepforvcf = $gene; + last F1; + } + next unless $keepforvcf; + my @nannot; + foreach $info (sort {$a cmp $b} keys %hash) { + if (defined $hash{$info}) { + push @nannot, $info."=".$hash{$info}; + }else { + push @nannot, $info; + } + } + my $newannot = join(";",@nannot); + if ($filter ne 'PASS') { + $filter = 'FailedQC;'.$filter; + } + if ($hash{SVTYPE} eq 'INS' || ($hash{SVTYPE} eq 'DEL' && $keepforvcf !~ m/&/)) { + print VCFOUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$newannot, + $format,@gts),"\n"; + }elsif ($hash{END} && $hash{END} - $pos > 9999) { + my ($allele,$effect,$impact,$gene,$geneid,$feature, + $featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna, + $cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$keeptrx); + + print DELFUS join("\t",$chrom,$pos,$hash{CHR2},$hash{END},$effect,$gene,$biotype,$filter,$format,@gts),"\n"; + } +} +close IN; diff --git a/variants/filter_pindel.pl b/variants/filter_pindel.pl index 63f33e3ad3b904de1690885f3639ec90fbf9b951..1802a79e5c8b8dd01f9f5ad1fe472afb96bf7784 100755 --- a/variants/filter_pindel.pl +++ b/variants/filter_pindel.pl @@ -88,14 +88,11 @@ foreach $file (@files) { $cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$trx); next unless ($impact =~ m/HIGH|MODERATE/ || $effect =~ /splice/i); next if($effect eq 'sequence_feature'); - if ($file eq $opt{sv}) { - next unless ($effect eq 'gene_fusion'); - } $keepforvcf = $gene; } next unless $keepforvcf; - if ($tumormaf[0] < 0.1) { - next unless ($outfile =~ m/pindel_tandemdup/); + if ($tumormaf[0] < 0.05) { + next unless ($outfile =~ m/tandemdup/); } my @fail = sort {$a cmp $b} keys %fail; next if (scalar(@fail) > 0); diff --git a/variants/filter_svaba.pl b/variants/filter_svaba.pl new file mode 100755 index 0000000000000000000000000000000000000000..e9625704e8fae361f70d8c1218f254c6ba5fffe6 --- /dev/null +++ b/variants/filter_svaba.pl @@ -0,0 +1,255 @@ +#!/usr/bin/perl -w +#integrate_datasets.pl + +#module load vcftools/0.1.14 samtools/1.6 bedtools/2.26.0 +use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); +my %opt = (); +my $results = GetOptions (\%opt,'in|i=s','pid|p=s','tumor|t=s','sv|s=s'); + +open VCFOUT, ">$opt{pid}\.svaba.vcf" or die $!; +open DELFUS, ">$opt{pid}\.svaba.potentialfusion.txt" or die $!; + +open IN, "gunzip -c $opt{in} |" or die $!; + +W1:while (my $line = <IN>) { + chomp($line); + if ($line =~ m/^#/) { + if ($line =~ m/^#CHROM/) { + print VCFOUT "##INFO=<ID=AF,Number=A,Type=Integer,Description=\"Alternate allele observation frequency\">\n"; + my @header = split(/\t/,$line); + ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$info,$format,@gtheader) = split(/\t/, $line); + unless ($opt{tumor}) { + if (grep(/T_DNA/,@gtheader)) { + my @tsamps = grep(/T_DNA/,@gtheader); + $opt{tumor} = $tsamps[0]; + }else { + $opt{tumor} = $gtheader[0]; + } + } + } + print VCFOUT $line,"\n"; + next; + } + my ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$annot,$format,@gts) = split(/\t/, $line); + next if ($ref =~ m/\./ || $alt =~ m/\./ || $alt=~ m/,X/); + next unless ($chrom =~ m/^chr\d+$/ || $chrom eq 'chrX' || $chrom eq 'chrY'); + my %hash = (); + foreach $a (split(/;/,$annot)) { + my ($key,$val) = split(/=/,$a); + $hash{$key} = $val unless ($hash{$key}); + } + if (length($alt) > length($ref)) { + $hash{SVTYPE} = "INS"; + $hash{END} = $pos; + }elsif (length($alt) < length($ref)) { + my $diff = substr($ref, length($alt)); + $hash{SVTYPE} = "DEL"; + $hash{END} = $pos + length($diff); + } + next unless ($hash{ANN}); + next unless ($hash{ANN} =~ m/HIGH|MODERATE|LOW/); + my %gtinfo = (); + my @deschead = split(/:/,$format); + F1:foreach my $k (0..$#gtheader) { + my $subjid = $gtheader[$k]; + my $allele_info = $gts[$k]; + my @ainfo = split(/:/, $allele_info); + my @mutallfreq = (); + foreach my $k (0..$#ainfo) { + $gtinfo{$subjid}{$deschead[$k]} = $ainfo[$k]; + } + $gtinfo{$subjid}{DP} = (split(/,/,$gtinfo{$subjid}{DP}))[0] if ($gtinfo{$subjid}{DP}); + next F1 unless ($gtinfo{$subjid}{DP} && $gtinfo{$subjid}{DP} ne '.' && $gtinfo{$subjid}{DP} >= 1); + my @altct = split(/,/,$gtinfo{$subjid}{AD}); + my $refct = shift @altct; + @altct2 = split(/,/,$gtinfo{$subjid}{AO}); + if (scalar(@altct) ne scalar(@altct2)) { + warn "Inconsistent Allele counts @ $chrom,$pos,$alt,$gtinfo{$subjid}{AD},$gtinfo{$subjid}{AO}\n"; + } + my $total = $refct; + foreach my $act (@altct) { + next if ($act eq '.'); + $total += $act; + push @mutallfreq, sprintf("%.4f",$act/$gtinfo{$subjid}{DP}); + } + $gtinfo{$subjid}{MAF} = \@mutallfreq; + } + next unless ($gtinfo{$opt{tumor}}{DP} && $gtinfo{$opt{tumor}}{DP} ne '.' && $gtinfo{$opt{tumor}}{DP} >= 20); + unless ($gtinfo{$opt{tumor}}{AO} =~ m/\d+/ && $gtinfo{$opt{tumor}}{AD} =~ m/,/) { + warn "Missing Alt:$line\n"; + } + @tumormaf = @{$gtinfo{$opt{tumor}}{MAF}}; + @tumoraltct = split(/,/,$gtinfo{$opt{tumor}}{AO}); + next if ($tumoraltct[0] eq '.'); + $hash{AF} = join(",",@tumormaf); + next if ($tumoraltct[0] < 20); + next if ($tumormaf[0] < 0.01); + my $keepforvcf = 0; + my $keeptrx; + F1:foreach $trx (split(/,/,$hash{ANN})) { + my ($allele,$effect,$impact,$gene,$geneid,$feature, + $featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna, + $cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$trx); + next unless ($impact =~ m/HIGH|MODERATE/ || $effect =~ /splice/i); + next if($effect eq 'sequence_feature'); + $keeptrx = $trx; + $keepforvcf = $gene; + last F1; + } + next unless $keepforvcf; + my @nannot; + foreach $info (sort {$a cmp $b} keys %hash) { + if (defined $hash{$info}) { + push @nannot, $info."=".$hash{$info}; + }else { + push @nannot, $info; + } + } + + my $newannot = join(";",@nannot); + if ($hash{SVTYPE} eq 'INS' || ($hash{SVTYPE} eq 'DEL' && $keepforvcf !~ m/&/)) { + if ($filter =~ m/LOWMAPQ|LowQual/i) { + $filter = 'FailedQC;'.$filter; + } + print VCFOUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$newannot, + $format,@gts),"\n"; + } elsif ($hash{SVTYPE} eq 'DEL' && $hash{SPAN} && $hash{SPAN} > 9999) { + my ($allele,$effect,$impact,$gene,$geneid,$feature, + $featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna, + $cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$keeptrx); + print DELFUS join("\t",$chrom,$pos,$hash{CHR2},$hash{END},$effect,$gene,$biotype,$filter,$format,@gts),"\n"; + } +} +close IN; + +my %svpairs; + +open IN, "gunzip -c $opt{sv} |" or die $!; + +W1:while (my $line = <IN>) { + chomp($line); + if ($line =~ m/^#/) { + if ($line =~ m/^#CHROM/) { + my @header = split(/\t/,$line); + ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$info,$format,@gtheader) = split(/\t/, $line); + unless ($opt{tumor}) { + if (grep(/T_DNA/,@gtheader)) { + my @tsamps = grep(/T_DNA/,@gtheader); + $opt{tumor} = $tsamps[0]; + }else { + $opt{tumor} = $gtheader[0]; + } + } + } + next; + } + my ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$annot,$format,@gts) = split(/\t/, $line); + next if ($ref =~ m/\./ || $alt =~ m/\./ || $alt=~ m/,X/); + my %hash = (); + foreach $a (split(/;/,$annot)) { + my ($key,$val) = split(/=/,$a); + $hash{$key} = $val unless ($hash{$key}); + } + if ($alt =~ m/CHR(\w+):(\d+)/i) { + $hash{CHR2} = 'chr'.$1; + $hash{END} = $2; + } + next unless ($hash{ANN}); + next unless ($hash{ANN} =~ m/HIGH|MODERATE|LOW/); + my %gtinfo = (); + my @deschead = split(/:/,$format); + F1:foreach my $k (0..$#gtheader) { + my $subjid = $gtheader[$k]; + my $allele_info = $gts[$k]; + my @ainfo = split(/:/, $allele_info); + my @mutallfreq = (); + foreach my $k (0..$#ainfo) { + $gtinfo{$subjid}{$deschead[$k]} = $ainfo[$k]; + } + $gtinfo{$subjid}{DP} = (split(/,/,$gtinfo{$subjid}{DP}))[0] if ($gtinfo{$subjid}{DP}); + next F1 unless ($gtinfo{$subjid}{DP} && $gtinfo{$subjid}{DP} ne '.' && $gtinfo{$subjid}{DP} >= 1); + my @altct = split(/,/,$gtinfo{$subjid}{AD}); + my $refct = shift @altct; + @altct2 = split(/,/,$gtinfo{$subjid}{AO}); + if (scalar(@altct) ne scalar(@altct2)) { + warn "Inconsistent Allele counts @ $chrom,$pos,$alt,$gtinfo{$subjid}{AD},$gtinfo{$subjid}{AO}\n"; + } + my $total = $refct; + foreach my $act (@altct) { + next if ($act eq '.'); + $total += $act; + push @mutallfreq, sprintf("%.4f",$act/$gtinfo{$subjid}{DP}); + } + $gtinfo{$subjid}{MAF} = \@mutallfreq; + } + next unless ($gtinfo{$opt{tumor}}{DP} && $gtinfo{$opt{tumor}}{DP} ne '.' && $gtinfo{$opt{tumor}}{DP} >= 20); + unless ($gtinfo{$opt{tumor}}{AO} =~ m/\d+/ && $gtinfo{$opt{tumor}}{AD} =~ m/,/) { + warn "Missing Alt:$line\n"; + } + @tumormaf = @{$gtinfo{$opt{tumor}}{MAF}}; + @tumoraltct = split(/,/,$gtinfo{$opt{tumor}}{AO}); + next if ($tumoraltct[0] eq '.'); + $hash{AF} = join(",",@tumormaf); + next if ($tumoraltct[0] < 20); + next if ($tumormaf[0] < 0.01); + my $keepforvcf = 0; + my $keeptrx; + F1:foreach $trx (split(/,/,$hash{ANN})) { + my ($allele,$effect,$impact,$gene,$geneid,$feature, + $featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna, + $cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$trx); + next unless ($impact =~ m/HIGH|MODERATE/ || $effect =~ /splice/i); + next if($effect eq 'sequence_feature'); + $keeptrx = $trx; + $keepforvcf = $gene; + last F1; + } + next unless $keepforvcf; + my @nannot; + foreach $info (sort {$a cmp $b} keys %hash) { + if (defined $hash{$info}) { + push @nannot, $info."=".$hash{$info}; + }else { + push @nannot, $info; + } + } + my $newannot = join(";",@nannot); + my ($svid,$svpair) = split(/:/,$id); + my $newline = join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$newannot,$format,@gts); + my ($allele,$effect,$impact,$gene,$geneid,$feature, + $featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna, + $cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$keeptrx); + my $fusionline = join("\t",$chrom,$pos,$hash{CHR2},$hash{END},$effect,$gene,$biotype,$filter,$format,@gts); + $svpairs{$svid}{$svpair} = {vcfline=>$line,fusionline=>$fusionline, + gene=>$keepforvcf,alt=>$alt,span=>$hash{SPAN}}; +} +close IN; + + +foreach my $id (keys %svpairs) { + if ($id =~ m/815016443/) { + warn "debugging\n"; + } + my $alt1 = $svpairs{$id}{1}{alt}; + my $alt2 = $svpairs{$id}{2}{alt}; + my $svtype; + if ($alt1 =~ m/^\w\[/ && $alt2 =~ m/^\]/) { + $svtype = 'DEL'; + }elsif ($alt2 =~ m/^\w\[/ && $alt1 =~ m/^\]/) { + $svtype = 'INS'; + }else { + $svtype = 'UNK'; + } + if ($svtype eq 'INS' || ($svtype eq 'DEL' && $svpairs{$id}{1}{gene} !~ m/&/ && $svpairs{$id}{1}{span} < 9999)) { + if ($filter =~ m/LOWMAPQ|LowQual/i) { + $filter = 'FailedQC'.$filter; + } + print VCFOUT $svpairs{$id}{1}{vcfline},"\n" + }elsif ($svtype eq 'DEL' && $svpairs{$id}{1}{span} && $svpairs{$id}{1}{span} > 9999) { + print DELFUS $svpairs{$id}{1}{fusionline},"\n"; + } +} diff --git a/variants/formatVcfCNV.pl b/variants/formatVcfCNV.pl new file mode 100755 index 0000000000000000000000000000000000000000..01b702f18bd539f6e941a41c0d78edd4c6ac1f3f --- /dev/null +++ b/variants/formatVcfCNV.pl @@ -0,0 +1,78 @@ +#!/usr/bin/perl +#migrate_db.pl + +my $pair_id = shift @ARGV; +my $vcf = shift @ARGV; +my $outfile = $pair_id.".vcf"; +open OUT, ">$outfile" or die $!; +open VCF, "$vcf" or die $!; +while (my $line = <VCF>) { + chomp($line); + if ($line =~ m/#/) { + if ($line =~ m/#CHROM/) { + print OUT "##FORMAT=<ID=AO,Number=A,Type=Integer,Description=\"Alternate allele observation count\">\n"; + print OUT "##FORMAT=<ID=RO,Number=1,Type=Integer,Description=\"Reference allele observation count\">\n"; + print OUT "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed\">\n"; + print OUT "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Approximate read depth (reads with MQ=255 or with bad mates are filtered)\">\n"; + } + unless ($line =~ m/FORMAT=<ID=AO/ || $line =~ m/FORMAT=<ID=AD/ || $line =~ m/FORMAT=<ID=RO/ || $line =~ m/FORMAT=<ID=DP/) { + print OUT $line,"\n"; + } + next; + } + my ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$annot,$format,@gts) = split(/\t/, $line); + next if ($chrom =~ m/alt/); + my %hash = (); + foreach $a (split(/;/,$annot)) { + my ($key,$val) = split(/=/,$a); + $hash{$key} = $val; + } + my @deschead = split(/:/,$format); + my $newformat = 'GT:DP:AD:AO:RO'; + my @newgts = (); + my $missingGT = 0; + FG:foreach my $allele_info (@gts) { + my @gtinfo = split(/:/,$allele_info); + my %gtdata; + if ($allele_info eq '.') { + push @newgts, '.:.:.:.:.'; + $missingGT ++; + next FG; + } + foreach my $i (0..$#deschead) { + $gtdata{$deschead[$i]} = $gtinfo[$i]; + } + if ($alt eq '.' || $alt eq '<NON_REF>') { + $gtdata{AO} = 0; + $gtdata{RO} = $gtdata{DP}; + $gtdata{AD} = join(",", $gtdata{RO},$gtdata{AO}); + } elsif ($gtdata{AD}){ + ($gtdata{RO},@alts) = split(/,/,$gtdata{AD}); + $gtdata{AO} = join(",",@alts); + $gtdata{DP} = $gtdata{RO}; + foreach (@alts) { + $gtdata{DP} += $_; + } + + } + if ($gtdata{DP} && $gtdata{DP} < 5) { + $missingGT ++; + } + if ($gtdata{DP} == 0 || $gtdata{GT} eq './.') { + push @newgts, '.:.:.:.:.'; + $missingGT ++; + next FG; + } + push @newgts, join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO}); + } + next if ($missingGT == scalar(@gts)); + if ($hash{END}) { + foreach $i ($pos..$hash{END}) { + print OUT join("\t",$chrom,$i,$id,$ref,'.',$score,$filter,$annot,$newformat,@newgts),"\n"; + } + }else { + print OUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; + } +} +close VCF; diff --git a/variants/gatkrunner.sh b/variants/gatkrunner.sh index 30029aee9dbf719650c4d46edeb0a20c240ad344..df24cc7a0c197c0f8570e1d7537a56e3505887ed 100755 --- a/variants/gatkrunner.sh +++ b/variants/gatkrunner.sh @@ -30,15 +30,16 @@ then usage fi -if [[ -z $SLURM_CPUS_ON_NODE ]] +NPROC=$SLURM_CPUS_ON_NODE +if [[ -z $NPROC ]] then - SLURM_CPUS_ON_NODE=1 + NPROC=`nproc` fi -if [[ -a "${index_path}/dbSnp.vcf.gz" ]] +if [[ -a "${index_path}/dbSnp.gatk4.vcf.gz" ]] then - dbsnp="${index_path}/dbSnp.vcf.gz" + dbsnp="${index_path}/dbSnp.gatk4.vcf.gz" else - echo "Missing dbSNP File: ${index_path}/dbSnp.vcf.gz" + echo "Missing dbSNP File: ${index_path}/dbSnp.gatk4.vcf.gz" usage fi if [[ -a "${index_path}/GoldIndels.vcf.gz" ]] @@ -57,9 +58,9 @@ else fi source /etc/profile.d/modules.sh -module load gatk/4.1.2.0 samtools/gcc/1.8 +module load gatk/4.1.4.0 samtools/gcc/1.8 which samtools -/cm/shared/apps/samtools/gcc/1.8/bin/samtools index -@ $SLURM_CPUS_ON_NODE ${sbam} +samtools index -@ $NPROC ${sbam} if [[ $algo == 'gatkbam_rna' ]] then @@ -67,21 +68,21 @@ then java -Xmx4g -jar $PICARD/picard.jar CleanSam INPUT=${sbam} OUTPUT=${pair_id}.clean.bam java -Xmx4g -jar $PICARD/picard.jar ReorderSam I=${pair_id}.clean.bam O=${pair_id}.sort.bam R=${reffa} CREATE_INDEX=TRUE java -Xmx4g -jar $PICARD/picard.jar AddOrReplaceReadGroups INPUT=${pair_id}.clean.bam O=${pair_id}.rg_added_sorted.bam SO=coordinate RGID=${pair_id} RGLB=tx RGPL=illumina RGPU=barcode RGSM=${pair_id} - samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.rg_added_sorted.bam + samtools index -@ $NPROC ${pair_id}.rg_added_sorted.bam gatk SplitNCigarReads -R ${reffa} -I ${pair_id}.rg_added_sorted.bam -O ${pair_id}.split.bam gatk --java-options "-Xmx32g" BaseRecalibrator -I ${pair_id}.split.bam --known-sites ${index_path}/dbSnp.gatk4.vcf.gz -R ${reffa} -O ${pair_id}.recal_data.table --use-original-qualities gatk --java-options "-Xmx32g" ApplyBQSR -I ${pair_id}.split.bam -R ${reffa} -O ${pair_id}.final.bam --use-original-qualities -bqsr ${pair_id}.recal_data.table - /cm/shared/apps/samtools/gcc/1.8/bin/samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.final.bam + samtools index -@ $NPROC ${pair_id}.final.bam elif [[ $algo == 'gatkbam' ]] then gatk --java-options "-Xmx32g" BaseRecalibrator -I ${sbam} --known-sites ${index_path}/dbSnp.gatk4.vcf.gz -R ${reffa} -O ${pair_id}.recal_data.table --use-original-qualities gatk --java-options "-Xmx32g" ApplyBQSR -I ${sbam} -R ${reffa} -O ${pair_id}.final.bam --use-original-qualities -bqsr ${pair_id}.recal_data.table - /cm/shared/apps/samtools/gcc/1.8/bin/samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.final.bam + samtools index -@ $NPROC ${pair_id}.final.bam elif [[ $algo == 'abra2' ]] then module load abra2/2.18 mkdir tmpdir - java -Xmx16G -jar /cm/shared/apps/abra2/lib/abra2.jar --in ${sbam} --in-vcf /archive/PHG/PHG_Clinical/phg_workflow/analysis/awesomeproject/GoldIndels.vcf --out ${pair_id}.final.bam --ref ${reffa} --threads $SLURM_CPUS_ON_NODE --tmpdir tmpdir - samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.final.bam + java -Xmx16G -jar /cm/shared/apps/abra2/lib/abra2.jar --in ${sbam} --in-vcf /archive/PHG/PHG_Clinical/phg_workflow/analysis/awesomeproject/GoldIndels.vcf --out ${pair_id}.final.bam --ref ${reffa} --threads $NPROC --tmpdir tmpdir + samtools index -@ $NPROC ${pair_id}.final.bam fi diff --git a/variants/germline_vc.sh b/variants/germline_vc.sh index f6c5e287ae5d6b0e175335b54ff711809fb71693..9f598d9f68435b27b3759a5c8adffe361ce3c188 100755 --- a/variants/germline_vc.sh +++ b/variants/germline_vc.sh @@ -11,13 +11,14 @@ usage() { exit 1 } OPTIND=1 # Reset OPTIND -while getopts :r:a:b:p:th opt +while getopts :r:a:b:q:p:th opt do case $opt in r) index_path=$OPTARG;; p) pair_id=$OPTARG;; a) algo=$OPTARG;; t) rna=1;; + q) pon==$OPTARG;; h) usage;; esac done @@ -29,13 +30,15 @@ shift $(($OPTIND -1)) if [[ -z $pair_id ]] || [[ -z $index_path ]]; then usage fi -if [[ -z $SLURM_CPUS_ON_NODE ]] +NPROC=$SLURM_CPUS_ON_NODE +if [[ -z $NPROC ]] then - SLURM_CPUS_ON_NODE=1 + NPROC=`nproc` fi if [[ -s "${index_path}/dbSnp.vcf.gz" ]] then dbsnp="${index_path}/dbSnp.vcf.gz" + gatk4_dbsnp="${index_path}/dbSnp.gatk4.vcf.gz" else echo "Missing dbSNP File: ${index_path}/dbSnp.vcf.gz" usage @@ -55,6 +58,12 @@ else echo "Missing Fasta File: ${index_path}/genome.fa" usage fi +if [[ -f $pon ]] +then + ponopt="--pon $pon" +else + ponopt=''; +fi source /etc/profile.d/modules.sh module load python/2.7.x-anaconda picard/2.10.3 samtools/gcc/1.8 bcftools/gcc/1.8 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14 parallel @@ -62,31 +71,38 @@ module load python/2.7.x-anaconda picard/2.10.3 samtools/gcc/1.8 bcftools/gcc/1. for i in *.bam; do if [[ ! -f ${i}.bai ]] then - samtools index -@ $SLURM_CPUS_ON_NODE $i + samtools index -@ $NPROC $i fi done if [[ $algo == 'mpileup' ]] then - threads=`expr $SLURM_CPUS_ON_NODE - 10` - bcftools mpileup --threads $threads -a 'INFO/AD,INFO/ADF,INFO/ADR,FORMAT/DP,FORMAT/SP,FORMAT/AD,FORMAT/ADF,FORMAT/ADR' -Ou -Q20 -d 99999 -C50 -f ${reffa} *.bam | bcftools call --threads 10 -vmO z -o ${pair_id}.vcf.gz + threads=`expr $NPROC - 10` + bcftools mpileup --threads $threads -a 'INFO/AD,INFO/ADF,INFO/ADR,FORMAT/DP,FORMAT/SP,FORMAT/AD,FORMAT/ADF,FORMAT/ADR' -Ou -A -d 1000000 -C50 -f ${reffa} *.bam | bcftools call -A --threads 10 -vmO z -o ${pair_id}.vcf.gz vcf-annotate -n --fill-type ${pair_id}.vcf.gz | bcftools norm -c s -f ${reffa} -w 10 -O v -o sam.vcf java -jar $PICARD/picard.jar SortVcf I=sam.vcf O=${pair_id}.sam.vcf R=${reffa} CREATE_INDEX=TRUE bgzip ${pair_id}.sam.vcf -elif [[ $algo == 'freebayes' ]] +elif [[ $algo == 'fb' ]] then module load freebayes/gcc/1.2.0 parallel/20150122 bamlist='' for i in *.bam; do bamlist="$bamlist --bam ${PWD}/${i}" done - cut -f 1 ${index_path}/genomefile.5M.txt | parallel --delay 2 -j $SLURM_CPUS_ON_NODE "freebayes -f ${index_path}/genome.fa --min-base-quality 20 --min-coverage 10 --min-alternate-fraction 0.01 -C 3 --use-best-n-alleles 3 -r {} ${bamlist} > fb.{}.vcf" - vcf-concat fb.*.vcf | vcf-sort | vcf-annotate -n --fill-type | bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.freebayes.vcf.gz - + cut -f 1 ${index_path}/genomefile.5M.txt | parallel --delay 2 -j $NPROC "freebayes -f ${index_path}/genome.fa --min-mapping-quality 0 --min-base-quality 20 --min-coverage 10 --min-alternate-fraction 0.01 -C 3 --use-best-n-alleles 3 -r {} ${bamlist} > fb.{}.vcf" + vcf-concat fb.*.vcf | vcf-sort | vcf-annotate -n --fill-type | bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.fb.vcf.gz - +elif [[ $algo == 'platypus' ]] +then + module load platypus/gcc/0.8.1 + bamlist=`join_by , *.bam` + Platypus.py callVariants --minMapQual=0 --minReads=3 --mergeClusteredVariants=1 --nCPU=$NPROC --bamFiles=${bamlist} --refFile=${reffa} --output=platypus.vcf + vcf-sort platypus.vcf |vcf-annotate -n --fill-type -n |bgzip > platypus.vcf.gz + tabix platypus.vcf.gz + bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.platypus.vcf.gz platypus.vcf.gz elif [[ $algo == 'gatk' ]] then - gatk4_dbsnp=/project/shared/bicf_workflow_ref/human/GRCh38/clinseq_prj/dbSnp.gatk4.vcf.gz user=$USER - module load gatk/4.1.2.0 + module load gatk/4.1.4.0 gvcflist='' for i in *.bam; do prefix="${i%.bam}" @@ -101,14 +117,18 @@ then gatk --java-options "-Xmx32g" GenotypeGVCFs -V gendb://gendb -R ${reffa} -D ${gatk4_dbsnp} -O gatk.vcf bcftools norm -c s -f ${reffa} -w 10 -O v gatk.vcf | vcf-annotate -n --fill-type gatk.vcf | bgzip > ${pair_id}.gatk.vcf.gz tabix ${pair_id}.gatk.vcf.gz -elif [[ $algo == 'platypus' ]] +elif [ $algo == 'mutect' ] then - module load platypus/gcc/0.8.1 - bamlist=`join_by , *.bam` - Platypus.py callVariants --minMapQual=0 --minReads=3 --mergeClusteredVariants=1 --nCPU=$SLURM_CPUS_ON_NODE --bamFiles=${bamlist} --refFile=${reffa} --output=platypus.vcf - vcf-sort platypus.vcf |vcf-annotate -n --fill-type -n |bgzip > platypus.vcf.gz - tabix platypus.vcf.gz - bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.platypus.vcf.gz platypus.vcf.gz + gatk4_dbsnp=${index_path}/clinseq_prj/dbSnp.gatk4.vcf.gz + module load gatk/4.1.4.0 + bamlist='' + for i in *.bam; do + bamlist+="-I ${i} " + done + gatk --java-options "-Xmx20g" Mutect2 $ponopt -R ${reffa} ${bamlist} --output ${pair_id}.mutect.vcf -RF AllowAllReadsReadFilter --independent-mates --tmp-dir `pwd` + #gatk --java-options "-Xmx20g" FilterMutectCalls -R ${reffa} -V ${pair_id}.mutect.vcf -O ${pair_id}.mutect.filt.vcf + vcf-sort ${pair_id}.mutect.vcf | vcf-annotate -n --fill-type | java -jar $SNPEFF_HOME/SnpSift.jar filter -p '(GEN[*].DP >= 10)' | bgzip > ${pair_id}.mutect.vcf.gz + elif [[ $algo == 'strelka2' ]] then if [[ $rna == 1 ]] @@ -124,8 +144,13 @@ then gvcflist="$gvcflist --bam ${i}" done configManta.py $gvcflist --referenceFasta ${reffa} $mode --runDir manta - manta/runWorkflow.py -m local -j $SLURM_CPUS_ON_NODE - configureStrelkaGermlineWorkflow.py $gvcflist --referenceFasta ${reffa} $mode --indelCandidates manta/results/variants/candidateSmallIndels.vcf.gz --runDir strelka - strelka/runWorkflow.py -m local -j $SLURM_CPUS_ON_NODE + manta/runWorkflow.py -m local -j $NPROC + if [[ -f manta/results/variants/candidateSmallIndels.vcf.gz ]] + then + configureStrelkaGermlineWorkflow.py $gvcflist --referenceFasta ${reffa} $mode --indelCandidates manta/results/variants/candidateSmallIndels.vcf.gz --runDir strelka + else + configureStrelkaGermlineWorkflow.py $gvcflist --referenceFasta ${reffa} $mode --runDir strelka + fi + strelka/runWorkflow.py -m local -j $NPROC bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.strelka2.vcf.gz strelka/results/variants/variants.vcf.gz fi diff --git a/variants/itdseek.sh b/variants/itdseek.sh index 6d47a7aed25fe659c87ed284079ff6c286827b16..bee8d2cedd5275a0f955ff3f2b16bc4d2375b652 100755 --- a/variants/itdseek.sh +++ b/variants/itdseek.sh @@ -9,13 +9,15 @@ usage() { exit 1 } OPTIND=1 # Reset OPTIND -while getopts :r:b:l:p:h opt +while getopts :r:b:l:p:fh opt do case $opt in r) index_path=$OPTARG;; b) sbam=$OPTARG;; p) pair_id=$OPTARG;; l) itdbed=$OPTARG;; + g) snpeffgeno=$OPTARG;; + f) filter=1;; h) usage;; esac done @@ -29,9 +31,14 @@ baseDir="`dirname \"$0\"`" if [[ -z $pair_id ]] || [[ -z $index_path ]]; then usage fi -if [[ -z $SLURM_CPUS_ON_NODE ]] +NPROC=$SLURM_CPUS_ON_NODE +if [[ -z $NPROC ]] then - SLURM_CPUS_ON_NODE=1 + NPROC=`nproc` +fi +if [[ -z $snpeffgeno ]] +then + snpeffgeno='GRCh38.86' fi if [[ -a "${index_path}/genome.fa" ]] @@ -45,11 +52,19 @@ else fi source /etc/profile.d/modules.sh - +export PATH=/project/shared/bicf_workflow_ref/seqprg/bin:$PATH module load samtools/gcc/1.8 snpeff/4.3q vcftools/0.1.14 htslib/gcc/1.8 bcftools/gcc/1.8 bedtools/2.26.0 stexe=`which samtools` -samtools view -@ $SLURM_CPUS_ON_NODE -L ${itdbed} ${sbam} | /project/shared/bicf_workflow_ref/seqprg/itdseek-1.2/itdseek.pl --refseq ${reffa} --samtools ${stexe} --bam ${sbam} | vcf-sort | bedtools intersect -header -b ${itdbed} -a stdin | bgzip > ${pair_id}.itdseek.vcf.gz +samtools view -@ $NPROC -L ${itdbed} ${sbam} | itdseek.pl --refseq ${reffa} --samtools ${stexe} --bam ${sbam} | vcf-sort | bedtools intersect -header -b ${itdbed} -a stdin | bgzip > ${pair_id}.itdseek.vcf.gz tabix ${pair_id}.itdseek.vcf.gz -bcftools norm --fasta-ref $reffa -m - -Ov ${pair_id}.itdseek.vcf.gz | java -Xmx30g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 - |bgzip > ${pair_id}.itdseek_tandemdup.vcf.gz +bcftools norm --fasta-ref $reffa -c w -m - -Ov ${pair_id}.itdseek.vcf.gz | java -Xmx30g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config $snpeffgeno - |bgzip > ${pair_id}.itdseek_tandemdup.vcf.gz + +if [[ $filter == 1 ]] +then + perl $baseDir/filter_itdseeker.pl -t ${pair_id} -d ${pair_id}.itdseek_tandemdup.vcf.gz + mv ${pair_id}.itdseek_tandemdup.vcf.gz ${pair_id}.itdseek_tandemdup.unfilt.vcf.gz + mv ${pair_id}.itdseek_tandemdup.pass.vcf ${pair_id}.itdseek_tandemdup.vcf + bgzip ${pair_id}.itdseek_tandemdup.pass.vcf +fi diff --git a/variants/msisensor.sh b/variants/msisensor.sh new file mode 100755 index 0000000000000000000000000000000000000000..ef226caeee217b1b6f3b91dfe9f42cc47907227a --- /dev/null +++ b/variants/msisensor.sh @@ -0,0 +1,42 @@ +#!/bin/bash +#svcalling.sh + +usage() { + echo "-h Help documentation for gatkrunner.sh" + echo "-r --Path to Reference Genome with the file genome.fa" + echo "-p --Prefix for output file name" + echo "Example: bash svcalling.sh -p prefix -r /path/GRCh38 -a gatk" + exit 1 +} +OPTIND=1 # Reset OPTIND +while getopts :r:l:b:p:h opt +do + case $opt in + r) index_path=$OPTARG;; + p) pair_id=$OPTARG;; + b) sbam=$OPTARG;; + n) normal=$OPTARG;; + h) usage;; + esac +done +function join_by { local IFS="$1"; shift; echo "$*"; } + +shift $(($OPTIND -1)) +baseDir="`dirname \"$0\"`" + +# Check for mandatory options +if [[ -z $sbam ]] || [[ -z $index_path ]]; then + usage +fi + +source /etc/profile.d/modules.sh +export PATH=/project/shared/bicf_workflow_ref/seqprg/bin:$PATH + +if [[ -n $normal ]] +then + msisensor2 msi -d ${index_path}/microsatellites.list -n $normal -t $sbam -o ${pair_id}.msi + +else + # -M ${index_path}/msi_tumoronly_model + msisensor2 msi -d ${index_path}/microsatellites.list -t $sbam -o ${pair_id}.msi +fi diff --git a/variants/norm_annot.sh b/variants/norm_annot.sh index 536aa9c5d41691e9d428637151e2ddbe2392b799..776c306c6bae5c965e07f648b16ecf1554e14194 100755 --- a/variants/norm_annot.sh +++ b/variants/norm_annot.sh @@ -10,12 +10,13 @@ usage() { exit 1 } OPTIND=1 # Reset OPTIND -while getopts :r:p:v:h opt +while getopts :r:p:v:sh opt do case $opt in - r) index_path=$OPTARG;; p) pair_id=$OPTARG;; v) vcf=$OPTARG;; + r) index_path=$OPTARG;; + s) skipnorm=1;; h) usage;; esac done @@ -24,7 +25,7 @@ shift $(($OPTIND -1)) baseDir="`dirname \"$0\"`" source /etc/profile.d/modules.sh -module load bedtools/2.26.0 samtools/1.6 bcftools/1.6 snpeff/4.3q +module load bedtools/2.26.0 samtools/gcc/1.8 bcftools/gcc/1.8 snpeff/4.3q if [[ -a "${index_path}/genome.fa" ]] then @@ -40,4 +41,9 @@ perl $baseDir\/uniform_vcf_gt.pl $pair_id $vcf bgzip -f ${pair_id}.uniform.vcf j=${pair_id}.uniform.vcf.gz tabix -f $j -bcftools norm --fasta-ref $reffa -m - -Oz $j -o ${pair_id}.norm.vcf.gz +if [[ skipnorm==1 ]] +then + cp $j ${pair_id}.norm.vcf.gz +else + bcftools norm --fasta-ref $reffa -m - -Oz $j -o ${pair_id}.norm.vcf.gz +fi diff --git a/variants/parse_pindel.pl b/variants/parse_pindel.pl index 79e64f0babd98d1fa211800330be873cfc74feb5..714fe9453e8db92fdfea0c378a3084ab89fb4fb7 100755 --- a/variants/parse_pindel.pl +++ b/variants/parse_pindel.pl @@ -64,7 +64,7 @@ while (my $line = <VCF>) { $gtdata{DP} += $_; } } - if (($gtdata{DP} && $gtdata{DP} < 10) || ()) { + if (($gtdata{DP} && $gtdata{DP} < 20) || ()) { $missingGT ++; } if ($gtdata{DP} == 0 || $gtdata{GT} eq './.') { push @newgts, '.:.:.:.:.'; @@ -79,12 +79,15 @@ while (my $line = <VCF>) { if ($hash{SVTYPE} eq 'DUP:TANDEM') { print DUP join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; }elsif ($hash{SVTYPE} eq 'DEL' || $hash{SVTYPE} eq 'INS') { - if (abs($hash{SVLEN}) < 50) { + if (abs($hash{SVLEN}) < 30) { print SI join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; }else { $newalt = "<".$hash{SVTYPE}.">"; print SV join("\t",$chrom,$pos,$id,'N',$newalt,$score,$filter,$annot,$newformat,@newgts),"\n"; } + }else { + $newalt = "<".$hash{SVTYPE}.">"; + print SV join("\t",$chrom,$pos,$id,'N',$newalt,$score,$filter,$annot,$newformat,@newgts),"\n"; } } close VCF; diff --git a/variants/pindel.sh b/variants/pindel.sh index 452d2585a0b8f7d80d2266a35f209c2764594432..ebbd3fe316bf36bf42c4aa6ecb3795ea397d9dcb 100755 --- a/variants/pindel.sh +++ b/variants/pindel.sh @@ -15,6 +15,7 @@ do r) index_path=$OPTARG;; p) pair_id=$OPTARG;; l) idtbed=$OPTARG;; + g) snpeffgeno=$OPTARG;; h) usage;; esac done @@ -28,9 +29,10 @@ baseDir="`dirname \"$0\"`" if [[ -z $pair_id ]] || [[ -z $index_path ]]; then usage fi -if [[ -z $SLURM_CPUS_ON_NODE ]] +NPROC=$SLURM_CPUS_ON_NODE +if [[ -z $NPROC ]] then - SLURM_CPUS_ON_NODE=1 + NPROC=`nproc` fi if [[ -a "${index_path}/genome.fa" ]] @@ -46,20 +48,31 @@ fi source /etc/profile.d/modules.sh genomefiledate=`find ${reffa} -maxdepth 0 -printf "%TY%Tm%Td\n"` +if [[ -z $snpeffgeno ]] +then + snpeffgeno='GRCh38.86' +fi -module load samtools/1.6 pindel/0.2.5-intel snpeff/4.3q bedtools/2.26.0 +module load samtools/gcc/1.8 bcftools/gcc/1.8 htslib/gcc/1.8 pindel/0.2.5-intel snpeff/4.3q bedtools/2.26.0 touch ${pair_id}.pindel.config for i in *.bam; do sname=`echo "$i" |cut -f 1 -d '.'` echo -e "${i}\t400\t${sname}" >> ${pair_id}.pindel.config done -pindel -T $SLURM_CPUS_ON_NODE -f ${reffa} -i ${pair_id}.pindel.config -o ${pair_id}.pindel_out --RP +pindel -T $NPROC -f ${reffa} -i ${pair_id}.pindel.config -o ${pair_id}.pindel_out --RP pindel2vcf -P ${pair_id}.pindel_out -r ${reffa} -R HG38 -d ${genomefiledate} -v pindel.vcf cat pindel.vcf | java -jar $SNPEFF_HOME/SnpSift.jar filter "( GEN[*].AD[1] >= 10 )" | bgzip > pindel.vcf.gz tabix pindel.vcf.gz bash $baseDir/norm_annot.sh -r ${index_path} -p pindel -v pindel.vcf.gz perl $baseDir/parse_pindel.pl ${pair_id} pindel.norm.vcf.gz -java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.indel.vcf |bgzip > ${pair_id}.pindel_indel.vcf.gz -java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.dup.vcf | bedtools intersect -header -b ${idtbed} -a stdin | bgzip > ${pair_id}.pindel_tandemdup.vcf.gz -java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.sv.vcf | bgzip > ${pair_id}.pindel_sv.vcf.gz +java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config ${snpeffgeno} ${pair_id}.indel.vcf |bgzip > ${pair_id}.pindel_indel.vcf.gz + +if [[ -a $idtbed ]] +then + java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config ${snpeffgeno} ${pair_id}.dup.vcf | bedtools intersect -header -b ${idtbed} -a stdin | bgzip > ${pair_id}.pindel_tandemdup.vcf.gz +else + java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config ${snpeffgeno} ${pair_id}.dup.vcf | bgzip > ${pair_id}.pindel_tandemdup.vcf.gz +fi + +java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config ${snpeffgeno} ${pair_id}.sv.vcf | bgzip > ${pair_id}.pindel_sv.vcf.gz diff --git a/variants/somatic_vc.sh b/variants/somatic_vc.sh index 8613f4cc8488f1289407247dfc135841536ee8f1..36a09b0a1b2f85a6bfdc96e1a51c9848cbdaa8ca 100755 --- a/variants/somatic_vc.sh +++ b/variants/somatic_vc.sh @@ -20,7 +20,7 @@ usage(){ } OPTIND=1 # Reset OPTIND -while getopts :n:t:p:r:x:y:i:j:a:b:h opt +while getopts :n:t:p:r:x:y:i:j:q:a:b:h opt do case $opt in r) index_path=$OPTARG;; @@ -33,6 +33,7 @@ do j) mtumor=$OPTARG;; a) algo=$OPTARG;; b) tbed=$OPTARG;; + q) pon==$OPTARG;; h) usage;; esac done @@ -44,9 +45,10 @@ if [[ -z $normal ]] || [[ -z $tumor ]] || [[ -z $algo ]]; then echo $normal $tumor $algo usage fi -if [[ -z $SLURM_CPUS_ON_NODE ]] +NPROC=$SLURM_CPUS_ON_NODE +if [[ -z $NPROC ]] then - SLURM_CPUS_ON_NODE=1 + NPROC=`nproc` fi #pair_id=${tid}_${nid} if [[ -z $mtumor ]] @@ -54,6 +56,12 @@ then mtumor=tumor mnormal=normal fi +if [[ -f $pon ]] +then + ponopt="--pon $pon" +else + ponopt=''; +fi if [[ -a "${index_path}/genome.fa" ]] then @@ -89,34 +97,35 @@ if [ $algo == 'strelka2' ] mkdir manta strelka configManta.py --normalBam ${normal} --tumorBam ${tumor} --referenceFasta ${reffa} --runDir manta manta/runWorkflow.py -m local -j 8 - configureStrelkaSomaticWorkflow.py --normalBam ${normal} --tumorBam ${tumor} --referenceFasta ${reffa} --targeted --indelCandidates manta/results/variants/candidateSmallIndels.vcf.gz --runDir strelka + if [[ -f manta/results/variants/candidateSmallIndels.vcf.gz ]] + then + configureStrelkaSomaticWorkflow.py --normalBam ${normal} --tumorBam ${tumor} --referenceFasta ${reffa} --targeted --indelCandidates manta/results/variants/candidateSmallIndels.vcf.gz --runDir strelka + else + configureStrelkaSomaticWorkflow.py --normalBam ${normal} --tumorBam ${tumor} --referenceFasta ${reffa} --targeted --indelCandidates --runDir strelka + fi strelka/runWorkflow.py -m local -j 8 vcf-concat strelka/results/variants/*.vcf.gz | vcf-annotate -n --fill-type -n |vcf-sort |java -jar $SNPEFF_HOME/SnpSift.jar filter "(GEN[*].DP >= 10)" | perl -pe "s/TUMOR/${tid}/g" | perl -pe "s/NORMAL/${nid}/g" |bgzip > ${pair_id}.strelka2.vcf.gz fi if [ $algo == 'virmid' ] then module load virmid/1.2 samtools/gcc/1.8 vcftools/0.1.14 - virmid -R ${reffa} -D ${tumor} -N ${normal} -s ${cosmic} -t $SLURM_CPUS_ON_NODE -M 2000 -c1 10 -c2 10 + virmid -R ${reffa} -D ${tumor} -N ${normal} -s ${cosmic} -t $NPROC -M 2000 -c1 10 -c2 10 perl $baseDir/addgt_virmid.pl ${tumor}.virmid.som.passed.vcf perl $baseDir/addgt_virmid.pl ${tumor}.virmid.loh.passed.vcf module rm java/oracle/jdk1.7.0_51 module load snpeff/4.3q vcf-concat *gt.vcf | vcf-sort | vcf-annotate -n --fill-type -n | java -jar $SNPEFF_HOME/SnpSift.jar filter '((NDP >= 10) & (DDP >= 10))' | perl -pe "s/TUMOR/${tid}/g" | perl -pe "s/NORMAL/${nid}/g" | bgzip > ${pair_id}.virmid.vcf.gz -elif [ $algo == 'mutect2' ] +elif [ $algo == 'mutect' ] then gatk4_dbsnp=${index_path}/clinseq_prj/dbSnp.gatk4.vcf.gz - user=$USER - module load gatk/4.x singularity/2.6.1 picard/2.10.3 - mkdir /tmp/${user} - export TMP_HOME=/tmp/${user} - java -XX:ParallelGCThreads=$SLURM_CPUS_ON_NODE -Djava.io.tmpdir=./ -Xmx16g -jar $PICARD/picard.jar CollectSequencingArtifactMetrics I=${tumor} O=artifact_metrics.txt R=${reffa} - singularity exec -H /tmp/${user} /project/apps/singularity-images/gatk4/gatk-4.x.simg /gatk/gatk --java-options "-Xmx20g" Mutect2 -R ${reffa} -A FisherStrand -A QualByDepth -A StrandArtifact -A DepthPerAlleleBySample -I ${tumor} -tumor ${tid} -I ${normal} -normal ${nid} --output ${tid}.mutect.vcf - singularity exec -H /tmp/${user} /project/apps/singularity-images/gatk4/gatk-4.x.simg /gatk/gatk --java-options "-Xmx20g" FilterMutectCalls -V ${tid}.mutect.vcf -O ${tid}.mutect.filt.vcf - module load snpeff/4.3q samtools/gcc/1.8 vcftools/0.1.14 - vcf-sort ${tid}.mutect.filt.vcf | vcf-annotate -n --fill-type | java -jar $SNPEFF_HOME/SnpSift.jar filter -p '(GEN[*].DP >= 10)' | bgzip > ${pair_id}.mutect.vcf.gz + module load gatk/4.1.4.0 picard/2.10.3 snpeff/4.3q samtools/gcc/1.8 vcftools/0.1.14 + java -XX:ParallelGCThreads=$NPROC -Djava.io.tmpdir=./ -Xmx16g -jar $PICARD/picard.jar CollectSequencingArtifactMetrics I=${tumor} O=artifact_metrics.txt R=${reffa} + gatk --java-options "-Xmx20g" Mutect2 $ponopt --independent-mates -RF AllowAllReadsReadFilter -R ${reffa} -I ${tumor} -tumor ${tid} -I ${normal} -normal ${nid} --output ${tid}.mutect.vcf + #gatk --java-options "-Xmx20g" FilterMutectCalls -R ${reffa} -V ${tid}.mutect.vcf -O ${tid}.mutect.filt.vcf + vcf-sort ${tid}.mutect.vcf | vcf-annotate -n --fill-type | java -jar $SNPEFF_HOME/SnpSift.jar filter -p '(GEN[*].DP >= 10)' | bgzip > ${pair_id}.mutect.vcf.gz elif [ $algo == 'varscan' ] then - module load samtools/gcc/1.8 VarScan/2.4.2 vcftools/0.1.14 + module load bcftools/gcc/1.8 samtools/gcc/1.8 VarScan/2.4.2 vcftools/0.1.14 module rm java/oracle/jdk1.7.0_51 module load snpeff/4.3q samtools mpileup -C 50 -f ${reffa} $tumor > t.mpileup diff --git a/variants/svcalling.sh b/variants/svcalling.sh index 881eefde49010df1ee7c2c0a9ec911628e7977cc..0c3b97d85cd3c35dc2a7c739e54aef0d8124755e 100755 --- a/variants/svcalling.sh +++ b/variants/svcalling.sh @@ -11,7 +11,7 @@ usage() { exit 1 } OPTIND=1 # Reset OPTIND -while getopts :r:p:b:i:n:a:h opt +while getopts :r:p:b:i:x:y:n:l:a:hf opt do case $opt in r) index_path=$OPTARG;; @@ -20,6 +20,10 @@ do i) tumor=$OPTARG;; n) normal=$OPTARG;; a) method=$OPTARG;; + x) tid=$OPTARG;; + y) nid=$OPTARG;; + f) filter=1;; + l) bed=$OPTARG;; h) usage;; esac done @@ -33,9 +37,10 @@ baseDir="`dirname \"$0\"`" if [[ -z $pair_id ]] || [[ -z $index_path ]]; then usage fi -if [[ -z $SLURM_CPUS_ON_NODE ]] +NPROC=$SLURM_CPUS_ON_NODE +if [[ -z $NPROC ]] then - SLURM_CPUS_ON_NODE=1 + NPROC=`nproc` fi if [[ -a "${index_path}/genome.fa" ]] @@ -47,65 +52,149 @@ else usage fi +if [[ -z $snpeffgeno ]] +then + snpeffgeno='GRCh38.86' +fi source /etc/profile.d/modules.sh -module load samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14 -mkdir temp +module load htslib/gcc/1.8 samtools/gcc/1.8 bcftools/gcc/1.8 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14 +export PATH=/project/shared/bicf_workflow_ref/seqprg/bin:$PATH +mkdir -p temp + +if [[ -z $tid ]] +then + tid=`samtools view -H ${sbam} |grep '^@RG' |perl -pi -e 's/\t/\n/g' |grep ID |cut -f 2 -d ':'` +fi if [[ $method == 'delly' ]] then - module load delly2/v0.7.7-multi samtools/1.6 snpeff/4.3q + #module load delly2/v0.7.7-multi if [[ -n ${normal} ]] then #RUN DELLY - echo -e "${normal}\tcontrol"> samples.tsv - echo -e "${tumor}\ttumor" >> samples.tsv - delly2 call -t BND -o delly_translocations.bcf -q 30 -g ${reffa} ${sbam} ${normal} - delly2 call -t DUP -o delly_duplications.bcf -q 30 -g ${reffa} ${sbam} ${normal} - delly2 call -t INV -o delly_inversions.bcf -q 30 -g ${reffa} ${sbam} ${normal} - delly2 call -t DEL -o delly_deletion.bcf -q 30 -g ${reffa} ${sbam} ${normal} - delly2 call -t INS -o delly_insertion.bcf -q 30 -g ${reffa} ${sbam} ${normal} - delly2 filter -t BND -o delly_tra.bcf -f somatic -s samples.tsv delly_translocations.bcf - delly2 filter -t DUP -o delly_dup.bcf -f somatic -s samples.tsv delly_duplications.bcf - delly2 filter -t INV -o delly_inv.bcf -f somatic -s samples.tsv delly_inversions.bcf - delly2 filter -t DEL -o delly_del.bcf -f somatic -s samples.tsv delly_deletion.bcf - delly2 filter -t INS -o delly_ins.bcf -f somatic -s samples.tsv delly_insertion.bcf + echo -e "${nid}\tcontrol"> samples.tsv + echo -e "${tid}\ttumor" >> samples.tsv + delly2 call -t BND -o ${pair_id}.delly_translocations.bcf -q 30 -g ${reffa} ${sbam} ${normal} + delly2 call -t DUP -o ${pair_id}.delly_duplications.bcf -q 30 -g ${reffa} ${sbam} ${normal} + delly2 call -t INV -o ${pair_id}.delly_inversions.bcf -q 30 -g ${reffa} ${sbam} ${normal} + delly2 call -t DEL -o ${pair_id}.delly_deletion.bcf -q 30 -g ${reffa} ${sbam} ${normal} + delly2 call -t INS -o ${pair_id}.delly_insertion.bcf -q 30 -g ${reffa} ${sbam} ${normal} + #delly2 filter -o ${pair_id}.delly_tra.bcf -f somatic -s samples.tsv ${pair_id}.delly_translocations.bcf else #RUN DELLY - delly2 call -t BND -o delly_translocations.bcf -q 30 -g ${reffa} ${sbam} - delly2 call -t DUP -o delly_duplications.bcf -q 30 -g ${reffa} ${sbam} - delly2 call -t INV -o delly_inversions.bcf -q 30 -g ${reffa} ${sbam} - delly2 call -t DEL -o delly_deletion.bcf -q 30 -g ${reffa} ${sbam} - delly2 call -t INS -o delly_insertion.bcf -q 30 -g ${reffa} ${sbam} - delly2 filter -t BND -o delly_tra.bcf -f germline delly_translocations.bcf - delly2 filter -t DUP -o delly_dup.bcf -f germline delly_duplications.bcf - delly2 filter -t INV -o delly_inv.bcf -f germline delly_inversions.bcf - delly2 filter -t DEL -o delly_del.bcf -f germline delly_deletion.bcf - delly2 filter -t INS -o delly_ins.bcf -f germline delly_insertion.bcf + delly2 call -t BND -o ${pair_id}.delly_translocations.bcf -q 30 -g ${reffa} ${sbam} + delly2 call -t DUP -o ${pair_id}.delly_duplications.bcf -q 30 -g ${reffa} ${sbam} + delly2 call -t INV -o ${pair_id}.delly_inversions.bcf -q 30 -g ${reffa} ${sbam} + delly2 call -t DEL -o ${pair_id}.delly_deletion.bcf -q 30 -g ${reffa} ${sbam} + delly2 call -t INS -o ${pair_id}.delly_insertion.bcf -q 30 -g ${reffa} ${sbam} + #delly2 filter -o ${pair_id}.delly_tra.bcf -f germline ${pair_id}.delly_translocations.bcf fi #MERGE DELLY AND MAKE BED - bcftools concat -a -O v delly_dup.bcf delly_inv.bcf delly_tra.bcf delly_del.bcf delly_ins.bcf| vcf-sort -t temp > delly.vcf - bgzip delly.vcf - java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 delly.vcf | java -jar $SNPEFF_HOME/SnpSift.jar filter " ( GEN[*].AD[1] >= 20 )" | bgzip > ${pair_id}.sv.vcf.gz -fi + + bcftools concat -a -O v ${pair_id}.delly_duplications.bcf ${pair_id}.delly_inversions.bcf ${pair_id}.delly_translocations.bcf ${pair_id}.delly_deletion.bcf ${pair_id}.delly_insertion.bcf | vcf-sort -t temp | bgzip > ${pair_id}.delly.svar.vcf.gz + bash $baseDir/norm_annot.sh -r ${index_path} -p ${pair_id}.delly.sv -v ${pair_id}.delly.svar.vcf.gz -s + java -jar $SNPEFF_HOME/SnpSift.jar filter "( GEN[*].DP >= 20 )" ${pair_id}.delly.sv.norm.vcf.gz | java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config ${snpeffgeno} - | bgzip > ${pair_id}.delly.vcf.gz + if [[ $filter == 1 ]] + then + zcat ${pair_id}.delly.vcf.gz | $SNPEFF_HOME/scripts/vcfEffOnePerLine.pl |java -jar $SNPEFF_HOME/SnpSift.jar extractFields - CHROM POS CHR2 END ANN[*].EFFECT ANN[*].GENE ANN[*].BIOTYPE FILTER FORMAT GEN[*] |grep -E 'gene_fusion|feature_fusion' | sort -u > ${pair_id}.dgf.txt + mv ${pair_id}.delly.vcf.gz ${pair_id}.delly.ori.vcf.gz + perl $baseDir/filter_delly.pl -t $tid -p $pair_id -i ${pair_id}.delly.ori.vcf.gz + bgzip -f ${pair_id}.delly.vcf + zgrep '#CHROM' ${pair_id}.delly.vcf.gz > ${pair_id}.delly.genefusion.txt + cat ${pair_id}.delly.potentialfusion.txt ${pair_id}.dgf.txt |sort -u >> ${pair_id}.delly.genefusion.txt + fi +elif [[ $method == 'svaba' ]] +then + if [[ -n ${normal} ]] + then + svaba run -p $NPROC -G ${reffa} -t ${sbam} -n ${normal} -a ${pair_id} + else + svaba run -p $NPROC -G ${reffa} -t ${sbam} -a ${pair_id} + fi + #Create SV FILE + vcf-concat ${pair_id}.svaba.unfiltered*sv.vcf | perl -pe 's/\.consensus|\.bam//g' | vcf-sort| bgzip > ${pair_id}.svaba.unfiltered.sv.vcf.gz + bash $baseDir/norm_annot.sh -r ${index_path} -p svaba.sv -v ${pair_id}.svaba.unfiltered.sv.vcf.gz + java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config ${snpeffgeno} svaba.sv.norm.vcf.gz | java -jar $SNPEFF_HOME/SnpSift.jar filter "( GEN[*].AO >= 20)" | bgzip > ${pair_id}.svaba.sv.vcf.gz + + vcf-concat ${pair_id}.svaba.unfiltered*indel.vcf | perl -pe 's/\.consensus|\.bam//g' | vcf-sort | java -jar $SNPEFF_HOME/SnpSift.jar filter "( SPAN >= 20)" - |bgzip > ${pair_id}.svaba.indel.vcf.gz + bash $baseDir/norm_annot.sh -r ${index_path} -p svaba.indel -v ${pair_id}.svaba.indel.vcf.gz + java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config ${snpeffgeno} svaba.indel.norm.vcf.gz | bgzip > ${pair_id}.svaba.vcf.gz -if [[ $method == 'lumpy' ]] + if [[ $filter == 1 ]] + then + zcat ${pair_id}.svaba.sv.vcf.gz | $SNPEFF_HOME/scripts/vcfEffOnePerLine.pl |java -jar $SNPEFF_HOME/SnpSift.jar extractFields - CHROM POS ALT ID ANN[*].EFFECT ANN[*].GENE ANN[*].BIOTYPE FILTER FORMAT GEN[*] |grep -E 'gene_fusion|feature_fusion' | sort -u > ${pair_id}.sgf.txt + mv ${pair_id}.svaba.vcf.gz ${pair_id}.svaba.ori.vcf.gz + perl $baseDir/filter_svaba.pl -t $tid -p ${pair_id} -i ${pair_id}.svaba.ori.vcf.gz -s ${pair_id}.svaba.sv.vcf.gz + bgzip ${pair_id}.svaba.vcf + zgrep '#CHROM' ${pair_id}.svaba.sv.vcf.gz > ${pair_id}.svaba.genefusion.txt + cat ${pair_id}.svaba.potentialfusion.txt ${pair_id}.sgf.txt | sort -u >> ${pair_id}.svaba.genefusion.txt + fi +elif [[ $method == 'lumpy' ]] then #MAKE FILES FOR LUMPY - samtools sort -@ $SLURM_CPUS_ON_NODE -n -o namesort.bam ${sbam} + samtools sort -@ $NPROC -n -o namesort.bam ${sbam} samtools view -h namesort.bam | samblaster -M -a --excludeDups --addMateTags --maxSplitCount 2 --minNonOverlap 20 -d discordants.sam -s splitters.sam > temp.sam gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" splitters.sam | samtools view -S -b - | samtools sort -o splitters.bam - gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" discordants.sam | samtools view -S -b - | samtools sort -o discordants.bam - #RUN LUMPY if [[ -n ${normal} ]] then - samtools sort -@ $SLURM_CPUS_ON_NODE -n -o namesort.bam ${normal} + samtools sort -@ $NPROC -n -o namesort.bam ${normal} samtools view -h namesort.bam | samblaster -M -a --excludeDups --addMateTags --maxSplitCount 2 --minNonOverlap 20 -d discordants.sam -s splitters.sam > temp.sam gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" splitters.sam | samtools view -S -b - | samtools sort -o normal.splitters.bam - gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" discordants.sam | samtools view -S -b - | samtools sort -o normal.discordants.bam - - speedseq sv -t $SLURM_CPUS_ON_NODE -o lumpy -R ${reffa} -B ${normal},${sbam} -D normal.discordants.bam,discordants.bam -S normal.splitters.bam,splitters.bam -x ${index_path}/exclude_alt.bed + speedseq sv -t $NPROC -o lumpy -R ${reffa} -B ${normal},${sbam} -D normal.discordants.bam,discordants.bam -S normal.splitters.bam,splitters.bam -x ${index_path}/exclude_alt.bed else - speedseq sv -t $SLURM_CPUS_ON_NODE -o lumpy -R ${reffa} -B ${sbam} -D discordants.bam -S splitters.bam -x ${index_path}/exclude_alt.bed + speedseq sv -t $NPROC -o lumpy -R ${reffa} -B ${sbam} -D discordants.bam -S splitters.bam -x ${index_path}/exclude_alt.bed + fi + java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config ${snpeffgeno} lumpy.sv.vcf.gz | java -jar $SNPEFF_HOME/SnpSift.jar filter " ( GEN[*].DV >= 20 )" | bgzip > ${pair_id}.lumpy.vcf.gz +elif [[ $method == 'pindel' ]] +then + module load pindel/0.2.5-intel + genomefiledate=`find ${reffa} -maxdepth 0 -printf "%TY%Tm%Td\n"` + touch ${pair_id}.pindel.config + for i in *.bam; do + sname=`echo "$i" |cut -f 1 -d '.'` + echo -e "${i}\t400\t${sname}" >> ${pair_id}.pindel.config + samtools index -@ $NPROC $i + done + pindel -T $NPROC -f ${reffa} -i ${pair_id}.pindel.config -o ${pair_id}.pindel_out --RP + pindel2vcf -P ${pair_id}.pindel_out -r ${reffa} -R HG38 -d ${genomefiledate} -v pindel.vcf + cat pindel.vcf | java -jar $SNPEFF_HOME/SnpSift.jar filter "( GEN[*].AD[1] >= 10 )" | bgzip > pindel.vcf.gz + tabix pindel.vcf.gz + bash $baseDir/norm_annot.sh -r ${index_path} -p pindel -v pindel.vcf.gz + perl $baseDir/parse_pindel.pl ${pair_id} pindel.norm.vcf.gz + java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config ${snpeffgeno} ${pair_id}.indel.vcf |bgzip > ${pair_id}.pindel_indel.vcf.gz + java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config ${snpeffgeno} ${pair_id}.dup.vcf | bedtools intersect -header -b ${bed} -a stdin | bgzip > ${pair_id}.pindel_tandemdup.vcf.gz + java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config ${snpeffgeno} ${pair_id}.sv.vcf | bgzip > ${pair_id}.pindel.sv.vcf.gz + if [[ $filter == 1 ]] + then + perl $baseDir/filter_pindel.pl -d ${pair_id}.pindel_tandemdup.vcf.gz -s ${pair_id}.pindel.sv.vcf.gz -i ${pair_id}.pindel_indel.vcf.gz + mv ${pair_id}.pindel_tandemdup.vcf.gz ${pair_id}.pindel_tandemdup.unfilt.vcf.gz + mv ${pair_id}.pindel_tandemdup.pass.vcf ${pair_id}.pindel_tandemdup.vcf + bgzip ${pair_id}.pindel_tandemdup.vcf + mv ${pair_id}.pindel_indel.pass.vcf ${pair_id}.pindel.vcf + bgzip ${pair_id}.pindel.vcf + mv ${pair_id}.pindel.sv.vcf.gz ${pair_id}.pindel.sv.unfilt.vcf.gz + mv ${pair_id}.pindel.sv.pass.vcf ${pair_id}.pindel.sv.vcf + bgzip ${pair_id}.pindel.sv.vcf + zgrep '#CHROM' ${pair_id}.pindel.sv.vcf.gz > ${pair_id}.pindel.genefusion.txt + zcat ${pair_id}.pindel.sv.vcf.gz | $SNPEFF_HOME/scripts/vcfEffOnePerLine.pl |java -jar $SNPEFF_HOME/SnpSift.jar extractFields - CHROM POS CHROM END ANN[*].EFFECT ANN[*].GENE ANN[*].BIOTYPE FILTER FORMAT GEN[*] |grep -E 'gene_fusion|feature_fusion' | sort -u >> ${pair_id}.pindel.genefusion.txt + fi +elif [[ $method == 'itdseek' ]] +then + stexe=`which samtools` + samtools view -@ $NPROC -L ${bed} ${sbam} | itdseek.pl --refseq ${reffa} --samtools ${stexe} --bam ${sbam} | vcf-sort | bedtools intersect -header -b ${bed} -a stdin | java -Xmx30g -jar $SNPEFF_HOME/SnpSift.jar filter "( LEN < 10000 )" | bgzip > ${pair_id}.itdseek.vcf.gz + + tabix ${pair_id}.itdseek.vcf.gz + + bcftools norm --fasta-ref $reffa -m - -Ov ${pair_id}.itdseek.vcf.gz | java -Xmx30g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config ${snpeffgeno} - |bgzip > ${pair_id}.itdseek_tandemdup.vcf.gz + if [[ $filter == 1 ]] + then + perl $baseDir/filter_itdseeker.pl -t ${pair_id} -d ${pair_id}.itdseek_tandemdup.vcf.gz + mv ${pair_id}.itdseek_tandemdup.vcf.gz ${pair_id}.itdseek_tandemdup.unfilt.vcf.gz + mv ${pair_id}.itdseek_tandemdup.pass.vcf ${pair_id}.itdseek_tandemdup.vcf + bgzip ${pair_id}.itdseek_tandemdup.vcf fi - java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 lumpy.sv.vcf.gz | java -jar $SNPEFF_HOME/SnpSift.jar filter " ( GEN[*].DV >= 20 )" | bgzip > ${pair_id}.sv.vcf.gz fi diff --git a/variants/uni_norm_annot.sh b/variants/uni_norm_annot.sh index 6a09f64e64933ac2ce2a6e98a0944dabbce6eab9..e47e0451b6661361de44fcac4e696b88eba04807 100755 --- a/variants/uni_norm_annot.sh +++ b/variants/uni_norm_annot.sh @@ -10,12 +10,13 @@ usage() { exit 1 } OPTIND=1 # Reset OPTIND -while getopts :r:p:v:h opt +while getopts :r:p:g:v:h opt do case $opt in r) index_path=$OPTARG;; p) pair_id=$OPTARG;; v) vcf=$OPTARG;; + g) snpeffgeno=$OPTARG;; h) usage;; esac done @@ -32,16 +33,22 @@ else usage fi +if [[ -z $snpeffgeno ]] +then + snpeffgeno='GRCh38.86' +fi source /etc/profile.d/modules.sh module load bedtools/2.26.0 samtools/1.6 bcftools/1.6 snpeff/4.3q +export PATH=/project/shared/bicf_workflow_ref/seqprg/bin:$PATH + perl $baseDir\/uniform_vcf_gt.pl $pair_id $vcf mv ${vcf} ${pair_id}.ori.vcf.gz bgzip -f ${pair_id}.uniform.vcf j=${pair_id}.uniform.vcf.gz tabix -f $j bcftools norm --fasta-ref $reffa -m - -Oz $j -o ${pair_id}.norm.vcf.gz -bash $baseDir/annotvcf.sh -p ${pair_id} -r $index_path -v ${pair_id}.norm.vcf.gz -/project/shared/bicf_workflow_ref/seqprg/vt/vt decompose_blocksub ${pair_id}.annot.vcf.gz -p -a -o ${pair_id}.vcf +bash $baseDir/annotvcf.sh -p ${pair_id} -r $index_path -v ${pair_id}.norm.vcf.gz -g $snpeffgeno +vt decompose_blocksub ${pair_id}.annot.vcf.gz -p -a -o ${pair_id}.vcf bgzip -f ${pair_id}.vcf diff --git a/variants/uniform_vcf_gt.pl b/variants/uniform_vcf_gt.pl index fd497db5c5da310a7497437bb73bb044798cdd35..661f7f21008897d707c753a3f9360eeacfcca108 100755 --- a/variants/uniform_vcf_gt.pl +++ b/variants/uniform_vcf_gt.pl @@ -26,7 +26,20 @@ while (my $line = <VCF>) { foreach $a (split(/;/,$annot)) { my ($key,$val) = split(/=/,$a); $hash{$key} = $val; - } + } + if ($alt =~ m/chr(\w+):(\d+)/i) { + $chr2='chr'.$1; + $p2 = $2; + $hash{CHR2} = $chr2; + $hash{'END'} = $p2; + $annot .= ";CHR2=$chr2;END=$p2"; + }elsif ($alt =~ m/CHR(\w+):(\d+)/i) { + $chr2='chr'.$1; + $p2 = $2; + $hash{CHR2} = 'chr'.$1; + $hash{END} = $2; + $annot .= ";CHR2=$chr2;END=$p2"; + } my @deschead = split(/:/,$format); my $newformat = 'GT:DP:AD:AO:RO'; my @newgts = (); @@ -42,17 +55,39 @@ while (my $line = <VCF>) { foreach my $i (0..$#deschead) { $gtdata{$deschead[$i]} = $gtinfo[$i]; } - if ($gtdata{AD}){ + if ($gtdata{AD} =~ m/\d+,\d+/){ ($gtdata{RO},@alts) = split(/,/,$gtdata{AD}); $gtdata{AO} = join(",",@alts); $gtdata{DP} = $gtdata{RO}; foreach (@alts) { $gtdata{DP} += $_; } + } elsif ($gtdata{AD} =~ m/^\d+$/){ + $gtdata{AO} = $gtdata{AD}; + $gtdata{RO} = $gtdata{DP} - $gtdata{AO}; + if ($gtdata{RO} < 0) { + $gtdata{DP} += $gtdata{AO}; + $gtdata{RO} = $gtdata{DP} - $gtdata{AO}; + } + $gtdata{AD} = join(',',$gtdata{RO},$gtdata{AO}); + } elsif (exists $gtdata{DV} && exists $gtdata{RV}) { + $gtdata{AO} = $gtdata{DV} + $gtdata{RV}; + $gtdata{RO} = $gtdata{DR} + $gtdata{RR}; + $gtdata{AD} = join(',',$gtdata{RO},$gtdata{AO}); + $gtdata{DP} = $gtdata{RO}+$gtdata{AO}; + } elsif (exists $gtdata{DR} && exists $gtdata{SR}){ + $gtdata{AO} = $gtdata{AD}; + $gtdata{DP} = $gtdata{AO} unless $gtdata{DP}; + if ($gtdata{DP} > $gtdata{AD}) { + $gtdata{RO} = $gtdata{DP} - $gtdata{AD}; + } else { + $gtdata{RO} = 0; + } + $gtdata{AD} = join(',',$gtdata{RO},$gtdata{AO}); } elsif (exists $gtdata{NR} && exists $gtdata{NV}) { - $gtdata{DP} = $gtdata{NR}; - $gtdata{AO} = $gtdata{NV}; - $gtdata{RO} = $gtdata{DP} - $gtdata{AO}; + $gtdata{DP} = $gtdata{NR}; + $gtdata{AO} = $gtdata{NV}; + $gtdata{RO} = $gtdata{DP} - $gtdata{AO}; } elsif (exists $gtdata{AO} && exists $gtdata{RO}) { $gtdata{AD} = join(',',$gtdata{RO},$gtdata{AO}); $gtdata{DP} = $gtdata{RO}; diff --git a/variants/union.sh b/variants/union.sh index 6c2edeefa94f7bfe8fa4e69a4d41f77dfc76926a..5dd626ce70070000a2e75e83380b5288a1aa110d 100755 --- a/variants/union.sh +++ b/variants/union.sh @@ -41,7 +41,7 @@ for i in ${dir}/*.vcf.gz; do if [[ $i == $HS ]] then bcftools norm -m - -O z -o hotspot.norm.vcf.gz $i - java -jar /cm/shared/apps/snpeff/4.3q/SnpSift.jar filter "(GEN[*].AD[1] > 3)" hotspot.norm.vcf.gz |bgzip > hotspot.lowfilt.vcf.gz + java -jar $SNPEFF_HOME/SnpSift.jar filter "(GEN[*].AD[1] > 3)" hotspot.norm.vcf.gz |bgzip > hotspot.lowfilt.vcf.gz bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a hotspot.lowfilt.vcf.gz -b stdin |bgzip > nooverlap.hotspot.vcf.gz list2="$list2 nooverlap.hotspot.vcf.gz" fi diff --git a/variants/unionvcf.pl b/variants/unionvcf.pl index c63fbae307a92537c0eca3500c617f1fdba5034f..84eedd53a0f63723b3207e7d430a2b0002d700b2 100755 --- a/variants/unionvcf.pl +++ b/variants/unionvcf.pl @@ -3,7 +3,7 @@ my $headerfile = shift @ARGV; my @vcffiles = @ARGV; -my @callers = ('fb','mutect','gatk','platypus','strelka2','shimmer','virmid'); +my @callers = ('fb','mutect','gatk','platypus','strelka2','shimmer','pindel','svaba'); %algos = map {$_=>1} @callers; open HEADER, "<$headerfile" or die $!;