diff --git a/alignment/bamqc.sh b/alignment/bamqc.sh index 52f904e385e2e536a3f8e8858d0e2dbb179536b1..b5257a772d730efa4d686ecf4b4b4e787109cad2 100644 --- a/alignment/bamqc.sh +++ b/alignment/bamqc.sh @@ -13,7 +13,7 @@ usage() { exit 1 } OPTIND=1 # Reset OPTIND -while getopts :r:b:c:n:p:s:d:h opt +while getopts :r:b:c:n:p:e:s:d:h opt do case $opt in r) index_path=$OPTARG;; @@ -22,6 +22,7 @@ do n) nuctype=$OPTARG;; p) pair_id=$OPTARG;; d) dedup=$OPTARG;; + e) version=$OPTARG;; s) skiplc=1;; h) usage;; esac @@ -34,6 +35,11 @@ shift $(($OPTIND -1)) # usage #fi +if [[ -z $version ]] +then + version='NA' +fi + source /etc/profile.d/modules.sh module load samtools/gcc/1.10 fastqc/0.11.8 samtools flagstat ${sbam} > ${pair_id}.flagstat.txt @@ -45,27 +51,36 @@ if [[ -z $NPROC ]] then NPROC=`nproc` fi +threads=`expr $NPROC - 10` if [[ $dedup == 1 ]] then mv $sbam ori.bam - samtools view -@ $NPROC -F 1024 -b -o ${sbam} ori.bam + samtools view -@ $threads -F 1024 -b -o ${sbam} ori.bam fi tmpdir=`pwd` if [[ $nuctype == 'dna' ]]; then module load bedtools/2.29.2 picard/2.10.3 - bedtools coverage -sorted -g ${index_path}/genomefile.txt -a ${bed} -b ${sbam} -hist > ${pair_id}.covhist.txt + bedtools coverage -a ${bed} -b ${sbam} -hist > ${pair_id}.covhist.txt grep ^all ${pair_id}.covhist.txt > ${pair_id}.genomecov.txt perl $baseDir/calculate_depthcov.pl ${pair_id}.covhist.txt if [[ -z $skiplc ]] then - samtools view -@ $NPROC -b -L ${bed} -o ${pair_id}.ontarget.bam ${sbam} - samtools index -@ $NPROC ${pair_id}.ontarget.bam + samtools view -@ $threads -b -L ${bed} -o ${pair_id}.ontarget.bam ${sbam} + samtools index -@ $threads ${pair_id}.ontarget.bam samtools flagstat ${pair_id}.ontarget.bam > ${pair_id}.ontarget.flagstat.txt - samtools view -@ $NPROC -b -q 1 ${sbam} | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt + samtools view -@ $threads -b -q 1 ${sbam} | bedtools coverage -hist -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt + java -Xmx64g -Djava.io.tmpdir=${tmpdir} -XX:ParallelGCThreads=$threads -jar $PICARD/picard.jar EstimateLibraryComplexity BARCODE_TAG=RG I=${sbam} OUTPUT=${pair_id}.libcomplex.txt TMP_DIR=${tmpdir} #java -Xmx64g -Djava.io.tmpdir=${tmpdir} -jar $PICARD/picard.jar CollectAlignmentSummaryMetrics R=${index_path}/genome.fa I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt TMP_DIR=${tmpdir} - #java -Xmx64g -Djava.io.tmpdir=${tmpdir} -XX:ParallelGCThreads=$NPROC -jar $PICARD/picard.jar EstimateLibraryComplexity I=${sbam} OUTPUT=${pair_id}.libcomplex.txt TMP_DIR=${tmpdir} - #samtools view -@ $NPROC ${sbam} | awk '{sum+=$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt + #samtools view -@ $threads ${sbam} | awk '{sum+=$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt fi #java -Xmx64g -Djava.io.tmpdir=${tmpdir} -jar $PICARD/picard.jar CollectInsertSizeMetrics INPUT=${sbam} HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${index_path}/genome.fa OUTPUT=${pair_id}.hist.txt TMP_DIR=${tmpdir} + if [[ $index_path/reference_info.txt ]] + then + perl $baseDir/sequenceqc_dna.pl -e ${version} -r $index_path ${pair_id}.genomecov.txt + else + touch ${pair_id}.genomecov.txt + fi +else + perl $baseDir/sequenceqc_rna.pl -e ${version} -r $index_path ${pair_id}.flagstat.txt fi diff --git a/alignment/sequenceqc_dna.pl b/alignment/sequenceqc_dna.pl new file mode 100755 index 0000000000000000000000000000000000000000..6634923731839925d2e9a57c1169c35ebcad0e70 --- /dev/null +++ b/alignment/sequenceqc_dna.pl @@ -0,0 +1,141 @@ +#!/usr/bin/perl -w +#sequenceqc_alignment.p + +use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); +my %opt = (); +my $results = GetOptions (\%opt,'refdir|r=s','help|h','gitdir|e=s','user|u=s'); + +my @statfiles = @ARGV; + +my $fileowner = $opt{user}; +my $gittag = 'v5'; +if ($opt{gitdir}) { + $gittag = $opt{gitdir}; +}elsif($ENV{'gitv'}) { + $gittag = $ENV{'gitv'}; +} +foreach $sfile (@statfiles) { + $sfile =~ m/(\S+)\.genomecov.txt/; + my $prefix = $1; + my %hash; + open FLAG, "<$prefix\.trimreport.txt"; + while (my $line = <FLAG>) { + chomp($line); + my ($file,$raw,$trim) = split(/\t/,$line); + $hash{rawct} += $raw; + } + open FLAG, "<$prefix\.flagstat.txt" or die $!; + while (my $line = <FLAG>) { + chomp($line); + if ($line =~ m/(\d+) \+ \d+ in total/) { + $hash{total} = $1; + }elsif ($line =~ m/(\d+) \+ \d+ read1/) { + $hash{pairs} = $1; + }elsif ($line =~ m/(\d+) \+ \d+ mapped/) { + $hash{maprate} = 100*sprintf("%.4f",$1/$hash{total}); + }elsif ($line =~ m/(\d+) \+ \d+ properly paired/) { + $hash{propair} = 100*sprintf("%.4f",$1/$hash{total}); + } + } + close FLAG; + open FLAG, "<$prefix\.ontarget.flagstat.txt" or die $!; + while (my $line = <FLAG>) { + chomp($line); + if ($line =~ m/(\d+) \+ \d+ in total/) { + $hash{ontarget} = $1; + } + } + unless ($hash{rawct}) { + $hash{rawct} = $hash{total}; + } + my %lc; + open DUP, "<$prefix.libcomplex.txt" or die $!; + while (my $line = <DUP>) { + chomp($line); + if ($line =~ m/## METRICS/) { + $header = <DUP>; + $nums = <DUP>; + chomp($header); + chomp($nums); + my @stats = split(/\t/,$header); + my @row = split(/\t/,$nums); + my %info; + foreach my $i (0..$#stats) { + $info{$stats[$i]} = $row[$i]; + } + $lc{TOTREADSLC} += $info{UNPAIRED_READS_EXAMINED} + $info{READ_PAIRS_EXAMINED}; + $hash{libsize} = $info{ESTIMATED_LIBRARY_SIZE}; + $lc{TOTDUPLC} += $info{UNPAIRED_READ_DUPLICATES} + $info{READ_PAIR_DUPLICATES}; + } + } + close DUP; + $hash{percdups} = sprintf("%.4f",$lc{TOTDUPLC}/$lc{TOTREADSLC}); + my %cov; + open COV, "<$prefix\.genomecov.txt" or die $!; + my $sumdepth; + my $totalbases; + while (my $line = <COV>) { + chomp($line); + my ($all,$depth,$bp,$total,$percent) = split(/\t/,$line); + $cov{$depth} = $percent; + $sumdepth += $depth*$bp; + $totalbases = $total unless ($totalbases); + } + my $avgdepth = sprintf("%.0f",$sumdepth/$totalbases); + my @depths = sort {$a <=> $b} keys %cov; + my @perc = @cov{@depths}; + my @cum_sum = cumsum(@perc); + my $median = 0; + foreach my $i (0..$#cum_sum) { + if ($cum_sum[$i] < 0.5) { + $median = $i; + } + } + $hash{'perc100x'} = 100*sprintf("%.4f",1-$cum_sum[100]); + $hash{'perc200x'} = 100*sprintf("%.4f",1-$cum_sum[200]); + $hash{'perc500x'} = 100*sprintf("%.4f",1-$cum_sum[500]); + + #### Begin File Information ######## + my $status = 'PASS'; + $status = 'FAIL' if ($hash{maprate} < 0.90 && $hash{'perc100x'} < 0.90); + my @stats = stat("$prefix\.flagstat.txt"); + my ($day,$month,$year) = (localtime($stats[9]))[3,4,5]; + $year += 1900; + $month++; + $date = join("-",$year,sprintf("%02s",$month),sprintf("%02s",$day)); + $fileowner = 's'.$stats[4] unless $fileowner; + $hash{status} = $status; + $hash{date}=$date; + $hash{fileowner} = $fileowner; + ##### End File Information ######## + ##### START separateFilesPerSample ###### + open OUT, ">".$prefix.".sequence.stats.txt" or die $!; + print OUT join("\n", "Sample\t".$prefix,"Total_Raw_Count\t".$hash{rawct}, + "On_Target\t".$hash{ontarget},"Map_Rate\t".$hash{maprate}, + "Properly_Paired\t".$hash{propair}, + "Percent_Duplicates\t".sprintf("%.2f",100*$hash{percdups}), + "Percent_on_Target\t".sprintf("%.2f",100*$hash{ontarget}/$hash{rawct}), + "All_Average_Depth\t".$avgdepth,"All_Median_Depth\t".$median, + "Percent_over_100x\t".$hash{'perc100x'}, + "Percent_over_200x\t".$hash{'perc200x'}, + "Percent_over_500x\t".$hash{'perc500x'}, + "Alignment_Status\t".$hash{status},"Alignment_Date\t".$hash{date}, + "File_Owner\t".$hash{fileowner},"Workflow_Version\t".$gittag),"\n"; + close OUT; + if (-e "$opt{refdir}\/reference_info.txt") { + system(qq{cat $opt{refdir}\/reference_info.txt >> $prefix\.sequence.stats.txt}); + } + ##### END separateFilesPerSample ###### +} + + +sub cumsum { + my @nums = @_; + my @cumsum = (); + my $mid = 0; + for my $i (0..$#nums) { + $mid += $nums[$i]; + push(@cumsum,$mid); + } + return @cumsum; +} diff --git a/alignment/sequenceqc_rna.pl b/alignment/sequenceqc_rna.pl new file mode 100755 index 0000000000000000000000000000000000000000..906b802404a6ba82cd463ca09ebe92f1d4d6c931 --- /dev/null +++ b/alignment/sequenceqc_rna.pl @@ -0,0 +1,78 @@ +#!/usr/bin/perl -w +#sequenceqc_rnaseq.pl + +use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); +my %opt = (); +my $results = GetOptions (\%opt,'refdir|r=s','help|h','gitdir|e=s','user|u=s'); + +my @files = @ARGV; +chomp(@files); + +my $fileowner = $opt{user}; +my $gittag = 'v5'; +if ($opt{gitdir}) { + $gittag = $opt{gitdir}; +}elsif($ENV{'gitv'}) { + $gittag = $ENV{'gitv'}; +} + +foreach my $file (@files) { + chomp($file); + $file =~ m/(\S+)\.flagstat.txt/; + my @directory = split(/\//, $file); + $prefix = $directory[-1]; + my $sample = (split(/\./,$prefix))[0]; + open OUT, ">".$sample.".sequence.stats.txt" or die;#separateFilesPerSample + $hisatfile = $file; + $hisatfile =~ s/flagstat/alignerout/; + open HISAT, "<$hisatfile"; + my ($total, $pairs,$read2ct,$maprate,$concorrate); + while (my $line = <HISAT>) { + chomp($line); + if ($line =~ m/(\d+) reads; of these:/) { + $total = $1; + }elsif ($line =~ m/Number of input reads\s+\|\s+(\d+)/) { + $total = $1; + }elsif ($line =~ m/(\d+) \S+ were paired; of these:/) { + $pairs = $1; + $total = $pairs*2 if ($total == $pairs); + }elsif ($line =~ m/(\d+) pairs aligned concordantly 0 times; of these:/) { + $concorrate = sprintf("%.4f",1-($1/$pairs)); + }elsif ($line =~ m/(\S+)% overall alignment rate/) { + $maprate = $1/100; + } + } + close HISAT; + open IN, "<$file" or die $!; + while ($line = <IN>) { + chomp($line); + if ($line =~ m/(\d+) \+ \d+ in total/) { + $total = $1 unless $total; + }elsif ($line =~ m/(\d+) \+ \d+ read1/) { + $pairs = $1; + }elsif ($line =~ m/(\d+) \+ \d+ read2/) { + $read2ct = $1; + }elsif ($line =~ m/(\d+) \+ \d+ mapped\s+\((\S+)%.+\)/) { + $maprate = sprintf("%.2f",$2/100) unless ($maprate); + }elsif ($line =~ m/(\d+) \+ \d+ properly paired\s+\((\S+)%.+\)/) { + $concorrate = sprintf("%.2f",$2/100) unless ($concorrate); + } + } + close IN; + + my @stats=stat($file); + my ($day,$month,$year) = (localtime($stats[9]))[3,4,5]; + $year += 1900; + $month++; + $date = join("-",$year,sprintf("%02s",$month),sprintf("%02s",$day)); + $fileowner = 's'.$stats[4] unless $fileowner; + $total = $pairs*2 if ($total == $pairs); + my $status = 'PASS'; + $status = 'FAIL' if ($maprate < 0.90); + $status = 'FAIL' if ($total < 6000000); + + print OUT join("\n","Sample\t".$sample,"Total_Raw_Count\t".$total, "Read1_Map\t".$pairs,"Read2_Map\t".$read2ct, + "Map_Rate\t".$maprate,"Concordant_Rate\t".$concorrate,"Alignment_Status\t".$status,"Alignment_Date\t".$date, + "File_Owner\t".$fileowner,"Workflow_Version\t".$gittag),"\n"; + system(qq{cat $opt{refdir}\/reference_info.txt >> $prefix\.sequence.stats.txt}); +} diff --git a/alignment/starfusion.sh b/alignment/starfusion.sh index a9f984c59e288df27ea45277fb2c4848449564c5..c143ea31bb20bb6e1ee5c60fac3c7b52743228e2 100644 --- a/alignment/starfusion.sh +++ b/alignment/starfusion.sh @@ -53,22 +53,19 @@ then refgeno=${index_path}/CTAT_lib_trinity1.6 trinity /usr/local/src/STAR-Fusion/STAR-Fusion --min_sum_frags 3 --CPU $NPROC --genome_lib_dir ${refgeno} --left_fq ${fq1} --right_fq ${fq2} --examine_coding_effect --output_dir ${pair_id}_star_fusion cp ${pair_id}_star_fusion/star-fusion.fusion_predictions.abridged.coding_effect.tsv ${pair_id}.starfusion.txt + module load singularity/3.0.2 + export PYENSEMBL_CACHE_DIR="/project/shared/bicf_workflow_ref/singularity_images" + cut -f 5-8 ${pair_id}.starfusion.txt |perl -pe 's/\^|:/\t/g' | awk '{print "singularity exec /project/shared/bicf_workflow_ref/singularity_images/agfusion.simg agfusion annotate -db /project/shared/bicf_workflow_ref/singularity_images/pyensembl/GRCh38/ensembl92/agfusion.homo_sapiens.92.db -g5", $1,"-j5",$4,"-g3",$6,"-j3",$9,"-o",$1"_"$4"_"$6"_"$9}' |grep -v 'LeftGene' |sh else - module add star/2.5.2b - refgeno=${index_path}/CTAT_lib/ + refgeno=${index_path}/CTAT_resource_lib STAR-Fusion --genome_lib_dir ${refgeno} --min_sum_frags 3 --left_fq ${fq1} --right_fq ${fq2} --output_dir ${pair_id}_star_fusion &> star_fusion.err cp ${pair_id}_star_fusion/star-fusion.fusion_candidates.final.abridged ${pair_id}.starfusion.txt + cut -f 5-8 ${pair_id}.starfusion.txt |perl -pe 's/\^|:/\t/g' | awk '{print "agfusion annotate -db agfusion.homo_sapiens.87.db -g5", $1,"-j5",$4,"-g3",$6,"-j3",$9,"-o",$1"_"$4"_"$6"_"$9}' |grep -v 'LeftGene' |sh fi -module load singularity/3.0.2 -export PYENSEMBL_CACHE_DIR="/project/shared/bicf_workflow_ref/singularity_images" -cut -f 5-8 ${pair_id}.starfusion.txt |perl -pe 's/\^|:/\t/g' | awk '{print "singularity exec /project/shared/bicf_workflow_ref/singularity_images/agfusion.simg agfusion annotate -db /project/shared/bicf_workflow_ref/singularity_images/pyensembl/GRCh38/ensembl92/agfusion.homo_sapiens.92.db -g5", $1,"-j5",$4,"-g3",$6,"-j3",$9,"-o",$1"_"$4"_"$6"_"$9}' |grep -v 'LeftGene' |sh - if [[ $filter == 1 ]] then cut -f 6,8 ${pair_id}.starfusion.txt |grep -v Breakpoint |perl -pe 's/\t/\n/g' |awk -F ':' '{print $1"\t"$2-1"\t"$2}' > temp.bed bedtools intersect -wao -a temp.bed -b ${index_path}/cytoBand.txt |cut -f 1,2,7 > cytoband_pos.txt perl $baseDir/filter_genefusions.pl -p ${pair_id} -r ${index_path} -f ${pair_id}.starfusion.txt fi - - diff --git a/variants/filter_cnvkit.pl b/variants/filter_cnvkit.pl index ef06a3e56e17a58ef415b98aa54a875c95476690..7a87800f5f511963b685f5c8b2e76e44ba9c11c5 100755 --- a/variants/filter_cnvkit.pl +++ b/variants/filter_cnvkit.pl @@ -78,7 +78,6 @@ while (my $line = <IN>) { foreach my $j (0..$#row) { $hash{$colnames[$j]} = $row[$j]; } - next if ($hash{chromosome} eq 'chrX' && $hash{cn} == 1); my $key = $hash{chromosome}.":".$hash{start}."-".$hash{end}; my $geneids = $hash{gene}; my %genes; diff --git a/variants/gatkrunner.sh b/variants/gatkrunner.sh index df24cc7a0c197c0f8570e1d7537a56e3505887ed..0e32b5de34e22243642cb378d2132cae540541d6 100755 --- a/variants/gatkrunner.sh +++ b/variants/gatkrunner.sh @@ -62,27 +62,6 @@ module load gatk/4.1.4.0 samtools/gcc/1.8 which samtools samtools index -@ $NPROC ${sbam} -if [[ $algo == 'gatkbam_rna' ]] -then - module load picard/2.10.3 - java -Xmx4g -jar $PICARD/picard.jar CleanSam INPUT=${sbam} OUTPUT=${pair_id}.clean.bam - java -Xmx4g -jar $PICARD/picard.jar ReorderSam I=${pair_id}.clean.bam O=${pair_id}.sort.bam R=${reffa} CREATE_INDEX=TRUE - java -Xmx4g -jar $PICARD/picard.jar AddOrReplaceReadGroups INPUT=${pair_id}.clean.bam O=${pair_id}.rg_added_sorted.bam SO=coordinate RGID=${pair_id} RGLB=tx RGPL=illumina RGPU=barcode RGSM=${pair_id} - samtools index -@ $NPROC ${pair_id}.rg_added_sorted.bam - gatk SplitNCigarReads -R ${reffa} -I ${pair_id}.rg_added_sorted.bam -O ${pair_id}.split.bam - gatk --java-options "-Xmx32g" BaseRecalibrator -I ${pair_id}.split.bam --known-sites ${index_path}/dbSnp.gatk4.vcf.gz -R ${reffa} -O ${pair_id}.recal_data.table --use-original-qualities - gatk --java-options "-Xmx32g" ApplyBQSR -I ${pair_id}.split.bam -R ${reffa} -O ${pair_id}.final.bam --use-original-qualities -bqsr ${pair_id}.recal_data.table - samtools index -@ $NPROC ${pair_id}.final.bam -elif [[ $algo == 'gatkbam' ]] -then - gatk --java-options "-Xmx32g" BaseRecalibrator -I ${sbam} --known-sites ${index_path}/dbSnp.gatk4.vcf.gz -R ${reffa} -O ${pair_id}.recal_data.table --use-original-qualities - gatk --java-options "-Xmx32g" ApplyBQSR -I ${sbam} -R ${reffa} -O ${pair_id}.final.bam --use-original-qualities -bqsr ${pair_id}.recal_data.table - samtools index -@ $NPROC ${pair_id}.final.bam - -elif [[ $algo == 'abra2' ]] -then - module load abra2/2.18 - mkdir tmpdir - java -Xmx16G -jar /cm/shared/apps/abra2/lib/abra2.jar --in ${sbam} --in-vcf /archive/PHG/PHG_Clinical/phg_workflow/analysis/awesomeproject/GoldIndels.vcf --out ${pair_id}.final.bam --ref ${reffa} --threads $NPROC --tmpdir tmpdir - samtools index -@ $NPROC ${pair_id}.final.bam -fi +gatk --java-options "-Xmx32g" BaseRecalibrator -I ${sbam} --known-sites ${index_path}/dbSnp.gatk4.vcf.gz -R ${reffa} -O ${pair_id}.recal_data.table --use-original-qualities +gatk --java-options "-Xmx32g" ApplyBQSR -I ${sbam} -R ${reffa} -O ${pair_id}.final.bam --use-original-qualities -bqsr ${pair_id}.recal_data.table +samtools index -@ $NPROC ${pair_id}.final.bam diff --git a/variants/germline_vc.sh b/variants/germline_vc.sh index 38ce8eeb0d1bd0cc26d1108f71372a9c7f343885..09e14265bc3d3c5b5464afdcd7152c1f4e1f19bb 100755 --- a/variants/germline_vc.sh +++ b/variants/germline_vc.sh @@ -66,6 +66,13 @@ else ponopt=''; fi +if [[ -n $tbed ]] +then + interval=$tbed +else + interval=`cat ${reffa}.fai |cut -f 1 |grep -v decoy |grep -v 'HLA' |grep -v alt |grep -v 'chrUn' |grep -v 'random' | perl -pe 's/\n/ -L /g' |perl -pe 's/-L $//'` +fi + source /etc/profile.d/modules.sh module load python/2.7.x-anaconda picard/2.10.3 samtools/gcc/1.8 bcftools/gcc/1.8 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14 parallel @@ -108,15 +115,14 @@ then for i in *.bam; do prefix="${i%.bam}" echo ${prefix} - gatk --java-options "-Xmx32g" HaplotypeCaller -R ${reffa} -I ${i} -A FisherStrand -A QualByDepth -A DepthPerAlleleBySample -A TandemRepeat --emit-ref-confidence GVCF -O haplotypecaller.vcf.gz + gatk --java-options "-Xmx32g" HaplotypeCaller -R ${reffa} -I ${i} -A FisherStrand -A QualByDepth -A DepthPerAlleleBySample -A TandemRepeat --emit-ref-confidence GVCF -G StandardAnnotation -G AS_StandardAnnotation -G StandardHCAnnotation -O haplotypecaller.vcf.gz -L $interval java -jar $PICARD/picard.jar SortVcf I=haplotypecaller.vcf.gz O=${prefix}.gatk.g.vcf R=${reffa} CREATE_INDEX=TRUE gvcflist="$gvcflist -V ${prefix}.gatk.g.vcf" done - interval=`cat ${reffa}.fai |cut -f 1 |grep -v decoy |grep -v 'HLA' |grep -v alt |grep -v 'chrUn' |grep -v 'random' | perl -pe 's/\n/ -L /g' |perl -pe 's/-L $//'` - gatk --java-options "-Xmx32g" GenomicsDBImport $gvcflist --genomicsdb-workspace-path gendb -L $interval - gatk --java-options "-Xmx32g" GenotypeGVCFs -V gendb://gendb -R ${reffa} -D ${gatk4_dbsnp} -O gatk.vcf - bcftools norm -c s -f ${reffa} -w 10 -O v gatk.vcf | vcf-annotate -n --fill-type gatk.vcf | bgzip > ${pair_id}.gatk.vcf.gz + gatk --java-options "-Xmx32g" GenomicsDBImport $gvcflist --genomicsdb-workspace-path gendb -L $interval --reader-threads $NPROC + gatk --java-options "-Xmx32g" GenotypeGVCFs -V gendb://gendb -R ${reffa} -D ${gatk4_dbsnp} -O gatk.vcf -L $interval + bcftools norm -c s -f ${reffa} -w 10 -O v gatk.vcf | vcf-annotate -n --fill-type | bgzip > ${pair_id}.gatk.vcf.gz tabix ${pair_id}.gatk.vcf.gz elif [ $algo == 'mutect' ] then @@ -126,10 +132,8 @@ then for i in *.bam; do bamlist+="-I ${i} " done - gatk --java-options "-Xmx20g" Mutect2 $ponopt -R ${reffa} ${bamlist} --output ${pair_id}.mutect.vcf -RF AllowAllReadsReadFilter --independent-mates --tmp-dir `pwd` - #gatk --java-options "-Xmx20g" FilterMutectCalls -R ${reffa} -V ${pair_id}.mutect.vcf -O ${pair_id}.mutect.filt.vcf + gatk --java-options "-Xmx20g" Mutect2 $ponopt -R ${reffa} ${bamlist} --output ${pair_id}.mutect.vcf -RF AllowAllReadsReadFilter --independent-mates --tmp-dir `pwd` -L $interval vcf-sort ${pair_id}.mutect.vcf | vcf-annotate -n --fill-type | java -jar $SNPEFF_HOME/SnpSift.jar filter -p '(GEN[*].DP >= 10)' | bgzip > ${pair_id}.mutect.vcf.gz - elif [[ $algo == 'strelka2' ]] then opt='' diff --git a/variants/somatic_vc.sh b/variants/somatic_vc.sh index 4631780783270f6c1776f1b7f94788921553be51..40843d0d50000b417506e757acaa7dd7dd4fd18f 100755 --- a/variants/somatic_vc.sh +++ b/variants/somatic_vc.sh @@ -88,6 +88,22 @@ else fi baseDir="`dirname \"$0\"`" +if [[ -n $tbed ]] +then + interval=$tbed +else + interval=`cat ${reffa}.fai |cut -f 1 |grep -v decoy |grep -v 'HLA' |grep -v alt |grep -v 'chrUn' |grep -v 'random' | perl -pe 's/\n/ -L /g' |perl -pe 's/-L $//'` +fi +if [[ -z $tid ]] +then + tid=`samtools view -H ${tumor} |grep '^@RG' |perl -pi -e 's/\t/\n/g' |grep ID |cut -f 2 -d ':'` +fi +if [[ -z $nid ]] +then + nid=`samtools view -H ${normal} |grep '^@RG' |perl -pi -e 's/\t/\n/g' |grep ID |cut -f 2 -d ':'` +fi + + source /etc/profile.d/modules.sh module load htslib/gcc/1.8 export PATH=/project/shared/bicf_workflow_ref/seqprg/bin:$PATH @@ -126,8 +142,7 @@ then gatk4_dbsnp=${index_path}/clinseq_prj/dbSnp.gatk4.vcf.gz module load gatk/4.1.4.0 picard/2.10.3 snpeff/4.3q samtools/gcc/1.8 vcftools/0.1.14 java -XX:ParallelGCThreads=$NPROC -Djava.io.tmpdir=./ -Xmx16g -jar $PICARD/picard.jar CollectSequencingArtifactMetrics I=${tumor} O=artifact_metrics.txt R=${reffa} - gatk --java-options "-Xmx20g" Mutect2 $ponopt --independent-mates -RF AllowAllReadsReadFilter -R ${reffa} -I ${tumor} -tumor ${tid} -I ${normal} -normal ${nid} --output ${tid}.mutect.vcf - #gatk --java-options "-Xmx20g" FilterMutectCalls -R ${reffa} -V ${tid}.mutect.vcf -O ${tid}.mutect.filt.vcf + gatk --java-options "-Xmx20g" Mutect2 $ponopt --independent-mates -RF AllowAllReadsReadFilter -R ${reffa} -I ${tumor} -tumor ${tid} -I ${normal} -normal ${nid} --output ${tid}.mutect.vcf -L $interval vcf-sort ${tid}.mutect.vcf | vcf-annotate -n --fill-type | java -jar $SNPEFF_HOME/SnpSift.jar filter -p '(GEN[*].DP >= 10)' | bgzip > ${pair_id}.mutect.vcf.gz elif [ $algo == 'varscan' ] then