diff --git a/obselete/combine_variants.union.sh b/obselete/combine_variants.union.sh new file mode 100644 index 0000000000000000000000000000000000000000..ea447574cd0a4a749ad1eccde1a77aaf5853f1fb --- /dev/null +++ b/obselete/combine_variants.union.sh @@ -0,0 +1,66 @@ +#!/bin/bash +#union.sh + +usage() { + echo "-h Help documentation for gatkrunner.sh" + echo "-r --Reference Genome: GRCh38 or GRCm38" + echo "-p --Prefix for output file name" + echo "Example: bash hisat.sh -p prefix -r /path/GRCh38" + exit 1 +} +OPTIND=1 # Reset OPTIND +while getopts :r:p:h opt +do + case $opt in + r) index_path=$OPTARG;; + p) pair_id=$OPTARG;; + h) usage;; + esac +done +function join_by { local IFS="$1"; shift; echo "$*"; } +shift $(($OPTIND -1)) +baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +module load gatk/3.7 python/2.7.x-anaconda bedtools/2.26.0 snpeff/4.3q samtools/1.6 vcftools/0.1.14 + +HS=*.hotspot.vcf.gz +list1=`ls *vcf.gz|grep -v hotspot` +list2=`ls *vcf.gz|grep -v hotspot` +varlist='' +calllist='' +for i in *.vcf.gz; do + EXT="${i#*.}" + CALL="${EXT%%.*}" + calllist="$calllist $CALL" + tabix $i + if [[ $i == $HS ]] + then + bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a $i -b stdin |bgzip > hotspot.nooverlap.vcf.gz + tabix hotspot.nooverlap.vcf.gz + list2="$list2 hotspot.nooverlap.vcf.gz" + varlist="$varlist --variant:$CALL hotspot.nooverlap.vcf.gz" + else + varlist="$varlist --variant:$CALL $i" + fi +done + +bedtools multiinter -i $list2 -names $calllist | cut -f 1,2,3,5 | bedtools sort -i stdin | bedtools merge -c 4 -o distinct > ${pair_id}_integrate.bed + +priority='ssvar' +if [[ *.platypus.vcf.gz ]] +then + priority="$priority,platypus" +fi +priority="$priority,sam,gatk" +if [[ -f *.hotspot.vcf.gz ]] +then + priority="$priority,hotspot" +fi + +java -Xmx32g -jar $GATK_JAR -R ${index_path}/genome.fa -T CombineVariants --filteredrecordsmergetype KEEP_UNCONDITIONAL $varlist -genotypeMergeOptions PRIORITIZE -priority $priority -o ${pair_id}.int.vcf + +perl $baseDir/uniform_integrated_vcf.pl ${pair_id}.int.vcf +bgzip ${pair_id}_integrate.bed +tabix ${pair_id}_integrate.bed.gz +bgzip ${pair_id}.uniform.vcf +tabix ${pair_id}.uniform.vcf.gz +bcftools annotate -a ${pair_id}_integrate.bed.gz --columns CHROM,FROM,TO,CallSet -h ${index_path}/CallSet.header ${pair_id}.uniform.vcf.gz | bgzip > ${pair_id}.union.vcf.gz diff --git a/preproc_fastq/check_designfile_rnaseq.pl b/preproc_fastq/check_designfile_rnaseq.pl deleted file mode 100755 index 15ae06e908541ace6dab13bfc59f1c3567d28599..0000000000000000000000000000000000000000 --- a/preproc_fastq/check_designfile_rnaseq.pl +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/perl -w -#check_designfile.pl - -my $pe = shift @ARGV; -my $dfile = shift @ARGV; -open OUT, ">design.valid.txt" or die $!; -#open OUT2, ">tochannel.csv" or die $!; -open DFILE, "<$dfile" or die $!; -my $head = <DFILE>; -chomp($head); -$head =~ s/FullPathTo//g; -my @colnames = split(/\t/,$head); -my %newcols = map {$_=> 1} @colnames; - - -unless (grep(/FqR1/,@colnames)) { - die "Missing Sequence Files in Design File: FqR1\n"; -} -unless (grep(/SampleID/,@colnames)) { - die "Missing SampleID in Design File\n"; -} - -if ($pe eq 'pe') { - unless (grep(/FqR2/,@colnames)) { - die "Missing Sequence Files in Design File: FqR2\n"; - } -}else { - delete $newcols{FqR2}; -} - -my @cols = sort {$a cmp $b} keys %newcols; -print OUT join("\t","SampleID","FqR1","FqR2"),"\n"; -my @grp = ('a','b'); -my $lnct = 0; -while (my $line = <DFILE>) { - chomp($line); - $line =~ s/FullPathTo//g; - my @row = split(/\t/,$line); - my %hash; - foreach my $i (0..$#row) { - next unless ($newcols{$colnames[$i]}); - $hash{$colnames[$i]} = $row[$i]; - } - if ($hash{SampleID} =~ m/^\d/) { - $hash{SampleID} =~ s/^/S/; - } - $hash{SampleName} = $hash{SampleID} unless ($hash{SampleName}); - $hash{SubjectID} = $hash{SampleID} unless ($hash{SubjectID}); - - unless ($hash{SampleGroup}) { - $j = $lnct %% 2; - $hash{SampleGroup} = $grp[$j]; - } - $hash{SampleGroup} =~ s/_/./g; - my @line; - foreach $f (@cols) { - push @line, $hash{$f}; - } - print OUT join("\t",@line),"\n"; - $hash{FqR2} = 'na' unless ($hash{FqR2}); - print join(",",$hash{SampleID},$hash{FqR1},$hash{FqR2}),"\n"; - $lnct ++; -} diff --git a/variants/annotvcf.sh b/variants/annotvcf.sh index ba25c22e89bb0211ab7d5cd6153f207337bf8828..2d2e121dc82df79500075b0cfe63442be56a56b2 100644 --- a/variants/annotvcf.sh +++ b/variants/annotvcf.sh @@ -21,7 +21,7 @@ do done function join_by { local IFS="$1"; shift; echo "$*"; } shift $(($OPTIND -1)) -module load python/2.7.x-anaconda bedtools/2.26.0 snpeff/4.3q +module load python/2.7.x-anaconda bedtools/2.26.0 samtools/1.6 snpeff/4.3q if [[ $index_path == '/project/shared/bicf_workflow_ref/GRCh38' ]] then tabix ${unionvcf} diff --git a/variants/gatkrunner.sh b/variants/gatkrunner.sh index 077ce71fc4d18407a9a225de4123d2d331aa8b28..7918eca1aeb46ef773de8d10663a3b553c0e723b 100644 --- a/variants/gatkrunner.sh +++ b/variants/gatkrunner.sh @@ -63,8 +63,9 @@ if [[ $algo == 'gatkbam_rna' ]] then module load picard/2.10.3 java -Xmx4g -jar $PICARD/picard.jar CleanSam INPUT=${sbam} OUTPUT=${pair_id}.clean.bam + java -Xmx4g -jar $PICARD/picard.jar ReorderSam I=${pair_id}.clean.bam O=${pair_id}.sort.bam R=${reffa} CREATE_INDEX=TRUE #java -Xmx4g -jar $PICARD/picard.jar AddOrReplaceReadGroups INPUT=${pair_id}.clean.bam O=${pair_id}.rg_added_sorted.bam SO=coordinate RGID=${pair_id} RGLB=tx RGPL=illumina RGPU=barcode RGSM=${pair_id} - samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.clean.bam + #samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.clean.bam java -Xmx4g -jar $GATK_JAR -T SplitNCigarReads -R ${reffa} -I ${pair_id}.clean.bam -o ${pair_id}.split.bam -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS java -Xmx32g -jar $GATK_JAR -T RealignerTargetCreator -known ${knownindel} -R ${reffa} -o ${pair_id}.bam.list -I ${pair_id}.split.bam -nt $SLURM_CPUS_ON_NODE -nct 1 java -Xmx32g -jar $GATK_JAR -I ${pair_id}.split.bam -R ${reffa} --filter_mismatching_base_and_quals -T IndelRealigner -targetIntervals ${pair_id}.bam.list -o ${pair_id}.realigned.bam diff --git a/variants/uniform_vcf_gt.pl b/variants/uniform_vcf_gt.pl new file mode 100755 index 0000000000000000000000000000000000000000..7dd813e6c3f1a2e04d511cb5b67585fd9f565a11 --- /dev/null +++ b/variants/uniform_vcf_gt.pl @@ -0,0 +1,68 @@ +#!/usr/bin/perl +#migrate_db.pl + +my $vcf = shift @ARGV; +my $outfile = $vcf; +$outfile =~ s/vcf.gz/uniform.vcf/; +open OUT, ">$outfile" or die $!; +open VCF, "gunzip -c $vcf|" or die $!; +while (my $line = <VCF>) { + chomp($line); + if ($line =~ m/#/) { + print OUT $line,"\n"; + next; + } + my ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$annot,$format,@gts) = split(/\t/, $line); + my %hash = (); + foreach $a (split(/;/,$annot)) { + my ($key,$val) = split(/=/,$a); + $hash{$key} = $val; + } + my @deschead = split(/:/,$format); + my $newformat = 'GT:DP:AD:AO:RO'; + my @newgts = (); + my $missingGT = 0; + FG:foreach my $allele_info (@gts) { + my @gtinfo = split(/:/,$allele_info); + my %gtdata; + if ($allele_info eq '.') { + push @newgts, '.:.:.:.:.'; + $missingGT ++; + next FG; + } + foreach my $i (0..$#deschead) { + $gtdata{$deschead[$i]} = $gtinfo[$i]; + } + if ($gtdata{DP} == 0 || $gtdata{GT} eq './.') { + push @newgts, '.:.:.:.:.'; + $missingGT ++; + next FG; + } + if ($gtdata{AD}){ + ($gtdata{RO},@alts) = split(/,/,$gtdata{AD}); + $gtdata{AO} = join(",",@alts); + $gtdata{DP} = $gtdata{RO}; + foreach (@alts) { + $gtdata{DP} += $_; + } + } elsif (exists $gtdata{NR} && exists $gtdata{NV}) { + $gtdata{DP} = $gtdata{NR}; + $gtdata{AO} = $gtdata{NV}; + $gtdata{RO} = $gtdata{DP} - $gtdata{AO}; + } elsif (exists $gtdata{AO} && exists $gtdata{RO}) { + $gtdata{AD} = join(',',$gtdata{RO},$gtdata{AO}); + $gtdata{DP} = $gtdata{RO}; + foreach (split(',',$gtdata{AO})) { + $gtdata{DP} += $_; + } + } + if ($gtdata{DP} && $gtdata{DP} < 5) { + $missingGT ++; + } + push @newgts, join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO}); + } + next if ($missingGT == scalar(@gts)); + print OUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; +} +close VCF; diff --git a/variants/union.sh b/variants/union.sh index 0b829440e8e91c433e5b34bb99764e32a58a33f6..2ed49e260b52b009afa8d908e82bccc735e9c5bb 100644 --- a/variants/union.sh +++ b/variants/union.sh @@ -28,9 +28,6 @@ list2=`ls *vcf.gz|grep -v hotspot` varlist='' calllist='' for i in *.vcf.gz; do - EXT="${i#*.}" - CALL="${EXT%%.*}" - calllist="$calllist $CALL" if [[ $i == $HS ]] then bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a $i -b stdin |bgzip > hotspot.nooverlap.vcf.gz @@ -38,6 +35,6 @@ for i in *.vcf.gz; do fi done -perl $baseDir/unionvcf.pl $list2 +perl $baseDir/unionvcf.pl ${index_path}/union.header.vcf $list2 perl $baseDir/vcfsorter.pl ${index_path}/genome.dict int.vcf |bgzip > ${pair_id}.union.vcf.gz diff --git a/variants/unionvcf.pl b/variants/unionvcf.pl index f04e19507e7155435500956b908ec9883f9a09c6..6329433bad9c6ffd046e6b3ee1266487eed0b111 100755 --- a/variants/unionvcf.pl +++ b/variants/unionvcf.pl @@ -3,23 +3,33 @@ my $headerfile = shift @ARGV; my @vcffiles = @ARGV; -my $outfile = $vcf; -$outfile =~ s/vcf.gz/uniform.vcf/; + open HEADER, "<$headerfile" or die $!; -open OUT, ">union.vcf" or die $!; +open OUT, ">int.vcf" or die $!; while (my $line = <HEADER>) { - print OUT $line; + chomp($line); + print OUT $line,"\n";; } close HEADER; +my @sampleorder; + my %headerlines; foreach $vcf (@vcffiles) { $caller = (split(/\./,$vcf))[1]; open VCF, "gunzip -c $vcf|" or die $!; + my @sampleids; while (my $line = <VCF>) { chomp($line); if ($line =~ m/#/) { - $headerlines{$line} = 1; - next; + if ($line =~ m/#CHROM/) { + ($chromhd, $posd,$idhd,$refhd,$althd,$scorehd, + $filterhd,$annothd,$formathd,@sampleids) = split(/\t/, $line); + unless (@sampleorder) { + @sampleorder = @sampleids; + print OUT $line,"\n"; + } + next; + } } my ($chrom, $pos,$id,$ref,$alt,$score, $filter,$annot,$format,@gts) = split(/\t/, $line); @@ -30,13 +40,15 @@ foreach $vcf (@vcffiles) { } my @deschead = split(/:/,$format); my $newformat = 'GT:DP:AD:AO:RO'; - my @newgts = (); + my %newgts; my $missingGT = 0; - FG:foreach my $allele_info (@gts) { + FG:foreach my $i (0..$#gts) { + my $allele_info = $gts[$i]; + my $sid = $sampleids[$i]; my @gtinfo = split(/:/,$allele_info); my %gtdata; if ($allele_info eq '.') { - push @newgts, '.:.:.:.:.'; + $newgts{$sid} = '.:.:.:.:.'; $missingGT ++; next FG; } @@ -44,7 +56,7 @@ foreach $vcf (@vcffiles) { $gtdata{$deschead[$i]} = $gtinfo[$i]; } if ($gtdata{DP} == 0 || $gtdata{GT} eq './.') { - push @newgts, '.:.:.:.:.'; + $newgts{$sid} = '.:.:.:.:.'; $missingGT ++; next FG; } @@ -69,9 +81,13 @@ foreach $vcf (@vcffiles) { if ($gtdata{DP} && $gtdata{DP} < 3) { $missingGT ++; } - push @newgts, join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO}); + $newgts{$sid} = join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO}); } next if ($missingGT == scalar(@gts)); + my @newgts; + foreach $id (@sampleorder) { + push @newgts, $newgts{$id}; + } $lines{$chrom}{$pos}{$caller} = join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n"; } close VCF;