diff --git a/alignment/add_umi_sam.py b/alignment/add_umi_sam.py new file mode 100644 index 0000000000000000000000000000000000000000..c234af2846efd9c36531e430fa2210e9661a4188 --- /dev/null +++ b/alignment/add_umi_sam.py @@ -0,0 +1,29 @@ +#!/bin/env python +import sys +import pysam +import argparse +def get_args(): + '''Define arguments.''' + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + + parser.add_argument('-s', '--sam', + help="The sam file", + required=True) + parser.add_argument('-o', '--out', + help="The outfile", + default='outfile.bam') + args = parser.parse_args() + return args + +# set the tag names - take a look at SAM spec to pick an appropriate one +args = get_args() +infile = pysam.AlignmentFile(args.sam, "r") +out = pysam.AlignmentFile(args.out, "wb", template=infile) +for read in infile.fetch(): + read.set_tag('RX', read.qname.split(":")[-1]) + out.write(read) + +infile.close() +out.close() diff --git a/alignment/bamqc.sh b/alignment/bamqc.sh index 604cfe4f133aeb6310d6c0c8f35aff4ed3aa8b32..44ba8cdf53cef29452af150fd0ea4c5dad4ae50b 100644 --- a/alignment/bamqc.sh +++ b/alignment/bamqc.sh @@ -12,13 +12,13 @@ usage() { exit 1 } OPTIND=1 # Reset OPTIND -while getopts :r:b:n:p:h opt +while getopts :r:b:c:n:p:h opt do case $opt in r) index_path=$OPTARG;; b) sbam=$OPTARG;; c) bed=$OPTARG;; - y) nuctype=$OPTARG;; + n) nuctype=$OPTARG;; p) pair_id=$OPTARG;; h) usage;; esac @@ -27,13 +27,19 @@ done shift $(($OPTIND -1)) # Check for mandatory options -if [[ -z $pair_id ]] || [[ -z $sbam ]]; then - usage -fi +#if [[ -z $pair_id ]] || [[ -z $sbam ]]; then +# usage +#fi module load samtools/1.6 fastqc/0.11.5 samtools flagstat ${sbam} > ${pair_id}.flagstat.txt fastqc -f bam ${sbam} +baseDir="`dirname \"$0\"`" + +if [[ -z $SLURM_CPUS_ON_NODE ]] +then + SLURM_CPUS_ON_NODE=1 +fi if [[ $nuctype == 'dna' ]]; then module load bedtools/2.26.0 picard/2.10.3 @@ -42,10 +48,10 @@ if [[ $nuctype == 'dna' ]]; then samtools flagstat ${pair_id}.ontarget.bam > ${pair_id}.ontarget.flagstat.txt samtools view -b -q 1 ${pair_id}.ontarget.bam | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt samtools view -b -F 1024 ${pair_id}.ontarget.bam | bedtools coverage -sorted -g ${index_path}/genomefile.txt -a ${bed} -b stdin -hist | grep ^all > ${pair_id}.dedupcov.txt - java -Xmx32g -jar \$PICARD/picard.jar CollectAlignmentSummaryMetrics R=${reffa} I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt - java -Xmx64g -jar \$PICARD/picard.jar EstimateLibraryComplexity I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.libcomplex.txt - samtools view -F 1024 ${pair_id}.ontarget.bam | awk '{sum+=\$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt - java -Xmx4g -jar \$PICARD/picard.jar CollectInsertSizeMetrics INPUT=${pair_id}.bam HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${reffa} OUTPUT=${pair_id}.hist.txt + java -Xmx32g -jar $PICARD/picard.jar CollectAlignmentSummaryMetrics R=${reffa} I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt + java -Xmx64g -jar $PICARD/picard.jar EstimateLibraryComplexity I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.libcomplex.txt + samtools view -F 1024 ${pair_id}.ontarget.bam | awk '{sum+=$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt + java -Xmx4g -jar $PICARD/picard.jar CollectInsertSizeMetrics INPUT=${pair_id}.bam HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${reffa} OUTPUT=${pair_id}.hist.txt samtools view -b -q 1 ${pair_id}.ontarget.bam | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt bedtools coverage -sorted -g ${index_path}/genomefile.txt -a ${bed} -b ${sbam} -hist > ${pair_id}.covhist.txt perl $baseDir/scripts/calculate_depthcov.pl ${pair_id}.covhist.txt diff --git a/alignment/dnaseqalign.sh b/alignment/dnaseqalign.sh index 0e1962388c69848a3fbb226e9add3e42dcf6e751..f0042b4e023989924f811bfd5e27c4e78e93d54e 100644 --- a/alignment/dnaseqalign.sh +++ b/alignment/dnaseqalign.sh @@ -2,22 +2,23 @@ #dnaseqalign.sh usage() { - echo "-h Help documentation for hisat.sh" + echo "-h Help documentation for dnaseqalign.sh" echo "-r --Reference Genome: GRCh38 or GRCm38" echo "-x --FastQ R1" echo "-y --FastQ R2" echo "-p --Prefix for output file name" - echo "Example: bash dnaseqalign.sh -p prefix -r /project/shared/bicf_workflow_ref/GRCh38 -x SRR1551047_1.fastq.gz -y SRR1551047_2.fastq.gz" + echo "-u UMI" + echo "Example: bash dnaseqalign.sh -p prefix -u 1 -r /project/shared/bicf_workflow_ref/GRCh38 -x SRR1551047_1.fastq.gz -y SRR1551047_2.fastq.gz" exit 1 } OPTIND=1 # Reset OPTIND -while getopts :r:a:x:y:p:h opt +while getopts :r:x:y:p:uh opt do case $opt in r) index_path=$OPTARG;; x) fq1=$OPTARG;; y) fq2=$OPTARG;; - a) algo=$OPTARG;; + u) umi='umi';; p) pair_id=$OPTARG;; h) usage;; esac @@ -35,25 +36,27 @@ then SLURM_CPUS_ON_NODE=1 fi -module load bwakit/0.7.15 bwa/intel/0.7.15 samtools/1.6 bcftools/1.6 htslib/1.6 picard/2.10.3 speedseq/20160506 -if ($fq1 != $fq2) +module load bwakit/0.7.15 bwa/intel/0.7.15 samtools/1.6 bcftools/1.6 htslib/1.6 picard/2.10.3 + +baseDir="`dirname \"$0\"`" + +if [[ $fq1 == $fq2 ]] then - bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} > out.sam + bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} > out.sam else - bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} ${fq2} > out.sam + bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} ${fq2} > out.sam fi -if [[ $index_path == '/project/shared/bicf_workflow_ref/GRCh38' ]] + +if [[ $umi=='umi' ]] then - cat out.sam | k8 /cm/shared/apps/bwakit/0.7.15/bwa-postalt.js -p ${pair_id}.hla ${index_path}/genome.fa.alt | samtools view -1 - > output.unsort.bam - run-HLA ${pair_id}.hla > ${pair_id}.hla.top 2> ${pair_id}.log.hla - touch ${pair_id}.hla.HLA-dummy.gt - cat ${pair_id}.hla.HLA*.gt | grep ^GT | cut -f2- > ${pair_id}.hla.all + k8 /cm/shared/apps/bwakit/0.7.15/bwa-postalt.js -p ${pair_id}.hla ${index_path}/genome.fa.alt out.sam | python ${baseDir}/add_umi_bam.py -s - -o output.unsort.bam +elif [[ $index_path == '/project/shared/bicf_workflow_ref/GRCh38' ]] +then + k8 /cm/shared/apps/bwakit/0.7.15/bwa-postalt.js -p ${pair_id}.hla ${index_path}/genome.fa.alt out.sam| samtools view -1 - > output.unsort.bam else samtools view -1 -o output.unsort.bam out.sam -fi -samtools sort --threads $SLURM_CPUS_ON_NODE -o output.dups.bam output.unsort.bam -samtools view -h output.unsort.bam | samblaster -M -a --excludeDups --addMateTags --maxSplitCount 2 --minNonOverlap 20 -d discordants.sam -s splitters.sam > temp.sam -gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" splitters.sam | samtools view -S -b - | samtools sort -o ${pair_id}.splitters.bam - -gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" discordants.sam | samtools view -S -b - | samtools sort -o ${pair_id}.discordants.bam - +fi + +samtools sort -n --threads $SLURM_CPUS_ON_NODE -o output.dups.bam output.unsort.bam java -Djava.io.tmpdir=./ -Xmx4g -jar $PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.dups.bam O=${pair_id}.bam samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam diff --git a/alignment/markdups.sh b/alignment/markdups.sh index 4f44d3cb9d8fce34720d156f74b42d90e9f160b7..5158e78e1126b7d592f636cc1537a478c8303bb0 100644 --- a/alignment/markdups.sh +++ b/alignment/markdups.sh @@ -31,6 +31,7 @@ if [[ -z $SLURM_CPUS_ON_NODE ]] then SLURM_CPUS_ON_NODE=1 fi +baseDir="`dirname \"$0\"`" module load picard/2.10.3 samtools/1.6 @@ -53,6 +54,7 @@ then java -Djava.io.tmpdir=./ -Xmx4g -jar $PICARD/picard.jar MarkDuplicates BARCODE_TAG=RX I=${sbam} O=${pair_id}.dedup.bam M=${pair_id}.dedup.stat.txt elif [ $algo == 'fgbio_umi' ] then + samtools index -@ $SLURM_CPUS_ON_NODE ${sbam} source activate fgbiotools fgbio GroupReadsByUmi -s identity -i ${sbam} -o ${pair_id}.group.bam -m 10 fgbio CallMolecularConsensusReads -i ${pair_id}.group.bam -p consensus -M 1 -o ${pair_id}.consensus.bam -S ':none:' diff --git a/alignment/rnaseqalign.sh b/alignment/rnaseqalign.sh index d994edb11e68b3daa259f157e2856e6e3b761de7..81ff2c06c15c1ef8966b8bc06bb433cc878d98a5 100644 --- a/alignment/rnaseqalign.sh +++ b/alignment/rnaseqalign.sh @@ -8,17 +8,19 @@ usage() { echo "-y --FastQ R2" echo "-a --Method: hisat or star" echo "-p --Prefix for output file name" - echo "Example: bash hisat.sh -p prefix -r GRCh38 -a hisat -x SRR1551047_1.fastq.gz -y SRR1551047_2.fastq.gz" + echo "-u [UMI sequences are in FQ Read Name]" + echo "Example: bash rnaseqalign.sh -a hisat -p prefix -u -r /project/shared/bicf_workflow_ref/GRCh38 -x SRR1551047_1.fastq.gz -y SRR1551047_2.fastq.gz" exit 1 } OPTIND=1 # Reset OPTIND -while getopts :r:a:x:y:p:h opt +while getopts :r:a:x:y:p:hu opt do case $opt in r) index_path=$OPTARG;; x) fq1=$OPTARG;; y) fq2=$OPTARG;; a) algo=$OPTARG;; + u) umi=1;; p) pair_id=$OPTARG;; h) usage;; esac @@ -32,6 +34,7 @@ if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then fi module load samtools/gcc/1.6 picard/2.10.3 +baseDir="`dirname \"$0\"`" if [[ -z $SLURM_CPUS_ON_NODE ]] then SLURM_CPUS_ON_NODE=1 @@ -57,6 +60,11 @@ else fi samtools view -1 --threads $SLURM_CPUS_ON_NODE -o output.bam out.sam fi +if [[ $umi==1 ]] +then + python ${baseDir}/add_umi_bam.py -b output.bam -o output.unsort2.bam + mv output.unsort2.bam output.bam +fi samtools sort -@ $SLURM_CPUS_ON_NODE -O BAM -n -o output.nsort.bam output.bam java -jar $PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.nsort.bam O=${pair_id}.bam samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam diff --git a/variants/annot_sv.pl b/variants/annot_sv.pl new file mode 100755 index 0000000000000000000000000000000000000000..c6b12c704ffc2dc9320b7bdde4dd7c255f6c9384 --- /dev/null +++ b/variants/annot_sv.pl @@ -0,0 +1,154 @@ +#!/usr/bin/perl -w +#svvcf2bed.pl + +use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); + +my %opt = (); +my $results = GetOptions (\%opt,'input|i=s','refdb|r=s','help|h'); + +my $vcf = $opt{input}; + +my %lines; +my %eventid; +my $ct = 0; +open IN, "<$vcf" or die $!; +while (my $line = <IN>) { + chomp($line); + if ($line =~ m/^#CHROM/) { + my @header = split(/\t/,$line); + ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$info,$format,@subjacc) = split(/\t/, $line); + } + next if $line =~ m/#/; + my ($chrom, $pos,$id,$ref,$alt,$score, + $filter,$annot,$format,@gts) = split(/\t/, $line); + $ct ++; + my %hash = (); + foreach $a (split(/;/,$annot)) { + my ($key,$val) = split(/=/,$a); + $hash{$key} = $val unless ($hash{$key}); + } + if ($id eq 'N') { + $id = 'NB'.sprintf("%06s",$ct); + } + my $evid = (split(/_/,$id))[0]; + $hash{'END'} = $pos+1 unless $hash{'END'}; + @deschead = split(":",$format); + F1:foreach $sample (@subjacc) { + my $allele_info = shift @gts; + @ainfo = split(/:/, $allele_info); + my %gtinfo = (); + foreach $k (0..$#deschead) { + $gtinfo{$deschead[$k]} = $ainfo[$k]; + } + unless ($gtinfo{SU}) { + $gtinfo{SU} = 0; + $gtinfo{SU} = $gtinfo{RV}+$gtinfo{DV} if ($gtinfo{RV} && $gtinfo{DV}); + } + $gtinfo{CN} = '' unless $gtinfo{CN}; + if ($alt =~ m/chr(\w+):(\d+)/i) { + if ($1 eq $chrom) { + my $locus = join(":",$chrom,$pos,$2); + $eventid{$id} = $locus; + $lines{$evid}{$locus}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU}); + }elsif ($id =~ m/_\d+/) { + my $locus = join(":",$chrom,$pos,$hash{END}); + $eventid{$id} = $locus; + $lines{$evid}{$locus}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU}); + }else { + my $locus1 = join(":",$chrom,$pos,$hash{END}); + my $id1 = $id."_1"; + $eventid{$id1} = $locus1; + my $locus2 = join(":",'chr'.$1,$2,$2+1); + my $id2 = $id."_2"; + $eventid{$id2} = $locus2; + $lines{$evid}{$locus1}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU}); + $lines{$evid}{$locus2}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU}); + } + }else { + if ($hash{CHR2} && $hash{CHR2} eq $chrom) { + my $locus = join(":",$chrom,$pos,$hash{END}); + $eventid{$id} = $locus; + $lines{$evid}{$locus}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU}); + }elsif ($hash{CHR2} && $hash{CHR2} ne $chrom) { + my $locus1 = join(":",$chrom,$pos,$pos+1); + my $id1 = $id."_1"; + $eventid{$id1} = $locus1; + my $locus2 = join(":",$hash{CHR2},$hash{END},$hash{END}+1); + my $id2 = $id."_2"; + $eventid{$id2} = $locus2; + $lines{$evid}{$locus1}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU}); + $lines{$evid}{$locus2}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU}); + }unless ($hash{CHR2}) { + my $locus = join(":",$chrom,$pos,$hash{END}); + $eventid{$id} = $locus; + $lines{$evid}{$locus}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU}); + } + } + } +} + +close IN; + +my @files = ("exonoverlap_sv.txt","geneoverlap_sv.txt"); +foreach $file (@files) { + open IN, "<$file" or die $!; + while (my $line = <IN>) { + chomp($line); + next if ($line =~ m/#/); + my ($chrom,$start,$end,$id,$chr,$start2,$end2,$gene) = split(/\t/, $line); + my $evid = (split(/_/,$id))[0]; + my $locus = $eventid{$id}; + unless ($locus) { + $locus = $eventid{$evid}; + } + if ($file eq 'exonoverlap_sv.txt') { + push @{$gene{$locus}}, $gene; + $inc{$evid} = 1; + }elsif ($file eq 'geneoverlap_sv.txt') { + push @{$gene{$locus}}, $gene; + $inc{$evid} = 1; + } + } +} + +my $outfile = $vcf; +$outfile =~ s/vcf/annot.txt/g; + +open OUT, ">$outfile" or die $!; +foreach $events (sort keys %lines) { + next unless ($inc{$events}); + foreach $locus (sort {$a cmp $b} keys %{$lines{$events}}) { + my %gene_annots; + if ($gene{$locus}) { + my @genes = @{$gene{$locus}}; + my %hash; + foreach $g (@genes) { + foreach $trx (split(/,/,$g)) { + my ($symbol,$trxid,$exonnum) = split(/\|/,$trx); + next unless $exonnum; + $hash{$symbol}{$trxid}{$exonnum} = 1; + } + } + foreach $sym (keys %hash) { + foreach $trxid (keys %{$hash{$sym}}) { + my @exons = sort {$a <=> $b} keys %{$hash{$sym}{$trxid}}; + my $exon_bound = $exons[0]; + if ($#exons > 0) { + $exon_bound = join("-",$exons[0],$exons[-1]); + } + $gene_annots{$sym}{$exon_bound} = 1; + } + } + } + foreach $sample (sort {$a cmp $b} keys %{$lines{$events}{$locus}}) { + foreach $genesym (keys %gene_annots) { + my ($SVTYPE,$CN,$SU) = split(/\t/,$lines{$events}{$locus}{$sample}); + next if $SU < 3; + print OUT join("\t",'SV.'.$events,split(":",$locus),$lines{$events}{$locus}{$sample}, + $genesym, join(";",sort {$a cmp $b} keys %{$gene_annots{$genesym}}),$sample),"\n"; + } + } + } +} + diff --git a/variants/cnvkit.sh b/variants/cnvkit.sh index 8d0a735d65de8fb8c1b11d1498836ac8ecede389..bdbb50ac56f5f9f20f679ef32319096c383bfaa0 100644 --- a/variants/cnvkit.sh +++ b/variants/cnvkit.sh @@ -35,4 +35,4 @@ cnvkit.py coverage ${sbam} /project/shared/bicf_workflow_ref/GRCh38/UTSWV2.cnvki cnvkit.py fix ${pair_id}.targetcoverage.cnn ${pair_id}.antitargetcoverage.cnn /project/shared/bicf_workflow_ref/GRCh38/UTSWV2.normals.cnn -o ${pair_id}.cnr cnvkit.py segment ${pair_id}.cnr -o ${pair_id}.cns cnvkit.py call ${pair_id}.cns -o ${pair_id}.call.cns -cnvkit.py diagram ${pair_id}.cnr -s ${pair_id}.cns -o ${pair_id}.pdf +cnvkit.py diagram ${pair_id}.cnr -s ${pair_id}.cns -o ${pair_id}.cnv.pdf diff --git a/variants/germline_vc.sh b/variants/germline_vc.sh index b45e7ec287b4319d30fe06ec466716d5eb7b0cba..41866e6591ffa00b04b9a0c23796c62360ca40d5 100644 --- a/variants/germline_vc.sh +++ b/variants/germline_vc.sh @@ -53,7 +53,7 @@ else echo "Missing Fasta File: ${index_path}/genome.fa" usage fi -module load python/2.7.x-anaconda samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14 parallel +module load python/2.7.x-anaconda picard/2.10.3 samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14 parallel for i in *.bam; do if [[ ! -f ${i}.bai ]] diff --git a/variants/svcalling.sh b/variants/svcalling.sh new file mode 100644 index 0000000000000000000000000000000000000000..50c95904d7e63af666db6a60cd8703eb7b0c2235 --- /dev/null +++ b/variants/svcalling.sh @@ -0,0 +1,88 @@ +!/bin/bash +#svcalling.sh + +usage() { + echo "-h Help documentation for gatkrunner.sh" + echo "-r --Path to Reference Genome with the file genome.fa" + echo "-p --Prefix for output file name" + echo "-b --Bam File" + echo "Example: bash svcalling.sh -p prefix -r /path/GRCh38 -a gatk" + exit 1 +} +OPTIND=1 # Reset OPTIND +while getopts :r:a:b:p:h opt +do + case $opt in + r) index_path=$OPTARG;; + p) pair_id=$OPTARG;; + b) sbam=$OPTARG;; + h) usage;; + esac +done +function join_by { local IFS="$1"; shift; echo "$*"; } + +shift $(($OPTIND -1)) +baseDir="`dirname \"$0\"`" + + +# Check for mandatory options +if [[ -z $pair_id ]] || [[ -z $index_path ]]; then + usage +fi +if [[ -z $SLURM_CPUS_ON_NODE ]] +then + SLURM_CPUS_ON_NODE=1 +fi + +if [[ -a "${index_path}/genome.fa" ]] +then + reffa="${index_path}/genome.fa" + dict="${index_path}/genome.dict" +else + echo "Missing Fasta File: ${index_path}/genome.fa" + usage + +fi +baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +module load speedseq/20160506 novoBreak/v1.1.3 delly2/v0.7.7-multi samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14 +#RUN DELLY +mkdir temp +delly2 call -t BND -o delly_translocations.bcf -q 30 -g ${reffa} ${sbam} +delly2 call -t DUP -o delly_duplications.bcf -q 30 -g ${reffa} ${sbam} +delly2 call -t INV -o delly_inversions.bcf -q 30 -g ${reffa} ${sbam} +delly2 call -t DEL -o delly_deletion.bcf -q 30 -g ${reffa} ${sbam} +delly2 call -t INS -o delly_insertion.bcf -q 30 -g ${reffa} ${sbam} +delly2 filter -t BND -o delly_tra.bcf -f germline delly_translocations.bcf +delly2 filter -t DUP -o delly_dup.bcf -f germline delly_translocations.bcf +delly2 filter -t INV -o delly_inv.bcf -f germline delly_translocations.bcf +delly2 filter -t DEL -o delly_del.bcf -f germline delly_translocations.bcf +delly2 filter -t INS -o delly_ins.bcf -f germline delly_translocations.bcf + +#MERGE DELLY AND MAKE BED +bcftools concat -a -O v delly_dup.bcf delly_inv.bcf delly_tra.bcf delly_del.bcf delly_ins.bcf | vcf-sort > ${pair_id}.delly.vcf +perl $baseDir/vcf2bed.sv.pl ${pair_id}.delly.vcf > delly.bed +bgzip ${pair_id}.delly.vcf +tabix ${pair_id}.delly.vcf.gz + +#MAKE FILES FOR LUMPY +samtools sort -@ $SLURM_CPUS_ON_NODE -n -o namesort.bam ${sbam} +samtools view -h namesort.bam | samblaster -M -a --excludeDups --addMateTags --maxSplitCount 2 --minNonOverlap 20 -d discordants.sam -s splitters.sam > temp.sam +gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" splitters.sam | samtools view -S -b - | samtools sort -o splitters.bam - +gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" discordants.sam | samtools view -S -b - | samtools sort -o discordants.bam - + +#RUN LUMPY +speedseq sv -t $SLURM_CPUS_ON_NODE -o ${pair_id}.sssv -R ${reffa} -B ${sbam} -D discordants.bam -S splitters.bam -x ${index_path}/exclude_alt.bed +java -jar $SNPEFF_HOME/SnpSift.jar filter "GEN[0].SU > 2" ${pair_id}.sssv.sv.vcf.gz > lumpy.vcf +perl $baseDir/vcf2bed.sv.pl lumpy.vcf > lumpy.bed + +#COMPARE DELLY & LUMPY +bedtools intersect -v -a lumpy.bed -b delly.bed > lumpy_only.bed +bedtools intersect -header -b lumpy_only.bed -a lumpy.vcf |bgzip > lumpy_only.vcf.gz +vcf-concat ${pair_id}.delly.vcf.gz lumpy_only.vcf.gz |vcf-sort -t temp > ${pair_id}.sv.vcf +perl $baseDir/vcf2bed.sv.pl ${pair_id}.sv.vcf |sort -V -k 1,1 -k 2,2n | grep -v 'alt' |grep -v 'random' |uniq > svs.bed +bedtools intersect -header -wb -a svs.bed -b ${index_path}/gencode.exons.bed > exonoverlap_sv.txt +bedtools intersect -v -header -wb -a svs.bed -b ${index_path}/gencode.exons.bed | bedtools intersect -header -wb -a stdin -b ${index_path}/gencode.genes.chr.bed > geneoverlap_sv.txt +perl $baseDir/annot_sv.pl -r ${index_path} -i ${pair_id}.sv.vcf +bgzip ${pair_id}.sv.vcf + diff --git a/variants/union.sh b/variants/union.sh index 2ed49e260b52b009afa8d908e82bccc735e9c5bb..d6686105c8d3a851134330fa8e80853b83f0ffed 100644 --- a/variants/union.sh +++ b/variants/union.sh @@ -19,7 +19,8 @@ do done function join_by { local IFS="$1"; shift; echo "$*"; } shift $(($OPTIND -1)) -baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +baseDir="`dirname \"$0\"`" + module load bedtools/2.26.0 samtools/1.6 HS=*.hotspot.vcf.gz diff --git a/variants/vcf2bed.sv.pl b/variants/vcf2bed.sv.pl new file mode 100755 index 0000000000000000000000000000000000000000..2dcb652bcf761cd6d6f6c6a206392eeb518296d3 --- /dev/null +++ b/variants/vcf2bed.sv.pl @@ -0,0 +1,69 @@ +#!/usr/bin/perl + +use strict; + +if (scalar @ARGV != 1) { + print "vcf2bed.pl <vcf_file>\n"; + exit 1; +} + +my $vcfFile = $ARGV[0]; + +open VCF, $vcfFile + or die "Cannot open $vcfFile"; + +my $ct = 0; +while (<VCF>) { + next if (index($_, '#') == 0); + my @fields = split /\t/, $_; + $ct ++; + my $chrom = $fields[0]; + my $pos = $fields[1]; + my $ref = $fields[3]; + my $annot = $fields[7]; + my $alt = $fields[4]; + if ($fields[2] eq 'N') { + $fields[2] = 'NB'.sprintf("%06s",$ct); + } + my %hash = (); + next if ($chrom =~ m/_/); + foreach $a (split(/;/,$annot)) { + my ($key,$val) = split(/=/,$a); + $hash{$key} = $val unless ($hash{$key}); + } + $hash{'END'} = $pos+1 unless $hash{'END'}; + next if ($hash{CHR2} && $hash{CHR2} =~ m/_/); + if ($alt =~ m/chr(\w+):(\d+)/i) { + if ($1 eq $chrom) { + my $end = $2; + if ($pos > $end) { + my $temp = $end; + $end = $pos; + $pos = $temp; + } + print join("\t",$chrom,$pos,$end,$fields[2]),"\n"; + }elsif ($fields[2] =~ m/_\d+/) { + print join("\t",$chrom,$pos,$hash{END},$fields[2]),"\n"; + }else { + print join("\t",$chrom,$pos,$pos+1,$fields[2]."_1"),"\n"; + print join("\t",'chr'.$1,$2,$2+1,$fields[2]."_2"),"\n"; + } + }else { + if ($hash{CHR2} && $hash{CHR2} eq $chrom) { + my $end = $hash{END}; + if ($pos > $end) { + my $temp = $end; + $end = $pos; + $pos = $temp; + } + print join("\t",$chrom,$pos,$end,$fields[2]),"\n"; + }elsif ($hash{CHR2} && $hash{CHR2} ne $chrom) { + print join("\t",$chrom,$pos,$pos+1,$fields[2]."_1"),"\n"; + print join("\t",$hash{CHR2},$hash{END},$hash{END}+1,$fields[2]."_2"),"\n"; + }unless ($hash{CHR2}) { + print join("\t",$chrom,$pos,$hash{END},$fields[2]),"\n"; + } + } +} + +close VCF;