Skip to content
Snippets Groups Projects
Commit 3d9955f6 authored by Brandi Cantarel's avatar Brandi Cantarel
Browse files

update for variant calling debugging

parent 1fa984ad
Branches
Tags
No related merge requests found
#!/bin/env python
import sys
import pysam
import argparse
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-s', '--sam',
help="The sam file",
required=True)
parser.add_argument('-o', '--out',
help="The outfile",
default='outfile.bam')
args = parser.parse_args()
return args
# set the tag names - take a look at SAM spec to pick an appropriate one
args = get_args()
infile = pysam.AlignmentFile(args.sam, "r")
out = pysam.AlignmentFile(args.out, "wb", template=infile)
for read in infile.fetch():
read.set_tag('RX', read.qname.split(":")[-1])
out.write(read)
infile.close()
out.close()
......@@ -12,13 +12,13 @@ usage() {
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :r:b:n:p:h opt
while getopts :r:b:c:n:p:h opt
do
case $opt in
r) index_path=$OPTARG;;
b) sbam=$OPTARG;;
c) bed=$OPTARG;;
y) nuctype=$OPTARG;;
n) nuctype=$OPTARG;;
p) pair_id=$OPTARG;;
h) usage;;
esac
......@@ -27,13 +27,19 @@ done
shift $(($OPTIND -1))
# Check for mandatory options
if [[ -z $pair_id ]] || [[ -z $sbam ]]; then
usage
fi
#if [[ -z $pair_id ]] || [[ -z $sbam ]]; then
# usage
#fi
module load samtools/1.6 fastqc/0.11.5
samtools flagstat ${sbam} > ${pair_id}.flagstat.txt
fastqc -f bam ${sbam}
baseDir="`dirname \"$0\"`"
if [[ -z $SLURM_CPUS_ON_NODE ]]
then
SLURM_CPUS_ON_NODE=1
fi
if [[ $nuctype == 'dna' ]]; then
module load bedtools/2.26.0 picard/2.10.3
......@@ -42,10 +48,10 @@ if [[ $nuctype == 'dna' ]]; then
samtools flagstat ${pair_id}.ontarget.bam > ${pair_id}.ontarget.flagstat.txt
samtools view -b -q 1 ${pair_id}.ontarget.bam | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt
samtools view -b -F 1024 ${pair_id}.ontarget.bam | bedtools coverage -sorted -g ${index_path}/genomefile.txt -a ${bed} -b stdin -hist | grep ^all > ${pair_id}.dedupcov.txt
java -Xmx32g -jar \$PICARD/picard.jar CollectAlignmentSummaryMetrics R=${reffa} I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt
java -Xmx64g -jar \$PICARD/picard.jar EstimateLibraryComplexity I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.libcomplex.txt
samtools view -F 1024 ${pair_id}.ontarget.bam | awk '{sum+=\$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt
java -Xmx4g -jar \$PICARD/picard.jar CollectInsertSizeMetrics INPUT=${pair_id}.bam HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${reffa} OUTPUT=${pair_id}.hist.txt
java -Xmx32g -jar $PICARD/picard.jar CollectAlignmentSummaryMetrics R=${reffa} I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt
java -Xmx64g -jar $PICARD/picard.jar EstimateLibraryComplexity I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.libcomplex.txt
samtools view -F 1024 ${pair_id}.ontarget.bam | awk '{sum+=$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt
java -Xmx4g -jar $PICARD/picard.jar CollectInsertSizeMetrics INPUT=${pair_id}.bam HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${reffa} OUTPUT=${pair_id}.hist.txt
samtools view -b -q 1 ${pair_id}.ontarget.bam | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt
bedtools coverage -sorted -g ${index_path}/genomefile.txt -a ${bed} -b ${sbam} -hist > ${pair_id}.covhist.txt
perl $baseDir/scripts/calculate_depthcov.pl ${pair_id}.covhist.txt
......
......@@ -2,22 +2,23 @@
#dnaseqalign.sh
usage() {
echo "-h Help documentation for hisat.sh"
echo "-h Help documentation for dnaseqalign.sh"
echo "-r --Reference Genome: GRCh38 or GRCm38"
echo "-x --FastQ R1"
echo "-y --FastQ R2"
echo "-p --Prefix for output file name"
echo "Example: bash dnaseqalign.sh -p prefix -r /project/shared/bicf_workflow_ref/GRCh38 -x SRR1551047_1.fastq.gz -y SRR1551047_2.fastq.gz"
echo "-u UMI"
echo "Example: bash dnaseqalign.sh -p prefix -u 1 -r /project/shared/bicf_workflow_ref/GRCh38 -x SRR1551047_1.fastq.gz -y SRR1551047_2.fastq.gz"
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :r:a:x:y:p:h opt
while getopts :r:x:y:p:uh opt
do
case $opt in
r) index_path=$OPTARG;;
x) fq1=$OPTARG;;
y) fq2=$OPTARG;;
a) algo=$OPTARG;;
u) umi='umi';;
p) pair_id=$OPTARG;;
h) usage;;
esac
......@@ -35,25 +36,27 @@ then
SLURM_CPUS_ON_NODE=1
fi
module load bwakit/0.7.15 bwa/intel/0.7.15 samtools/1.6 bcftools/1.6 htslib/1.6 picard/2.10.3 speedseq/20160506
if ($fq1 != $fq2)
module load bwakit/0.7.15 bwa/intel/0.7.15 samtools/1.6 bcftools/1.6 htslib/1.6 picard/2.10.3
baseDir="`dirname \"$0\"`"
if [[ $fq1 == $fq2 ]]
then
bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} > out.sam
bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} > out.sam
else
bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} ${fq2} > out.sam
bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} ${fq2} > out.sam
fi
if [[ $index_path == '/project/shared/bicf_workflow_ref/GRCh38' ]]
if [[ $umi=='umi' ]]
then
cat out.sam | k8 /cm/shared/apps/bwakit/0.7.15/bwa-postalt.js -p ${pair_id}.hla ${index_path}/genome.fa.alt | samtools view -1 - > output.unsort.bam
run-HLA ${pair_id}.hla > ${pair_id}.hla.top 2> ${pair_id}.log.hla
touch ${pair_id}.hla.HLA-dummy.gt
cat ${pair_id}.hla.HLA*.gt | grep ^GT | cut -f2- > ${pair_id}.hla.all
k8 /cm/shared/apps/bwakit/0.7.15/bwa-postalt.js -p ${pair_id}.hla ${index_path}/genome.fa.alt out.sam | python ${baseDir}/add_umi_bam.py -s - -o output.unsort.bam
elif [[ $index_path == '/project/shared/bicf_workflow_ref/GRCh38' ]]
then
k8 /cm/shared/apps/bwakit/0.7.15/bwa-postalt.js -p ${pair_id}.hla ${index_path}/genome.fa.alt out.sam| samtools view -1 - > output.unsort.bam
else
samtools view -1 -o output.unsort.bam out.sam
fi
samtools sort --threads $SLURM_CPUS_ON_NODE -o output.dups.bam output.unsort.bam
samtools view -h output.unsort.bam | samblaster -M -a --excludeDups --addMateTags --maxSplitCount 2 --minNonOverlap 20 -d discordants.sam -s splitters.sam > temp.sam
gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" splitters.sam | samtools view -S -b - | samtools sort -o ${pair_id}.splitters.bam -
gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" discordants.sam | samtools view -S -b - | samtools sort -o ${pair_id}.discordants.bam -
fi
samtools sort -n --threads $SLURM_CPUS_ON_NODE -o output.dups.bam output.unsort.bam
java -Djava.io.tmpdir=./ -Xmx4g -jar $PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.dups.bam O=${pair_id}.bam
samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam
......@@ -31,6 +31,7 @@ if [[ -z $SLURM_CPUS_ON_NODE ]]
then
SLURM_CPUS_ON_NODE=1
fi
baseDir="`dirname \"$0\"`"
module load picard/2.10.3 samtools/1.6
......@@ -53,6 +54,7 @@ then
java -Djava.io.tmpdir=./ -Xmx4g -jar $PICARD/picard.jar MarkDuplicates BARCODE_TAG=RX I=${sbam} O=${pair_id}.dedup.bam M=${pair_id}.dedup.stat.txt
elif [ $algo == 'fgbio_umi' ]
then
samtools index -@ $SLURM_CPUS_ON_NODE ${sbam}
source activate fgbiotools
fgbio GroupReadsByUmi -s identity -i ${sbam} -o ${pair_id}.group.bam -m 10
fgbio CallMolecularConsensusReads -i ${pair_id}.group.bam -p consensus -M 1 -o ${pair_id}.consensus.bam -S ':none:'
......
......@@ -8,17 +8,19 @@ usage() {
echo "-y --FastQ R2"
echo "-a --Method: hisat or star"
echo "-p --Prefix for output file name"
echo "Example: bash hisat.sh -p prefix -r GRCh38 -a hisat -x SRR1551047_1.fastq.gz -y SRR1551047_2.fastq.gz"
echo "-u [UMI sequences are in FQ Read Name]"
echo "Example: bash rnaseqalign.sh -a hisat -p prefix -u -r /project/shared/bicf_workflow_ref/GRCh38 -x SRR1551047_1.fastq.gz -y SRR1551047_2.fastq.gz"
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :r:a:x:y:p:h opt
while getopts :r:a:x:y:p:hu opt
do
case $opt in
r) index_path=$OPTARG;;
x) fq1=$OPTARG;;
y) fq2=$OPTARG;;
a) algo=$OPTARG;;
u) umi=1;;
p) pair_id=$OPTARG;;
h) usage;;
esac
......@@ -32,6 +34,7 @@ if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then
fi
module load samtools/gcc/1.6 picard/2.10.3
baseDir="`dirname \"$0\"`"
if [[ -z $SLURM_CPUS_ON_NODE ]]
then
SLURM_CPUS_ON_NODE=1
......@@ -57,6 +60,11 @@ else
fi
samtools view -1 --threads $SLURM_CPUS_ON_NODE -o output.bam out.sam
fi
if [[ $umi==1 ]]
then
python ${baseDir}/add_umi_bam.py -b output.bam -o output.unsort2.bam
mv output.unsort2.bam output.bam
fi
samtools sort -@ $SLURM_CPUS_ON_NODE -O BAM -n -o output.nsort.bam output.bam
java -jar $PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.nsort.bam O=${pair_id}.bam
samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam
#!/usr/bin/perl -w
#svvcf2bed.pl
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
my %opt = ();
my $results = GetOptions (\%opt,'input|i=s','refdb|r=s','help|h');
my $vcf = $opt{input};
my %lines;
my %eventid;
my $ct = 0;
open IN, "<$vcf" or die $!;
while (my $line = <IN>) {
chomp($line);
if ($line =~ m/^#CHROM/) {
my @header = split(/\t/,$line);
($chrom, $pos,$id,$ref,$alt,$score,
$filter,$info,$format,@subjacc) = split(/\t/, $line);
}
next if $line =~ m/#/;
my ($chrom, $pos,$id,$ref,$alt,$score,
$filter,$annot,$format,@gts) = split(/\t/, $line);
$ct ++;
my %hash = ();
foreach $a (split(/;/,$annot)) {
my ($key,$val) = split(/=/,$a);
$hash{$key} = $val unless ($hash{$key});
}
if ($id eq 'N') {
$id = 'NB'.sprintf("%06s",$ct);
}
my $evid = (split(/_/,$id))[0];
$hash{'END'} = $pos+1 unless $hash{'END'};
@deschead = split(":",$format);
F1:foreach $sample (@subjacc) {
my $allele_info = shift @gts;
@ainfo = split(/:/, $allele_info);
my %gtinfo = ();
foreach $k (0..$#deschead) {
$gtinfo{$deschead[$k]} = $ainfo[$k];
}
unless ($gtinfo{SU}) {
$gtinfo{SU} = 0;
$gtinfo{SU} = $gtinfo{RV}+$gtinfo{DV} if ($gtinfo{RV} && $gtinfo{DV});
}
$gtinfo{CN} = '' unless $gtinfo{CN};
if ($alt =~ m/chr(\w+):(\d+)/i) {
if ($1 eq $chrom) {
my $locus = join(":",$chrom,$pos,$2);
$eventid{$id} = $locus;
$lines{$evid}{$locus}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
}elsif ($id =~ m/_\d+/) {
my $locus = join(":",$chrom,$pos,$hash{END});
$eventid{$id} = $locus;
$lines{$evid}{$locus}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
}else {
my $locus1 = join(":",$chrom,$pos,$hash{END});
my $id1 = $id."_1";
$eventid{$id1} = $locus1;
my $locus2 = join(":",'chr'.$1,$2,$2+1);
my $id2 = $id."_2";
$eventid{$id2} = $locus2;
$lines{$evid}{$locus1}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
$lines{$evid}{$locus2}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
}
}else {
if ($hash{CHR2} && $hash{CHR2} eq $chrom) {
my $locus = join(":",$chrom,$pos,$hash{END});
$eventid{$id} = $locus;
$lines{$evid}{$locus}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
}elsif ($hash{CHR2} && $hash{CHR2} ne $chrom) {
my $locus1 = join(":",$chrom,$pos,$pos+1);
my $id1 = $id."_1";
$eventid{$id1} = $locus1;
my $locus2 = join(":",$hash{CHR2},$hash{END},$hash{END}+1);
my $id2 = $id."_2";
$eventid{$id2} = $locus2;
$lines{$evid}{$locus1}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
$lines{$evid}{$locus2}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
}unless ($hash{CHR2}) {
my $locus = join(":",$chrom,$pos,$hash{END});
$eventid{$id} = $locus;
$lines{$evid}{$locus}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
}
}
}
}
close IN;
my @files = ("exonoverlap_sv.txt","geneoverlap_sv.txt");
foreach $file (@files) {
open IN, "<$file" or die $!;
while (my $line = <IN>) {
chomp($line);
next if ($line =~ m/#/);
my ($chrom,$start,$end,$id,$chr,$start2,$end2,$gene) = split(/\t/, $line);
my $evid = (split(/_/,$id))[0];
my $locus = $eventid{$id};
unless ($locus) {
$locus = $eventid{$evid};
}
if ($file eq 'exonoverlap_sv.txt') {
push @{$gene{$locus}}, $gene;
$inc{$evid} = 1;
}elsif ($file eq 'geneoverlap_sv.txt') {
push @{$gene{$locus}}, $gene;
$inc{$evid} = 1;
}
}
}
my $outfile = $vcf;
$outfile =~ s/vcf/annot.txt/g;
open OUT, ">$outfile" or die $!;
foreach $events (sort keys %lines) {
next unless ($inc{$events});
foreach $locus (sort {$a cmp $b} keys %{$lines{$events}}) {
my %gene_annots;
if ($gene{$locus}) {
my @genes = @{$gene{$locus}};
my %hash;
foreach $g (@genes) {
foreach $trx (split(/,/,$g)) {
my ($symbol,$trxid,$exonnum) = split(/\|/,$trx);
next unless $exonnum;
$hash{$symbol}{$trxid}{$exonnum} = 1;
}
}
foreach $sym (keys %hash) {
foreach $trxid (keys %{$hash{$sym}}) {
my @exons = sort {$a <=> $b} keys %{$hash{$sym}{$trxid}};
my $exon_bound = $exons[0];
if ($#exons > 0) {
$exon_bound = join("-",$exons[0],$exons[-1]);
}
$gene_annots{$sym}{$exon_bound} = 1;
}
}
}
foreach $sample (sort {$a cmp $b} keys %{$lines{$events}{$locus}}) {
foreach $genesym (keys %gene_annots) {
my ($SVTYPE,$CN,$SU) = split(/\t/,$lines{$events}{$locus}{$sample});
next if $SU < 3;
print OUT join("\t",'SV.'.$events,split(":",$locus),$lines{$events}{$locus}{$sample},
$genesym, join(";",sort {$a cmp $b} keys %{$gene_annots{$genesym}}),$sample),"\n";
}
}
}
}
......@@ -35,4 +35,4 @@ cnvkit.py coverage ${sbam} /project/shared/bicf_workflow_ref/GRCh38/UTSWV2.cnvki
cnvkit.py fix ${pair_id}.targetcoverage.cnn ${pair_id}.antitargetcoverage.cnn /project/shared/bicf_workflow_ref/GRCh38/UTSWV2.normals.cnn -o ${pair_id}.cnr
cnvkit.py segment ${pair_id}.cnr -o ${pair_id}.cns
cnvkit.py call ${pair_id}.cns -o ${pair_id}.call.cns
cnvkit.py diagram ${pair_id}.cnr -s ${pair_id}.cns -o ${pair_id}.pdf
cnvkit.py diagram ${pair_id}.cnr -s ${pair_id}.cns -o ${pair_id}.cnv.pdf
......@@ -53,7 +53,7 @@ else
echo "Missing Fasta File: ${index_path}/genome.fa"
usage
fi
module load python/2.7.x-anaconda samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14 parallel
module load python/2.7.x-anaconda picard/2.10.3 samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14 parallel
for i in *.bam; do
if [[ ! -f ${i}.bai ]]
......
!/bin/bash
#svcalling.sh
usage() {
echo "-h Help documentation for gatkrunner.sh"
echo "-r --Path to Reference Genome with the file genome.fa"
echo "-p --Prefix for output file name"
echo "-b --Bam File"
echo "Example: bash svcalling.sh -p prefix -r /path/GRCh38 -a gatk"
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :r:a:b:p:h opt
do
case $opt in
r) index_path=$OPTARG;;
p) pair_id=$OPTARG;;
b) sbam=$OPTARG;;
h) usage;;
esac
done
function join_by { local IFS="$1"; shift; echo "$*"; }
shift $(($OPTIND -1))
baseDir="`dirname \"$0\"`"
# Check for mandatory options
if [[ -z $pair_id ]] || [[ -z $index_path ]]; then
usage
fi
if [[ -z $SLURM_CPUS_ON_NODE ]]
then
SLURM_CPUS_ON_NODE=1
fi
if [[ -a "${index_path}/genome.fa" ]]
then
reffa="${index_path}/genome.fa"
dict="${index_path}/genome.dict"
else
echo "Missing Fasta File: ${index_path}/genome.fa"
usage
fi
baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
module load speedseq/20160506 novoBreak/v1.1.3 delly2/v0.7.7-multi samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14
#RUN DELLY
mkdir temp
delly2 call -t BND -o delly_translocations.bcf -q 30 -g ${reffa} ${sbam}
delly2 call -t DUP -o delly_duplications.bcf -q 30 -g ${reffa} ${sbam}
delly2 call -t INV -o delly_inversions.bcf -q 30 -g ${reffa} ${sbam}
delly2 call -t DEL -o delly_deletion.bcf -q 30 -g ${reffa} ${sbam}
delly2 call -t INS -o delly_insertion.bcf -q 30 -g ${reffa} ${sbam}
delly2 filter -t BND -o delly_tra.bcf -f germline delly_translocations.bcf
delly2 filter -t DUP -o delly_dup.bcf -f germline delly_translocations.bcf
delly2 filter -t INV -o delly_inv.bcf -f germline delly_translocations.bcf
delly2 filter -t DEL -o delly_del.bcf -f germline delly_translocations.bcf
delly2 filter -t INS -o delly_ins.bcf -f germline delly_translocations.bcf
#MERGE DELLY AND MAKE BED
bcftools concat -a -O v delly_dup.bcf delly_inv.bcf delly_tra.bcf delly_del.bcf delly_ins.bcf | vcf-sort > ${pair_id}.delly.vcf
perl $baseDir/vcf2bed.sv.pl ${pair_id}.delly.vcf > delly.bed
bgzip ${pair_id}.delly.vcf
tabix ${pair_id}.delly.vcf.gz
#MAKE FILES FOR LUMPY
samtools sort -@ $SLURM_CPUS_ON_NODE -n -o namesort.bam ${sbam}
samtools view -h namesort.bam | samblaster -M -a --excludeDups --addMateTags --maxSplitCount 2 --minNonOverlap 20 -d discordants.sam -s splitters.sam > temp.sam
gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" splitters.sam | samtools view -S -b - | samtools sort -o splitters.bam -
gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" discordants.sam | samtools view -S -b - | samtools sort -o discordants.bam -
#RUN LUMPY
speedseq sv -t $SLURM_CPUS_ON_NODE -o ${pair_id}.sssv -R ${reffa} -B ${sbam} -D discordants.bam -S splitters.bam -x ${index_path}/exclude_alt.bed
java -jar $SNPEFF_HOME/SnpSift.jar filter "GEN[0].SU > 2" ${pair_id}.sssv.sv.vcf.gz > lumpy.vcf
perl $baseDir/vcf2bed.sv.pl lumpy.vcf > lumpy.bed
#COMPARE DELLY & LUMPY
bedtools intersect -v -a lumpy.bed -b delly.bed > lumpy_only.bed
bedtools intersect -header -b lumpy_only.bed -a lumpy.vcf |bgzip > lumpy_only.vcf.gz
vcf-concat ${pair_id}.delly.vcf.gz lumpy_only.vcf.gz |vcf-sort -t temp > ${pair_id}.sv.vcf
perl $baseDir/vcf2bed.sv.pl ${pair_id}.sv.vcf |sort -V -k 1,1 -k 2,2n | grep -v 'alt' |grep -v 'random' |uniq > svs.bed
bedtools intersect -header -wb -a svs.bed -b ${index_path}/gencode.exons.bed > exonoverlap_sv.txt
bedtools intersect -v -header -wb -a svs.bed -b ${index_path}/gencode.exons.bed | bedtools intersect -header -wb -a stdin -b ${index_path}/gencode.genes.chr.bed > geneoverlap_sv.txt
perl $baseDir/annot_sv.pl -r ${index_path} -i ${pair_id}.sv.vcf
bgzip ${pair_id}.sv.vcf
......@@ -19,7 +19,8 @@ do
done
function join_by { local IFS="$1"; shift; echo "$*"; }
shift $(($OPTIND -1))
baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
baseDir="`dirname \"$0\"`"
module load bedtools/2.26.0 samtools/1.6
HS=*.hotspot.vcf.gz
......
#!/usr/bin/perl
use strict;
if (scalar @ARGV != 1) {
print "vcf2bed.pl <vcf_file>\n";
exit 1;
}
my $vcfFile = $ARGV[0];
open VCF, $vcfFile
or die "Cannot open $vcfFile";
my $ct = 0;
while (<VCF>) {
next if (index($_, '#') == 0);
my @fields = split /\t/, $_;
$ct ++;
my $chrom = $fields[0];
my $pos = $fields[1];
my $ref = $fields[3];
my $annot = $fields[7];
my $alt = $fields[4];
if ($fields[2] eq 'N') {
$fields[2] = 'NB'.sprintf("%06s",$ct);
}
my %hash = ();
next if ($chrom =~ m/_/);
foreach $a (split(/;/,$annot)) {
my ($key,$val) = split(/=/,$a);
$hash{$key} = $val unless ($hash{$key});
}
$hash{'END'} = $pos+1 unless $hash{'END'};
next if ($hash{CHR2} && $hash{CHR2} =~ m/_/);
if ($alt =~ m/chr(\w+):(\d+)/i) {
if ($1 eq $chrom) {
my $end = $2;
if ($pos > $end) {
my $temp = $end;
$end = $pos;
$pos = $temp;
}
print join("\t",$chrom,$pos,$end,$fields[2]),"\n";
}elsif ($fields[2] =~ m/_\d+/) {
print join("\t",$chrom,$pos,$hash{END},$fields[2]),"\n";
}else {
print join("\t",$chrom,$pos,$pos+1,$fields[2]."_1"),"\n";
print join("\t",'chr'.$1,$2,$2+1,$fields[2]."_2"),"\n";
}
}else {
if ($hash{CHR2} && $hash{CHR2} eq $chrom) {
my $end = $hash{END};
if ($pos > $end) {
my $temp = $end;
$end = $pos;
$pos = $temp;
}
print join("\t",$chrom,$pos,$end,$fields[2]),"\n";
}elsif ($hash{CHR2} && $hash{CHR2} ne $chrom) {
print join("\t",$chrom,$pos,$pos+1,$fields[2]."_1"),"\n";
print join("\t",$hash{CHR2},$hash{END},$hash{END}+1,$fields[2]."_2"),"\n";
}unless ($hash{CHR2}) {
print join("\t",$chrom,$pos,$hash{END},$fields[2]),"\n";
}
}
}
close VCF;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment