Skip to content
Snippets Groups Projects
Commit 3d9955f6 authored by Brandi Cantarel's avatar Brandi Cantarel
Browse files

update for variant calling debugging

parent 1fa984ad
No related merge requests found
#!/bin/env python
import sys
import pysam
import argparse
def get_args():
'''Define arguments.'''
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-s', '--sam',
help="The sam file",
required=True)
parser.add_argument('-o', '--out',
help="The outfile",
default='outfile.bam')
args = parser.parse_args()
return args
# set the tag names - take a look at SAM spec to pick an appropriate one
args = get_args()
infile = pysam.AlignmentFile(args.sam, "r")
out = pysam.AlignmentFile(args.out, "wb", template=infile)
for read in infile.fetch():
read.set_tag('RX', read.qname.split(":")[-1])
out.write(read)
infile.close()
out.close()
......@@ -12,13 +12,13 @@ usage() {
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :r:b:n:p:h opt
while getopts :r:b:c:n:p:h opt
do
case $opt in
r) index_path=$OPTARG;;
b) sbam=$OPTARG;;
c) bed=$OPTARG;;
y) nuctype=$OPTARG;;
n) nuctype=$OPTARG;;
p) pair_id=$OPTARG;;
h) usage;;
esac
......@@ -27,13 +27,19 @@ done
shift $(($OPTIND -1))
# Check for mandatory options
if [[ -z $pair_id ]] || [[ -z $sbam ]]; then
usage
fi
#if [[ -z $pair_id ]] || [[ -z $sbam ]]; then
# usage
#fi
module load samtools/1.6 fastqc/0.11.5
samtools flagstat ${sbam} > ${pair_id}.flagstat.txt
fastqc -f bam ${sbam}
baseDir="`dirname \"$0\"`"
if [[ -z $SLURM_CPUS_ON_NODE ]]
then
SLURM_CPUS_ON_NODE=1
fi
if [[ $nuctype == 'dna' ]]; then
module load bedtools/2.26.0 picard/2.10.3
......@@ -42,10 +48,10 @@ if [[ $nuctype == 'dna' ]]; then
samtools flagstat ${pair_id}.ontarget.bam > ${pair_id}.ontarget.flagstat.txt
samtools view -b -q 1 ${pair_id}.ontarget.bam | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt
samtools view -b -F 1024 ${pair_id}.ontarget.bam | bedtools coverage -sorted -g ${index_path}/genomefile.txt -a ${bed} -b stdin -hist | grep ^all > ${pair_id}.dedupcov.txt
java -Xmx32g -jar \$PICARD/picard.jar CollectAlignmentSummaryMetrics R=${reffa} I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt
java -Xmx64g -jar \$PICARD/picard.jar EstimateLibraryComplexity I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.libcomplex.txt
samtools view -F 1024 ${pair_id}.ontarget.bam | awk '{sum+=\$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt
java -Xmx4g -jar \$PICARD/picard.jar CollectInsertSizeMetrics INPUT=${pair_id}.bam HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${reffa} OUTPUT=${pair_id}.hist.txt
java -Xmx32g -jar $PICARD/picard.jar CollectAlignmentSummaryMetrics R=${reffa} I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt
java -Xmx64g -jar $PICARD/picard.jar EstimateLibraryComplexity I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.libcomplex.txt
samtools view -F 1024 ${pair_id}.ontarget.bam | awk '{sum+=$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt
java -Xmx4g -jar $PICARD/picard.jar CollectInsertSizeMetrics INPUT=${pair_id}.bam HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${reffa} OUTPUT=${pair_id}.hist.txt
samtools view -b -q 1 ${pair_id}.ontarget.bam | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt
bedtools coverage -sorted -g ${index_path}/genomefile.txt -a ${bed} -b ${sbam} -hist > ${pair_id}.covhist.txt
perl $baseDir/scripts/calculate_depthcov.pl ${pair_id}.covhist.txt
......
......@@ -2,22 +2,23 @@
#dnaseqalign.sh
usage() {
echo "-h Help documentation for hisat.sh"
echo "-h Help documentation for dnaseqalign.sh"
echo "-r --Reference Genome: GRCh38 or GRCm38"
echo "-x --FastQ R1"
echo "-y --FastQ R2"
echo "-p --Prefix for output file name"
echo "Example: bash dnaseqalign.sh -p prefix -r /project/shared/bicf_workflow_ref/GRCh38 -x SRR1551047_1.fastq.gz -y SRR1551047_2.fastq.gz"
echo "-u UMI"
echo "Example: bash dnaseqalign.sh -p prefix -u 1 -r /project/shared/bicf_workflow_ref/GRCh38 -x SRR1551047_1.fastq.gz -y SRR1551047_2.fastq.gz"
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :r:a:x:y:p:h opt
while getopts :r:x:y:p:uh opt
do
case $opt in
r) index_path=$OPTARG;;
x) fq1=$OPTARG;;
y) fq2=$OPTARG;;
a) algo=$OPTARG;;
u) umi='umi';;
p) pair_id=$OPTARG;;
h) usage;;
esac
......@@ -35,25 +36,27 @@ then
SLURM_CPUS_ON_NODE=1
fi
module load bwakit/0.7.15 bwa/intel/0.7.15 samtools/1.6 bcftools/1.6 htslib/1.6 picard/2.10.3 speedseq/20160506
if ($fq1 != $fq2)
module load bwakit/0.7.15 bwa/intel/0.7.15 samtools/1.6 bcftools/1.6 htslib/1.6 picard/2.10.3
baseDir="`dirname \"$0\"`"
if [[ $fq1 == $fq2 ]]
then
bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} > out.sam
bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} > out.sam
else
bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} ${fq2} > out.sam
bwa mem -M -t $SLURM_CPUS_ON_NODE -R "@RG\tID:${pair_id}\tLB:tx\tPL:illumina\tPU:barcode\tSM:${pair_id}" ${index_path}/genome.fa ${fq1} ${fq2} > out.sam
fi
if [[ $index_path == '/project/shared/bicf_workflow_ref/GRCh38' ]]
if [[ $umi=='umi' ]]
then
cat out.sam | k8 /cm/shared/apps/bwakit/0.7.15/bwa-postalt.js -p ${pair_id}.hla ${index_path}/genome.fa.alt | samtools view -1 - > output.unsort.bam
run-HLA ${pair_id}.hla > ${pair_id}.hla.top 2> ${pair_id}.log.hla
touch ${pair_id}.hla.HLA-dummy.gt
cat ${pair_id}.hla.HLA*.gt | grep ^GT | cut -f2- > ${pair_id}.hla.all
k8 /cm/shared/apps/bwakit/0.7.15/bwa-postalt.js -p ${pair_id}.hla ${index_path}/genome.fa.alt out.sam | python ${baseDir}/add_umi_bam.py -s - -o output.unsort.bam
elif [[ $index_path == '/project/shared/bicf_workflow_ref/GRCh38' ]]
then
k8 /cm/shared/apps/bwakit/0.7.15/bwa-postalt.js -p ${pair_id}.hla ${index_path}/genome.fa.alt out.sam| samtools view -1 - > output.unsort.bam
else
samtools view -1 -o output.unsort.bam out.sam
fi
samtools sort --threads $SLURM_CPUS_ON_NODE -o output.dups.bam output.unsort.bam
samtools view -h output.unsort.bam | samblaster -M -a --excludeDups --addMateTags --maxSplitCount 2 --minNonOverlap 20 -d discordants.sam -s splitters.sam > temp.sam
gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" splitters.sam | samtools view -S -b - | samtools sort -o ${pair_id}.splitters.bam -
gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" discordants.sam | samtools view -S -b - | samtools sort -o ${pair_id}.discordants.bam -
fi
samtools sort -n --threads $SLURM_CPUS_ON_NODE -o output.dups.bam output.unsort.bam
java -Djava.io.tmpdir=./ -Xmx4g -jar $PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.dups.bam O=${pair_id}.bam
samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam
......@@ -31,6 +31,7 @@ if [[ -z $SLURM_CPUS_ON_NODE ]]
then
SLURM_CPUS_ON_NODE=1
fi
baseDir="`dirname \"$0\"`"
module load picard/2.10.3 samtools/1.6
......@@ -53,6 +54,7 @@ then
java -Djava.io.tmpdir=./ -Xmx4g -jar $PICARD/picard.jar MarkDuplicates BARCODE_TAG=RX I=${sbam} O=${pair_id}.dedup.bam M=${pair_id}.dedup.stat.txt
elif [ $algo == 'fgbio_umi' ]
then
samtools index -@ $SLURM_CPUS_ON_NODE ${sbam}
source activate fgbiotools
fgbio GroupReadsByUmi -s identity -i ${sbam} -o ${pair_id}.group.bam -m 10
fgbio CallMolecularConsensusReads -i ${pair_id}.group.bam -p consensus -M 1 -o ${pair_id}.consensus.bam -S ':none:'
......
......@@ -8,17 +8,19 @@ usage() {
echo "-y --FastQ R2"
echo "-a --Method: hisat or star"
echo "-p --Prefix for output file name"
echo "Example: bash hisat.sh -p prefix -r GRCh38 -a hisat -x SRR1551047_1.fastq.gz -y SRR1551047_2.fastq.gz"
echo "-u [UMI sequences are in FQ Read Name]"
echo "Example: bash rnaseqalign.sh -a hisat -p prefix -u -r /project/shared/bicf_workflow_ref/GRCh38 -x SRR1551047_1.fastq.gz -y SRR1551047_2.fastq.gz"
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :r:a:x:y:p:h opt
while getopts :r:a:x:y:p:hu opt
do
case $opt in
r) index_path=$OPTARG;;
x) fq1=$OPTARG;;
y) fq2=$OPTARG;;
a) algo=$OPTARG;;
u) umi=1;;
p) pair_id=$OPTARG;;
h) usage;;
esac
......@@ -32,6 +34,7 @@ if [[ -z $pair_id ]] || [[ -z $fq1 ]]; then
fi
module load samtools/gcc/1.6 picard/2.10.3
baseDir="`dirname \"$0\"`"
if [[ -z $SLURM_CPUS_ON_NODE ]]
then
SLURM_CPUS_ON_NODE=1
......@@ -57,6 +60,11 @@ else
fi
samtools view -1 --threads $SLURM_CPUS_ON_NODE -o output.bam out.sam
fi
if [[ $umi==1 ]]
then
python ${baseDir}/add_umi_bam.py -b output.bam -o output.unsort2.bam
mv output.unsort2.bam output.bam
fi
samtools sort -@ $SLURM_CPUS_ON_NODE -O BAM -n -o output.nsort.bam output.bam
java -jar $PICARD/picard.jar FixMateInformation ASSUME_SORTED=TRUE SORT_ORDER=coordinate ADD_MATE_CIGAR=TRUE I=output.nsort.bam O=${pair_id}.bam
samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.bam
#!/usr/bin/perl -w
#svvcf2bed.pl
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
my %opt = ();
my $results = GetOptions (\%opt,'input|i=s','refdb|r=s','help|h');
my $vcf = $opt{input};
my %lines;
my %eventid;
my $ct = 0;
open IN, "<$vcf" or die $!;
while (my $line = <IN>) {
chomp($line);
if ($line =~ m/^#CHROM/) {
my @header = split(/\t/,$line);
($chrom, $pos,$id,$ref,$alt,$score,
$filter,$info,$format,@subjacc) = split(/\t/, $line);
}
next if $line =~ m/#/;
my ($chrom, $pos,$id,$ref,$alt,$score,
$filter,$annot,$format,@gts) = split(/\t/, $line);
$ct ++;
my %hash = ();
foreach $a (split(/;/,$annot)) {
my ($key,$val) = split(/=/,$a);
$hash{$key} = $val unless ($hash{$key});
}
if ($id eq 'N') {
$id = 'NB'.sprintf("%06s",$ct);
}
my $evid = (split(/_/,$id))[0];
$hash{'END'} = $pos+1 unless $hash{'END'};
@deschead = split(":",$format);
F1:foreach $sample (@subjacc) {
my $allele_info = shift @gts;
@ainfo = split(/:/, $allele_info);
my %gtinfo = ();
foreach $k (0..$#deschead) {
$gtinfo{$deschead[$k]} = $ainfo[$k];
}
unless ($gtinfo{SU}) {
$gtinfo{SU} = 0;
$gtinfo{SU} = $gtinfo{RV}+$gtinfo{DV} if ($gtinfo{RV} && $gtinfo{DV});
}
$gtinfo{CN} = '' unless $gtinfo{CN};
if ($alt =~ m/chr(\w+):(\d+)/i) {
if ($1 eq $chrom) {
my $locus = join(":",$chrom,$pos,$2);
$eventid{$id} = $locus;
$lines{$evid}{$locus}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
}elsif ($id =~ m/_\d+/) {
my $locus = join(":",$chrom,$pos,$hash{END});
$eventid{$id} = $locus;
$lines{$evid}{$locus}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
}else {
my $locus1 = join(":",$chrom,$pos,$hash{END});
my $id1 = $id."_1";
$eventid{$id1} = $locus1;
my $locus2 = join(":",'chr'.$1,$2,$2+1);
my $id2 = $id."_2";
$eventid{$id2} = $locus2;
$lines{$evid}{$locus1}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
$lines{$evid}{$locus2}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
}
}else {
if ($hash{CHR2} && $hash{CHR2} eq $chrom) {
my $locus = join(":",$chrom,$pos,$hash{END});
$eventid{$id} = $locus;
$lines{$evid}{$locus}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
}elsif ($hash{CHR2} && $hash{CHR2} ne $chrom) {
my $locus1 = join(":",$chrom,$pos,$pos+1);
my $id1 = $id."_1";
$eventid{$id1} = $locus1;
my $locus2 = join(":",$hash{CHR2},$hash{END},$hash{END}+1);
my $id2 = $id."_2";
$eventid{$id2} = $locus2;
$lines{$evid}{$locus1}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
$lines{$evid}{$locus2}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
}unless ($hash{CHR2}) {
my $locus = join(":",$chrom,$pos,$hash{END});
$eventid{$id} = $locus;
$lines{$evid}{$locus}{$sample} = join("\t",$hash{SVTYPE},$gtinfo{CN},$gtinfo{SU});
}
}
}
}
close IN;
my @files = ("exonoverlap_sv.txt","geneoverlap_sv.txt");
foreach $file (@files) {
open IN, "<$file" or die $!;
while (my $line = <IN>) {
chomp($line);
next if ($line =~ m/#/);
my ($chrom,$start,$end,$id,$chr,$start2,$end2,$gene) = split(/\t/, $line);
my $evid = (split(/_/,$id))[0];
my $locus = $eventid{$id};
unless ($locus) {
$locus = $eventid{$evid};
}
if ($file eq 'exonoverlap_sv.txt') {
push @{$gene{$locus}}, $gene;
$inc{$evid} = 1;
}elsif ($file eq 'geneoverlap_sv.txt') {
push @{$gene{$locus}}, $gene;
$inc{$evid} = 1;
}
}
}
my $outfile = $vcf;
$outfile =~ s/vcf/annot.txt/g;
open OUT, ">$outfile" or die $!;
foreach $events (sort keys %lines) {
next unless ($inc{$events});
foreach $locus (sort {$a cmp $b} keys %{$lines{$events}}) {
my %gene_annots;
if ($gene{$locus}) {
my @genes = @{$gene{$locus}};
my %hash;
foreach $g (@genes) {
foreach $trx (split(/,/,$g)) {
my ($symbol,$trxid,$exonnum) = split(/\|/,$trx);
next unless $exonnum;
$hash{$symbol}{$trxid}{$exonnum} = 1;
}
}
foreach $sym (keys %hash) {
foreach $trxid (keys %{$hash{$sym}}) {
my @exons = sort {$a <=> $b} keys %{$hash{$sym}{$trxid}};
my $exon_bound = $exons[0];
if ($#exons > 0) {
$exon_bound = join("-",$exons[0],$exons[-1]);
}
$gene_annots{$sym}{$exon_bound} = 1;
}
}
}
foreach $sample (sort {$a cmp $b} keys %{$lines{$events}{$locus}}) {
foreach $genesym (keys %gene_annots) {
my ($SVTYPE,$CN,$SU) = split(/\t/,$lines{$events}{$locus}{$sample});
next if $SU < 3;
print OUT join("\t",'SV.'.$events,split(":",$locus),$lines{$events}{$locus}{$sample},
$genesym, join(";",sort {$a cmp $b} keys %{$gene_annots{$genesym}}),$sample),"\n";
}
}
}
}
......@@ -35,4 +35,4 @@ cnvkit.py coverage ${sbam} /project/shared/bicf_workflow_ref/GRCh38/UTSWV2.cnvki
cnvkit.py fix ${pair_id}.targetcoverage.cnn ${pair_id}.antitargetcoverage.cnn /project/shared/bicf_workflow_ref/GRCh38/UTSWV2.normals.cnn -o ${pair_id}.cnr
cnvkit.py segment ${pair_id}.cnr -o ${pair_id}.cns
cnvkit.py call ${pair_id}.cns -o ${pair_id}.call.cns
cnvkit.py diagram ${pair_id}.cnr -s ${pair_id}.cns -o ${pair_id}.pdf
cnvkit.py diagram ${pair_id}.cnr -s ${pair_id}.cns -o ${pair_id}.cnv.pdf
......@@ -53,7 +53,7 @@ else
echo "Missing Fasta File: ${index_path}/genome.fa"
usage
fi
module load python/2.7.x-anaconda samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14 parallel
module load python/2.7.x-anaconda picard/2.10.3 samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14 parallel
for i in *.bam; do
if [[ ! -f ${i}.bai ]]
......
!/bin/bash
#svcalling.sh
usage() {
echo "-h Help documentation for gatkrunner.sh"
echo "-r --Path to Reference Genome with the file genome.fa"
echo "-p --Prefix for output file name"
echo "-b --Bam File"
echo "Example: bash svcalling.sh -p prefix -r /path/GRCh38 -a gatk"
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :r:a:b:p:h opt
do
case $opt in
r) index_path=$OPTARG;;
p) pair_id=$OPTARG;;
b) sbam=$OPTARG;;
h) usage;;
esac
done
function join_by { local IFS="$1"; shift; echo "$*"; }
shift $(($OPTIND -1))
baseDir="`dirname \"$0\"`"
# Check for mandatory options
if [[ -z $pair_id ]] || [[ -z $index_path ]]; then
usage
fi
if [[ -z $SLURM_CPUS_ON_NODE ]]
then
SLURM_CPUS_ON_NODE=1
fi
if [[ -a "${index_path}/genome.fa" ]]
then
reffa="${index_path}/genome.fa"
dict="${index_path}/genome.dict"
else
echo "Missing Fasta File: ${index_path}/genome.fa"
usage
fi
baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
module load speedseq/20160506 novoBreak/v1.1.3 delly2/v0.7.7-multi samtools/1.6 bedtools/2.26.0 snpeff/4.3q vcftools/0.1.14
#RUN DELLY
mkdir temp
delly2 call -t BND -o delly_translocations.bcf -q 30 -g ${reffa} ${sbam}
delly2 call -t DUP -o delly_duplications.bcf -q 30 -g ${reffa} ${sbam}
delly2 call -t INV -o delly_inversions.bcf -q 30 -g ${reffa} ${sbam}
delly2 call -t DEL -o delly_deletion.bcf -q 30 -g ${reffa} ${sbam}
delly2 call -t INS -o delly_insertion.bcf -q 30 -g ${reffa} ${sbam}
delly2 filter -t BND -o delly_tra.bcf -f germline delly_translocations.bcf
delly2 filter -t DUP -o delly_dup.bcf -f germline delly_translocations.bcf
delly2 filter -t INV -o delly_inv.bcf -f germline delly_translocations.bcf
delly2 filter -t DEL -o delly_del.bcf -f germline delly_translocations.bcf
delly2 filter -t INS -o delly_ins.bcf -f germline delly_translocations.bcf
#MERGE DELLY AND MAKE BED
bcftools concat -a -O v delly_dup.bcf delly_inv.bcf delly_tra.bcf delly_del.bcf delly_ins.bcf | vcf-sort > ${pair_id}.delly.vcf
perl $baseDir/vcf2bed.sv.pl ${pair_id}.delly.vcf > delly.bed
bgzip ${pair_id}.delly.vcf
tabix ${pair_id}.delly.vcf.gz
#MAKE FILES FOR LUMPY
samtools sort -@ $SLURM_CPUS_ON_NODE -n -o namesort.bam ${sbam}
samtools view -h namesort.bam | samblaster -M -a --excludeDups --addMateTags --maxSplitCount 2 --minNonOverlap 20 -d discordants.sam -s splitters.sam > temp.sam
gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" splitters.sam | samtools view -S -b - | samtools sort -o splitters.bam -
gawk '{ if ($0~"^@") { print; next } else { $10="*"; $11="*"; print } }' OFS="\t" discordants.sam | samtools view -S -b - | samtools sort -o discordants.bam -
#RUN LUMPY
speedseq sv -t $SLURM_CPUS_ON_NODE -o ${pair_id}.sssv -R ${reffa} -B ${sbam} -D discordants.bam -S splitters.bam -x ${index_path}/exclude_alt.bed
java -jar $SNPEFF_HOME/SnpSift.jar filter "GEN[0].SU > 2" ${pair_id}.sssv.sv.vcf.gz > lumpy.vcf
perl $baseDir/vcf2bed.sv.pl lumpy.vcf > lumpy.bed
#COMPARE DELLY & LUMPY
bedtools intersect -v -a lumpy.bed -b delly.bed > lumpy_only.bed
bedtools intersect -header -b lumpy_only.bed -a lumpy.vcf |bgzip > lumpy_only.vcf.gz
vcf-concat ${pair_id}.delly.vcf.gz lumpy_only.vcf.gz |vcf-sort -t temp > ${pair_id}.sv.vcf
perl $baseDir/vcf2bed.sv.pl ${pair_id}.sv.vcf |sort -V -k 1,1 -k 2,2n | grep -v 'alt' |grep -v 'random' |uniq > svs.bed
bedtools intersect -header -wb -a svs.bed -b ${index_path}/gencode.exons.bed > exonoverlap_sv.txt
bedtools intersect -v -header -wb -a svs.bed -b ${index_path}/gencode.exons.bed | bedtools intersect -header -wb -a stdin -b ${index_path}/gencode.genes.chr.bed > geneoverlap_sv.txt
perl $baseDir/annot_sv.pl -r ${index_path} -i ${pair_id}.sv.vcf
bgzip ${pair_id}.sv.vcf
......@@ -19,7 +19,8 @@ do
done
function join_by { local IFS="$1"; shift; echo "$*"; }
shift $(($OPTIND -1))
baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
baseDir="`dirname \"$0\"`"
module load bedtools/2.26.0 samtools/1.6
HS=*.hotspot.vcf.gz
......
#!/usr/bin/perl
use strict;
if (scalar @ARGV != 1) {
print "vcf2bed.pl <vcf_file>\n";
exit 1;
}
my $vcfFile = $ARGV[0];
open VCF, $vcfFile
or die "Cannot open $vcfFile";
my $ct = 0;
while (<VCF>) {
next if (index($_, '#') == 0);
my @fields = split /\t/, $_;
$ct ++;
my $chrom = $fields[0];
my $pos = $fields[1];
my $ref = $fields[3];
my $annot = $fields[7];
my $alt = $fields[4];
if ($fields[2] eq 'N') {
$fields[2] = 'NB'.sprintf("%06s",$ct);
}
my %hash = ();
next if ($chrom =~ m/_/);
foreach $a (split(/;/,$annot)) {
my ($key,$val) = split(/=/,$a);
$hash{$key} = $val unless ($hash{$key});
}
$hash{'END'} = $pos+1 unless $hash{'END'};
next if ($hash{CHR2} && $hash{CHR2} =~ m/_/);
if ($alt =~ m/chr(\w+):(\d+)/i) {
if ($1 eq $chrom) {
my $end = $2;
if ($pos > $end) {
my $temp = $end;
$end = $pos;
$pos = $temp;
}
print join("\t",$chrom,$pos,$end,$fields[2]),"\n";
}elsif ($fields[2] =~ m/_\d+/) {
print join("\t",$chrom,$pos,$hash{END},$fields[2]),"\n";
}else {
print join("\t",$chrom,$pos,$pos+1,$fields[2]."_1"),"\n";
print join("\t",'chr'.$1,$2,$2+1,$fields[2]."_2"),"\n";
}
}else {
if ($hash{CHR2} && $hash{CHR2} eq $chrom) {
my $end = $hash{END};
if ($pos > $end) {
my $temp = $end;
$end = $pos;
$pos = $temp;
}
print join("\t",$chrom,$pos,$end,$fields[2]),"\n";
}elsif ($hash{CHR2} && $hash{CHR2} ne $chrom) {
print join("\t",$chrom,$pos,$pos+1,$fields[2]."_1"),"\n";
print join("\t",$hash{CHR2},$hash{END},$hash{END}+1,$fields[2]."_2"),"\n";
}unless ($hash{CHR2}) {
print join("\t",$chrom,$pos,$hash{END},$fields[2]),"\n";
}
}
}
close VCF;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment