Skip to content
Snippets Groups Projects
Commit 8d9c724f authored by Brandi Cantarel's avatar Brandi Cantarel
Browse files

Merge branch 'master' of git.biohpc.swmed.edu:BICF/Astrocyte/process_scripts

parents 2d64c494 66010ebb
No related merge requests found
#!/usr/bin/perl -w
#integrate_datasets.pl
#module load vcftools/0.1.14 samtools/1.6 bedtools/2.26.0
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
my %opt = ();
my $results = GetOptions (\%opt,'in|i=s','pid|p=s','tumor|t=s');
open VCFOUT, ">$opt{pid}\.delly.vcf" or die $!;
open DELFUS, ">$opt{pid}\.delly.potentialfusion.txt" or die $!;
open IN, "gunzip -c $opt{in} |" or die $!;
W1:while (my $line = <IN>) {
chomp($line);
if ($line =~ m/^#/) {
if ($line =~ m/^#CHROM/) {
print OUT "##INFO=<ID=AF,Number=A,Type=Integer,Description=\"Alternate allele observation frequency\">\n";
my @header = split(/\t/,$line);
($chrom, $pos,$id,$ref,$alt,$score,
$filter,$info,$format,@gtheader) = split(/\t/, $line);
unless ($opt{tumor}) {
if (grep(/T_DNA/,@gtheader)) {
my @tsamps = grep(/T_DNA/,@gtheader);
$opt{tumor} = $tsamps[0];
}else {
$opt{tumor} = $gtheader[0];
}
}
}
print VCFOUT $line,"\n";
}
my ($chrom, $pos,$id,$ref,$alt,$score,
$filter,$annot,$format,@gts) = split(/\t/, $line);
next if ($ref =~ m/\./ || $alt =~ m/\./ || $alt=~ m/,X/);
my %hash = ();
foreach $a (split(/;/,$annot)) {
my ($key,$val) = split(/=/,$a);
$hash{$key} = $val unless ($hash{$key});
}
next unless ($hash{ANN});
next unless ($hash{ANN} =~ m/HIGH|MODERATE|LOW/);
my %gtinfo = ();
my @deschead = split(/:/,$format);
F1:foreach my $k (0..$#gtheader) {
my $subjid = $gtheader[$k];
my $allele_info = $gts[$k];
my @ainfo = split(/:/, $allele_info);
my @mutallfreq = ();
foreach my $k (0..$#ainfo) {
$gtinfo{$subjid}{$deschead[$k]} = $ainfo[$k];
}
$gtinfo{$subjid}{DP} = (split(/,/,$gtinfo{$subjid}{DP}))[0] if ($gtinfo{$subjid}{DP});
next F1 unless ($gtinfo{$subjid}{DP} && $gtinfo{$subjid}{DP} ne '.' && $gtinfo{$subjid}{DP} >= 1);
my @altct = split(/,/,$gtinfo{$subjid}{AD});
my $refct = shift @altct;
@altct2 = split(/,/,$gtinfo{$subjid}{AO});
if (scalar(@altct) ne scalar(@altct2)) {
warn "Inconsistent Allele counts @ $chrom,$pos,$alt,$gtinfo{$subjid}{AD},$gtinfo{$subjid}{AO}\n";
}
my $total = $refct;
foreach my $act (@altct) {
next if ($act eq '.');
$total += $act;
push @mutallfreq, sprintf("%.4f",$act/$gtinfo{$subjid}{DP});
}
$gtinfo{$subjid}{MAF} = \@mutallfreq;
}
next unless ($gtinfo{$opt{tumor}}{DP} && $gtinfo{$opt{tumor}}{DP} ne '.' && $gtinfo{$opt{tumor}}{DP} >= 20);
unless ($gtinfo{$opt{tumor}}{AO} =~ m/\d+/ && $gtinfo{$opt{tumor}}{AD} =~ m/,/) {
warn "Missing Alt:$line\n";
}
@tumormaf = @{$gtinfo{$opt{tumor}}{MAF}};
@tumoraltct = split(/,/,$gtinfo{$opt{tumor}}{AO});
next if ($tumoraltct[0] eq '.');
$hash{AF} = join(",",@tumormaf);
next if ($tumoraltct[0] < 20);
next if ($tumormaf[0] < 0.01);
my $keepforvcf = 0;
my $keeptrx;
F1:foreach $trx (split(/,/,$hash{ANN})) {
my ($allele,$effect,$impact,$gene,$geneid,$feature,
$featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna,
$cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$trx);
next unless ($impact =~ m/HIGH|MODERATE/ || $effect =~ /splice/i);
next if($effect eq 'sequence_feature');
$keeptrx = $trx;
$keepforvcf = $gene;
last F1;
}
next unless $keepforvcf;
my @nannot;
foreach $info (sort {$a cmp $b} keys %hash) {
if (defined $hash{$info}) {
push @nannot, $info."=".$hash{$info};
}else {
push @nannot, $info;
}
}
my $newannot = join(";",@nannot);
if ($hash{SVTYPE} eq 'INS' || ($hash{SVTYPE} eq 'DEL' && $keepforvcf !~ m/&/)) {
print VCFOUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$newannot,
$format,@gts),"\n";
}elsif ($hash{END} && $hash{END} - $pos > 9999) {
my ($allele,$effect,$impact,$gene,$geneid,$feature,
$featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna,
$cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$keeptrx);
print DELFUS join("\t",$chrom,$pos,$hash{CHR2},$hash{END},$effect,$gene,$biotype,$filter,$format,@gts),"\n";
}
}
close IN;
#!/usr/bin/perl -w
#integrate_datasets.pl
#module load vcftools/0.1.14 samtools/1.6 bedtools/2.26.0
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
my %opt = ();
my $results = GetOptions (\%opt,'in|i=s','pid|p=s','tumor|t=s');
open VCFOUT, ">$opt{pid}\.svaba.vcf" or die $!;
open DELFUS, ">$opt{pid}\.svaba.potentialfusion.txt" or die $!;
open IN, "gunzip -c $opt{in} |" or die $!;
W1:while (my $line = <IN>) {
chomp($line);
if ($line =~ m/^#/) {
if ($line =~ m/^#CHROM/) {
print OUT "##INFO=<ID=AF,Number=A,Type=Integer,Description=\"Alternate allele observation frequency\">\n";
my @header = split(/\t/,$line);
($chrom, $pos,$id,$ref,$alt,$score,
$filter,$info,$format,@gtheader) = split(/\t/, $line);
unless ($opt{tumor}) {
if (grep(/T_DNA/,@gtheader)) {
my @tsamps = grep(/T_DNA/,@gtheader);
$opt{tumor} = $tsamps[0];
}else {
$opt{tumor} = $gtheader[0];
}
}
}
print VCFOUT $line,"\n";
}
my ($chrom, $pos,$id,$ref,$alt,$score,
$filter,$annot,$format,@gts) = split(/\t/, $line);
next if ($ref =~ m/\./ || $alt =~ m/\./ || $alt=~ m/,X/);
my %hash = ();
foreach $a (split(/;/,$annot)) {
my ($key,$val) = split(/=/,$a);
$hash{$key} = $val unless ($hash{$key});
}
if (length($alt) > length($ref)) {
$hash{SVTYPE} = "INS";
$hash{END} = $pos;
}elsif (length($alt) < length($ref)) {
my $diff = substr($ref, length($alt));
$hash{SVTYPE} = "DEL";
$hash{END} = $pos + length($diff);
}
next unless ($hash{ANN});
next unless ($hash{ANN} =~ m/HIGH|MODERATE|LOW/);
my %gtinfo = ();
my @deschead = split(/:/,$format);
F1:foreach my $k (0..$#gtheader) {
my $subjid = $gtheader[$k];
my $allele_info = $gts[$k];
my @ainfo = split(/:/, $allele_info);
my @mutallfreq = ();
foreach my $k (0..$#ainfo) {
$gtinfo{$subjid}{$deschead[$k]} = $ainfo[$k];
}
$gtinfo{$subjid}{DP} = (split(/,/,$gtinfo{$subjid}{DP}))[0] if ($gtinfo{$subjid}{DP});
next F1 unless ($gtinfo{$subjid}{DP} && $gtinfo{$subjid}{DP} ne '.' && $gtinfo{$subjid}{DP} >= 1);
my @altct = split(/,/,$gtinfo{$subjid}{AD});
my $refct = shift @altct;
@altct2 = split(/,/,$gtinfo{$subjid}{AO});
if (scalar(@altct) ne scalar(@altct2)) {
warn "Inconsistent Allele counts @ $chrom,$pos,$alt,$gtinfo{$subjid}{AD},$gtinfo{$subjid}{AO}\n";
}
my $total = $refct;
foreach my $act (@altct) {
next if ($act eq '.');
$total += $act;
push @mutallfreq, sprintf("%.4f",$act/$gtinfo{$subjid}{DP});
}
$gtinfo{$subjid}{MAF} = \@mutallfreq;
}
next unless ($gtinfo{$opt{tumor}}{DP} && $gtinfo{$opt{tumor}}{DP} ne '.' && $gtinfo{$opt{tumor}}{DP} >= 20);
unless ($gtinfo{$opt{tumor}}{AO} =~ m/\d+/ && $gtinfo{$opt{tumor}}{AD} =~ m/,/) {
warn "Missing Alt:$line\n";
}
@tumormaf = @{$gtinfo{$opt{tumor}}{MAF}};
@tumoraltct = split(/,/,$gtinfo{$opt{tumor}}{AO});
next if ($tumoraltct[0] eq '.');
$hash{AF} = join(",",@tumormaf);
next if ($tumoraltct[0] < 20);
next if ($tumormaf[0] < 0.01);
my $keepforvcf = 0;
my $keeptrx;
F1:foreach $trx (split(/,/,$hash{ANN})) {
my ($allele,$effect,$impact,$gene,$geneid,$feature,
$featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna,
$cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$trx);
next unless ($impact =~ m/HIGH|MODERATE/ || $effect =~ /splice/i);
next if($effect eq 'sequence_feature');
$keeptrx = $trx;
$keepforvcf = $gene;
last F1;
}
next unless $keepforvcf;
my @nannot;
foreach $info (sort {$a cmp $b} keys %hash) {
if (defined $hash{$info}) {
push @nannot, $info."=".$hash{$info};
}else {
push @nannot, $info;
}
}
my $newannot = join(";",@nannot);
if ($hash{SVTYPE} eq 'INS' || ($hash{SVTYPE} eq 'DEL' && $keepforvcf !~ m/&/)) {
print VCFOUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$newannot,
$format,@gts),"\n";
}elsif ($hash{SPAN} && $hash{SPAN} > 9999) {
my ($allele,$effect,$impact,$gene,$geneid,$feature,
$featureid,$biotype,$rank,$codon,$aa,$pos_dna,$len_cdna,
$cds_pos,$cds_len,$aapos,$aalen,$distance,$err) = split(/\|/,$keeptrx);
print DELFUS join("\t",$chrom,$pos,$hash{CHR2},$hash{END},$effect,$gene,$biotype,$filter,$format,@gts),"\n";
}
}
close IN;
...@@ -77,6 +77,7 @@ then ...@@ -77,6 +77,7 @@ then
delly2 call -t INS -o ${pair_id}.delly_insertion.bcf -q 30 -g ${reffa} ${sbam} ${normal} delly2 call -t INS -o ${pair_id}.delly_insertion.bcf -q 30 -g ${reffa} ${sbam} ${normal}
#delly2 filter -o ${pair_id}.delly_tra.bcf -f somatic -s samples.tsv ${pair_id}.delly_translocations.bcf #delly2 filter -o ${pair_id}.delly_tra.bcf -f somatic -s samples.tsv ${pair_id}.delly_translocations.bcf
else else
$tid=
#RUN DELLY #RUN DELLY
delly2 call -t BND -o ${pair_id}.delly_translocations.bcf -q 30 -g ${reffa} ${sbam} delly2 call -t BND -o ${pair_id}.delly_translocations.bcf -q 30 -g ${reffa} ${sbam}
delly2 call -t DUP -o ${pair_id}.delly_duplications.bcf -q 30 -g ${reffa} ${sbam} delly2 call -t DUP -o ${pair_id}.delly_duplications.bcf -q 30 -g ${reffa} ${sbam}
...@@ -94,6 +95,10 @@ then ...@@ -94,6 +95,10 @@ then
then then
zgrep '#CHROM' ${pair_id}.delly.vcf.gz > ${pair_id}.delly.genefusion.txt zgrep '#CHROM' ${pair_id}.delly.vcf.gz > ${pair_id}.delly.genefusion.txt
zcat ${pair_id}.delly.vcf.gz | $SNPEFF_HOME/scripts/vcfEffOnePerLine.pl |java -jar $SNPEFF_HOME/SnpSift.jar extractFields - CHROM POS CHR2 END ANN[*].EFFECT ANN[*].GENE ANN[*].BIOTYPE FILTER FORMAT GEN[*] |grep -E 'gene_fusion|feature_fusion' | sort -u >> ${pair_id}.delly.genefusion.txt zcat ${pair_id}.delly.vcf.gz | $SNPEFF_HOME/scripts/vcfEffOnePerLine.pl |java -jar $SNPEFF_HOME/SnpSift.jar extractFields - CHROM POS CHR2 END ANN[*].EFFECT ANN[*].GENE ANN[*].BIOTYPE FILTER FORMAT GEN[*] |grep -E 'gene_fusion|feature_fusion' | sort -u >> ${pair_id}.delly.genefusion.txt
mv ${pair_id}.delly.vcf.gz ${pair_id}.delly.ori.vcf.gz
bash $baseDir/filter_delly.pl -t $tid -p $pair_id -i ${pair_id}.delly.ori.vcf.gz
bgzip ${pair_id}.delly.vcf
cat ${pair_id}.potentialfusion.txt >> ${pair_id}.delly.genefusion.txt
fi fi
elif [[ $method == 'svaba' ]] elif [[ $method == 'svaba' ]]
then then
...@@ -116,6 +121,10 @@ then ...@@ -116,6 +121,10 @@ then
then then
zgrep '#CHROM' ${pair_id}.svaba.sv.vcf.gz > ${pair_id}.svaba.genefusion.txt zgrep '#CHROM' ${pair_id}.svaba.sv.vcf.gz > ${pair_id}.svaba.genefusion.txt
zcat ${pair_id}.svaba.sv.vcf.gz | $SNPEFF_HOME/scripts/vcfEffOnePerLine.pl |java -jar $SNPEFF_HOME/SnpSift.jar extractFields - CHROM POS ALT ID ANN[*].EFFECT ANN[*].GENE ANN[*].BIOTYPE FILTER FORMAT GEN[*] |grep -E 'gene_fusion|feature_fusion' | sort -u >> ${pair_id}.svaba.genefusion.txt zcat ${pair_id}.svaba.sv.vcf.gz | $SNPEFF_HOME/scripts/vcfEffOnePerLine.pl |java -jar $SNPEFF_HOME/SnpSift.jar extractFields - CHROM POS ALT ID ANN[*].EFFECT ANN[*].GENE ANN[*].BIOTYPE FILTER FORMAT GEN[*] |grep -E 'gene_fusion|feature_fusion' | sort -u >> ${pair_id}.svaba.genefusion.txt
mv ${pair_id}.svaba.vcf.gz ${pair_id}.svaba.ori.vcf.gz
bash $baseDir/filter_svaba.pl -t $tid -p $pair_id -i ${pair_id}.svaba.ori.vcf.gz
bgzip ${pair_id}.svaba.vcf
cat ${pair_id}.potentialfusion.txt >> ${pair_id}.svaba.genefusion.txt
fi fi
elif [[ $method == 'lumpy' ]] elif [[ $method == 'lumpy' ]]
then then
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment