Skip to content
Snippets Groups Projects
Commit aa745ec0 authored by Brandi Cantarel's avatar Brandi Cantarel
Browse files

update testing for SN/SP GM12878

parent 8053c8d2
Branches
Tags publish_0.4.5
No related merge requests found
Pipeline #5983 passed with stages
in 18 hours, 6 minutes, and 35 seconds
......@@ -179,7 +179,7 @@ process seqqc {
mv ${pair_id}.covhist.txt ${pair_id}.covuniqhist.txt
mv ${pair_id}_lowcoverage.txt ${pair_id}_lowcoverageuniq.txt
mv ${pair_id}_exoncoverage.txt ${pair_id}_exoncoverageuniq.txt
bash $baseDir/process_scripts/alignment/bamqc.sh -c $capture_bed -n dna -r $index_path -b $sbam -p $pair_id
bash $baseDir/process_scripts/alignment/bamqc.sh -c $capture_bed -n dna -r $index_path -b ori.bam -p $pair_id
"""
}
......@@ -239,7 +239,7 @@ process pindel {
file("${subjid}.pindel_indel.pass.vcf.gz") into pindelvcf
file("${subjid}.dna.genefusion.txt") into gf
when:
params.callsvs="detect"
params.callsvs=="detect"
script:
"""
source /etc/profile.d/modules.sh
......
Subproject commit 6c0e8739e4ece19807bfd97d1a988afd78630b0a
Subproject commit 274f44a39c2429435ac94aa664f6fe68a68ee405
......@@ -4,47 +4,43 @@
my %tp;
my %fp;
my %fn;
my $input = shift @ARGV;
my $fn = $input;
$fn =~ s/eval.vcf/utswcoding.multiinter.bed.gz/;
my $fn = 'fn.vcf';
my $run = $input;
$run =~ s/\.eval.vcf//;
my @allcallers = ('gatk','sam','ssvar','platypus');
foreach ((@allcallers,'union','genomeseer')) {
my @allcallers = ('strelka2','fb','platypus','gatk','genomeseer','union');
foreach ((@allcallers)) {
$fn{'snp'}{$_} = 0;
$fn{'indel'}{$_} = 0;
}
open FP, ">fp.vcf" or die $!;
open FN, "gunzip -c $fn |" or die $!;
open FN, "<$fn" or die $!;
while (my $line = <FN>) {
chomp($line);
my ($chr,$pos,$end,$calls) = split(/\t/,$line);
next if ($calls =~ m/union/);
next unless ($calls =~ m/platinum/ && $calls =~ m/giab/);
my ($chr,$pos,$id,$ref,$alt,@others) = split(/\t/,$line);
$vartype = 'snp';
$size = $end-$pos;
if ($size > 1) {
if (length($ref) > 1 || length($alt) > 1) {
$vartype = 'indel';
}
next if ($chr eq 'chr6' && $pos >= 28510120 && $pos <= 33480577);
$fn{$vartype}{'union'} ++;
$fn{$vartype}{'genomeseer'} ++;
print $line,"\n";
foreach (@allcallers) {
$fn{$vartype}{$_} ++;
}
}
my %done;
my %reason;
open IN, "<$input" or die $!;
while (my $line = <IN>) {
chomp($line);
if ($line =~ m/^#/) {
print FP $line,"\n";
next;
}
my ($chrom, $pos,$id,$ref,$alt,$score,
$filter,$annot,$format,@gts) = split(/\t/, $line);
next if ($ref =~ m/\./ || $alt =~ m/\./ || $alt=~ m/,X/);
next if ($chrom eq 'chr6' && $pos >= 28510120 && $pos <= 33480577);
my %hash = ();
foreach $a (split(/;/,$annot)) {
my ($key,$val) = split(/=/,$a);
......@@ -52,10 +48,7 @@ while (my $line = <IN>) {
}
my $cosmicsubj = 0;
@cosmicct = split(/,/,$hash{CNT}) if $hash{CNT};
foreach $val (@cosmicct) {
$cosmicsubj += $val if ($val =~ m/^\d+$/);
}
$cosmicsubj = $cosmicct[0];
my @deschead = split(/:/,$format);
my $allele_info = shift @gts;
@ainfo = split(/:/, $allele_info);
......@@ -72,59 +65,164 @@ while (my $line = <IN>) {
my $dp = $altct+$refct;
my $maf = sprintf("%.4f",$altct/$dp);
my $vartype = 'snp';
if ($hash{TYPE} ne 'snp') {
next if $maf < 0.1;
if (length($alt) == length($ref) && length($ref) > 1) {
$vartype = 'indel';
}
next if ($hash{FS} && $hash{FS} > 60);
foreach $ant (split(/,/,$alt)) {
$vartype = 'indel' if (length($ant) != length($ref));
}
if ($hash{REFREP} && $hash{REFREP} =~ m/^\d$/) {
$numreps= (split(/,/,$hash{REFREP}))[0];
next if $numreps > 1;
}
my @callers;
if ($hash{CallSet} && $hash{CallSet} =~ m/\// || $hash{SomaticCallSet} && $hash{SomaticCallSet} =~ m/\//) {
my @callinfo ;
@callinfo = split(/|/, $hash{CallSet}) if ($hash{CallSet});
if ($hash{SomaticCallSet}) {
@callinfo = split(/|/, $hash{SomaticCallSet}) if ($hash{SomaticCallSet});
}
foreach $cinfo (@callinfo) {
my ($caller, $alt, @samafinfo) = split(/\//,$cinfo);
push @callers, $caller;
}
$hash{CallSet} = join(",",@callinfo);
} elsif ($hash{CallSet} && $hash{CallSet} =~ m/\|/ || $hash{SomaticCallSet} && $hash{SomaticCallSet} =~ m/\|/) {
my @callinfo ;
@callinfo = split(/,/, $hash{CallSet}) if ($hash{CallSet});
if ($hash{SomaticCallSet}) {
@callinfo = split(/,/, $hash{SomaticCallSet}) if ($hash{SomaticCallSet});
}
foreach $cinfo (@callinfo) {
my ($caller, $alt, @samafinfo) = split(/\|/,$cinfo);
push @callers, $caller;
}
$hash{CallSet} = join(",",@callinfo);
} else {
if ($hash{CallSet}) {
@callers = (@callers, split(/\,/, $hash{CallSet}));
}
if ($hash{SomaticCallSet}) {
@callers = (@callers, split(/\,/, $hash{SomaticCallSet}));
}
}
$is_gs = 1;
$is_gs = 0 if ($hash{CallSet} eq 'hotspot' && $maf > 0.1);
if (($id =~ m/COS/) && $cosmicsubj >= 5) {
$is_gs = 0 if ($dp < 20 || $altct < 3);
$is_gs = 0 if ($maf < 0.01 && $vartype eq 'snp');
$is_gs = 0 if ($maf < 0.03 && $vartype eq 'indel');
}else {
$is_gs = 0 unless ($hash{CallSet} =~ m/,/);
$is_gs = 0 if ($dp < 20 || $altct < 8);
$is_gs = 0 if ($maf < 0.05 && $vartype eq 'snp');
$is_gs = 0 if ($maf < 0.1 && $vartype eq 'indel');
}
%chash = map {$_=>1} split(/,/,$hash{CallSet});
if ($hash{PlatRef} =~ m/giab|platinum/) {
if ($is_gs){
$tp{$vartype}{'genomeseer'} ++;
}elsif($is_gs && $hash{PlatRef} =~ m/giab/ && $hash{PlatRef} =~ m/platinum/) {
$fn{$vartype}{'genomeseer'} ++;
if ($maf < 0.05 || ($maf < 0.10 && $vartype ne 'snp')) {
$reason{$chrom}{$pos}{$ref}{$alt} = 'LowAF';
$is_gs = 0;
}elsif ($hash{CallSetInconsistent} && $vartype ne 'snp') {
$reason{$chrom}{$pos}{$ref}{$alt} = 'Inconsistent';
$is_gs = 0;
}elsif ($hash{RepeatType} && $hash{RepeatType} =~ m/Simple_repeat/ && $maf < 0.15) {
$reason{$chrom}{$pos}{$ref}{$alt} = 'InRepeat';
$is_gs = 0;
} elsif ($hash{strandBias} || (($hash{SAP} && $hash{SAP} > 20) && ((exists $hash{SAF} && $hash{SAF}< 1) || (exists $hash{SAR} && $hash{SAR}< 1)))) {
$reason{$chrom}{$pos}{$ref}{$alt} = 'StrandBias';
$is_gs = 0;
} elsif ($id =~ m/COS/ && $cosmicsubj >= 5) {
if ($dp < 20 || $altct < 3) {
$reason{$chrom}{$pos}{$ref}{$alt} = 'LowCoverage';
$is_gs = 0;
}
} elsif (scalar(@callers) < 2) {
$reason{$chrom}{$pos}{$ref}{$alt} = 'OneCaller';
$is_gs = 0;
} elsif ($dp < 20 || $altct < 8) {
$reason{$chrom}{$pos}{$ref}{$alt} = 'LowCoverage';
$is_gs = 0;
}
my %sinfo;
%chash = map {$_=>1} @callers;
if ($is_gs) {
$chash{'genomeseer'} = 1;
}
$chash{'union'} = 1;
$vartype{$chrom}{$pos}{$ref}{$alt} = $vartype;
$refpos = 0;
if ($hash{platforms}) {
$refpos ++;
}if ($hash{MTD}) {
$refpos ++;
}
my $chrompos= join(":",$chrom,$pos,$ref,$alt);
my $callers = join(":",keys %chash);
$status{$chrompos}{$callers} = $refpos;
}
my %cat;
foreach $loci (keys %status) {
my @csets = keys %{$status{$loci}};
my ($chrom,$pos,$ref,$alt) = split(/:/,$loci);
my $trueset = 0;
my $genomeseer = 0;
my %callers;
foreach $cset (@csets) {
$repos = $status{$loci}{$cset};
@callers = split(/:/,$cset);
if ($trueset) {
$trueset = $repos if ($repos > $trueset);
}else {
$trueset = $repos;
}
foreach (@callers) {
$callers{$_} = 1;
}
$tp{$vartype}{'union'} ++;
}
if ($trueset > 0) {
foreach (@allcallers) {
if ($chash{$_}){
$tp{$vartype}{$_} ++;
}elsif($hash{PlatRef} =~ m/giab/ && $hash{PlatRef} =~ m/platinum/) {
$fn{$vartype}{$_} ++;
if ($callers{$_}){
$cat{$chrom}{$pos}{$ref}{$alt}{$_} = 'tp';
} else {
$cat{$chrom}{$pos}{$ref}{$alt}{$_} = 'fn' unless ($trueset < 2);
print join(":","FN",$chrom,$pos,$ref,$alt,$reason{$chrom}{$pos}{$ref}{$alt}),"\n" if ($_ eq 'genomeseer');
}
}
}else {
if ($is_gs){
$fp{$vartype}{'genomeseer'} ++;
print FP $line,"\n";
foreach (@allcallers) {
if ($callers{$_}){
$cat{$chrom}{$pos}{$ref}{$alt}{$_} = 'fp';
print join(":","FP",$chrom,$pos,$ref,$alt,$vartype{$chrom}{$pos}{$ref}{$alt}),"\n" if ($_ eq 'genomeseer');
}
}
$fp{$vartype}{'union'} ++;
foreach $caller (keys %chash) {
$fp{$vartype}{$caller} ++;
}
}
foreach $chrom (keys %cat) {
foreach $pos (keys %{$cat{$chrom}}) {
foreach $ref (keys %{$cat{$chrom}{$pos}}){
foreach $alt (keys %{$cat{$chrom}{$pos}{$ref}}){
my $vartype = $vartype{$chrom}{$pos}{$ref}{$alt};
foreach $call (keys %{$cat{$chrom}{$pos}{$ref}{$alt}}) {
if ($cat{$chrom}{$pos}{$ref}{$alt}{$call} eq 'tp') {
$tp{$vartype}{$call} ++;
}elsif ($cat{$chrom}{$pos}{$ref}{$alt}{$call} eq 'fp') {
$fp{$vartype}{$call} ++;
}elsif ($cat{$chrom}{$pos}{$ref}{$alt}{$call} eq 'fn') {
$fn{$vartype}{$call} ++;
}
}
}
}
}
}
foreach ((@allcallers,'genomeseer','union')) {
open OUT, ">$run\.snsp.txt" or die $!;
print OUT join("\t",'SampleID','Method','SNV TP','SNV FP','SNV FN','InDel TP','InDel FP','InDel FN',
'SNV SN','InDel SN','Combined SN','SNV PPV','InDel PPV','Combined PPV'),"\n";
foreach (@allcallers) {
$fp{'snp'}{$_} = 0 unless $fp{'snp'}{$_};
$fp{'indel'}{$_} = 0 unless $fp{'indel'}{$_};
$fn{'indel'}{$_} = 0 unless $fn{'indel'}{$_};
next unless ($tp{'snp'}{$_});
next unless ($tp{'indel'}{$_});
$sn_snp = sprintf("%.1f",100*$tp{'snp'}{$_}/($tp{'snp'}{$_}+$fn{'snp'}{$_}));
$sn_indel = sprintf("%.1f",100*$tp{'indel'}{$_}/($tp{'indel'}{$_}+$fn{'indel'}{$_}));
$sp = sprintf("%.1f",100*($tp{'snp'}{$_}+$tp{'indel'}{$_})/($tp{'snp'}{$_}+$fp{'snp'}{$_}+$tp{'indel'}{$_}+$fp{'indel'}{$_}));
print join("\t",$run,$_,$tp{'snp'}{$_},$fp{'snp'}{$_},$fn{'snp'}{$_},
$tp{'indel'}{$_},$fp{'indel'}{$_},$fn{'indel'}{$_},
$sn_snp,$sn_indel,$sp),"\n";
$sn_total = sprintf("%.1f",100*($tp{'snp'}{$_}+$tp{'indel'}{$_})/($tp{'snp'}{$_}+$fn{'snp'}{$_}+$tp{'indel'}{$_}+$fn{'indel'}{$_}));
$sp_total = sprintf("%.1f",100*($tp{'snp'}{$_}+$tp{'indel'}{$_})/($tp{'snp'}{$_}+$fp{'snp'}{$_}+$tp{'indel'}{$_}+$fp{'indel'}{$_}));
$sp_snp = sprintf("%.1f",100*$tp{'snp'}{$_}/($tp{'snp'}{$_}+$fp{'snp'}{$_}));
$sp_indel = sprintf("%.1f",100*$tp{'indel'}{$_}/($tp{'indel'}{$_}+$fp{'indel'}{$_}));
print OUT join("\t",$run,$_,$tp{'snp'}{$_},$fp{'snp'}{$_},$fn{'snp'}{$_},
$tp{'indel'}{$_},$fp{'indel'}{$_},$fn{'indel'}{$_},
$sn_snp,$sn_indel,$sn_total,$sp_snp,$sp_indel,$sp_total),"\n";
}
close OUT;
#!/bin/bash
#snsp.sh
module load samtools/1.6 bedtools/2.26.0
ID=$1
source /etc/profile.d/modules.sh
module load samtools/gcc/1.8 htslib/gcc/1.8 bedtools/2.26.0 vcftools/0.1.14 bcftools/gcc/1.8
#!/bin/bash
#snsp.sh
usage() {
echo "-h Help documentation for gatkrunner.sh"
echo "-t --Target Panel Bed File"
echo "-p --SampleID"
echo "-b --BAM File"
echo "-v --VCF File"
echo "Example: bash snsp.sh -p prefix -b bamfile -t targetpanel -v VCF"
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :t:r:v:s:p:b:h opt
do
case $opt in
p) id=$OPTARG;;
t) tpanel=$OPTARG;;
r) index_path=$OPTARG;;
b) bam=$OPTARG;;
v) vcf=$OPTARG;;
h) usage;;
esac
done
shift $(($OPTIND -1))
baseDir="`dirname \"$0\"`"
perl ${baseDir}/filter_giab.pl $ID
bedtools intersect -header -a ${ID}.PASS.vcf -b /project/shared/bicf_workflow_ref/GRCh38/utswv2_cds.bed | bedtools intersect -v -header -a stdin -b /project/shared/bicf_workflow_ref/GRCh38/HLA_HG38.bed | bedtools intersect -header -a stdin -b /project/shared/bicf_workflow_ref/GRCh38/giab3_platinum2.hg38.highConf.bed | bgzip > ${ID}.t.vcf.gz
bcftools reheader -h /project/shared/bicf_workflow_ref/GRCh38/UTSWV2.vcfheader.txt -o ${ID}.utswcoding.vcf.gz ${ID}.t.vcf.gz
tabix ${ID}.utswcoding.vcf.gz
bedtools multiinter -i ${ID}.utswcoding.vcf.gz /project/shared/bicf_workflow_ref/GRCh38/giab3.utswcoding.vcf.gz /project/shared/bicf_workflow_ref/GRCh38/platinum_v2.utswcoding.vcf.gz -names union giab platinum |cut -f 1,2,3,5 | bedtools sort -i stdin | bedtools merge -c 4 -o distinct | bgzip > ${ID}.utswcoding.multiinter.bed.gz
tabix ${ID}.utswcoding.multiinter.bed.gz
bcftools annotate -a ${ID}.utswcoding.multiinter.bed.gz --columns CHROM,FROM,TO,PlatRef -h /project/shared/bicf_workflow_ref/PlatRef.header ${ID}.utswcoding.vcf.gz > ${ID}.eval.vcf
perl /project/PHG/PHG_Clinical/clinseq_workflows/scripts/calc_snsp.pl ${ID}.eval.vcf
module load bedtools/2.29.0 htslib/gcc/1.8 vcftools/0.1.14
outnf='output'
if [[ -f "${outnf}/GM12878/GM12878.annot.vcf.gz" ]]
then
vcf="GM12878.annot.vcf.gz"
bam="GM12878/GM12878.dedup.bam"
cd ${outnf}/GM12878
cut -f 1,2,3 ${index_path}/../gencode.CDS.bed |grep ^chr > gencode.cds.bed
bedtools intersect -header -a ${index_path}/giab3.vcf.gz -b ${tpanel} | bedtools intersect -v -header -a stdin -b ${index_path}/HLA_HG38.bed | bedtools intersect -header -a stdin -b ${index_path}/giab3_platinum2.hg38.highConf.bed | bedtools intersect -header -a stdin -b gencode.cds.bed | vcf-sort | uniq | /project/shared/bicf_workflow_ref/seqprg/vt/vt decompose_blocksub - -a -o giab3.utswcoding.vcf
bgzip giab3.utswcoding.vcf
tabix giab3.utswcoding.vcf.gz
bedtools intersect -header -a ${index_path}/platinum_v2.vcf.gz -b ${tpanel} | bedtools intersect -v -header -a stdin -b ${index_path}/HLA_HG38.bed | bedtools intersect -header -a stdin -b ${index_path}/giab3_platinum2.hg38.highConf.bed | bedtools intersect -header -a stdin -b gencode.cds.bed | vcf-sort | uniq | /project/shared/bicf_workflow_ref/seqprg/vt/vt decompose_blocksub - -a -o platinum_v2.utswcoding.vcf
bgzip platinum_v2.utswcoding.vcf
tabix platinum_v2.utswcoding.vcf.gz
bedtools intersect -header -a ${vcf} -b ${tpanel} | bedtools intersect -v -header -a stdin -b ${index_path}/HLA_HG38.bed | bedtools intersect -header -a stdin -b ${index_path}/giab3_platinum2.hg38.highConf.bed | bedtools intersect -header -a stdin -b gencode.cds.bed | vcf-sort |uniq | bgzip > ${id}.utswcoding.vcf.gz
tabix ${id}.utswcoding.vcf.gz
bcftools annotate -Oz -a giab3.utswcoding.vcf.gz --columns CHROM,POS,REF,ALT,platforms -o ${id}.g.vcf.gz ${id}.utswcoding.vcf.gz
tabix ${id}.g.vcf.gz
bcftools annotate -Ov -a platinum_v2.utswcoding.vcf.gz --columns CHROM,POS,REF,ALT,MTD -o ${id}.eval.vcf ${id}.g.vcf.gz
bedtools genomecov -bga -split -ibam ${bam} |awk '$4 > 19' | cut -f 1,2,3 | bedtools merge -i stdin > enoughcov.bed
bedtools intersect -header -a giab3.utswcoding.vcf.gz -b platinum_v2.utswcoding.vcf.gz |bedtools intersect -header -v -a stdin -b ${id}.utswcoding.vcf.gz | bedtools intersect -a stdin -b enoughcov.bed > fn.vcf
perl $baseDir/calc_snsp.pl ${id}.eval.vcf
fi
......@@ -12,7 +12,7 @@ def test_output_VCF():
assert os.path.exists(os.path.join(test_output_path, 'GM12878/GM12878.annot.vcf.gz'))
annot_vcf = test_output_path + 'GM12878/GM12878.annot.vcf.gz'
annot_output = gzip.open(annot_vcf).readlines()
assert 'FILTER=<ID=PASS,Description="All filters passed">' in str(annot_output[1])
@pytest.mark.output
def test_output_Coverage_Histogram():
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment