...
 
Commits (25)
MIT License
Copyright (c) 2019 BICF / Astrocyte
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
......@@ -13,7 +13,7 @@ usage() {
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :r:b:c:n:p:d:h opt
while getopts :r:b:c:n:p:s:d:h opt
do
case $opt in
r) index_path=$OPTARG;;
......@@ -22,6 +22,7 @@ do
n) nuctype=$OPTARG;;
p) pair_id=$OPTARG;;
d) dedup=$OPTARG;;
s) skiplc=1;;
h) usage;;
esac
done
......@@ -52,15 +53,18 @@ fi
if [[ $nuctype == 'dna' ]]; then
module load bedtools/2.26.0 picard/2.10.3
samtools view -@ $SLURM_CPUS_ON_NODE -b -L ${bed} -o ${pair_id}.ontarget.bam ${sbam}
samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.ontarget.bam
samtools flagstat ${pair_id}.ontarget.bam > ${pair_id}.ontarget.flagstat.txt
if [[ -z $skiplc ]]
then
samtools view -@ $SLURM_CPUS_ON_NODE -b -L ${bed} -o ${pair_id}.ontarget.bam ${sbam}
samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.ontarget.bam
samtools flagstat ${pair_id}.ontarget.bam > ${pair_id}.ontarget.flagstat.txt
java -Xmx64g -jar $PICARD/picard.jar CollectAlignmentSummaryMetrics R=${index_path}/genome.fa I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt
java -Xmx64g -XX:ParallelGCThreads=$SLURM_CPUS_ON_NODE -jar $PICARD/picard.jar EstimateLibraryComplexity I=${sbam} OUTPUT=${pair_id}.libcomplex.txt
samtools view -@ $SLURM_CPUS_ON_NODE -b -q 1 ${sbam} | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt
samtools view -@ $SLURM_CPUS_ON_NODE ${sbam} | awk '{sum+=$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt
fi
java -Xmx64g -jar $PICARD/picard.jar CollectInsertSizeMetrics INPUT=${sbam} HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${index_path}/genome.fa OUTPUT=${pair_id}.hist.txt
java -Xmx64g -jar $PICARD/picard.jar CollectAlignmentSummaryMetrics R=${index_path}/genome.fa I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt
java -Xmx64g -jar $PICARD/picard.jar EstimateLibraryComplexity I=${sbam} OUTPUT=${pair_id}.libcomplex.txt
samtools view -@ $SLURM_CPUS_ON_NODE -b -q 1 ${sbam} | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt
bedtools coverage -sorted -g ${index_path}/genomefile.txt -a ${bed} -b ${sbam} -hist > ${pair_id}.covhist.txt
grep ^all ${pair_id}.covhist.txt > ${pair_id}.genomecov.txt
samtools view -@ $SLURM_CPUS_ON_NODE ${sbam} | awk '{sum+=$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt
perl $baseDir/calculate_depthcov.pl ${pair_id}.covhist.txt
fi
......@@ -14,12 +14,14 @@ while (my $line = <ENT>) {
$entrez{$row[2]} = $row[1];
}
}
open OM, "</project/shared/bicf_workflow_ref/human/GRCh38/clinseq_prj/utswv2_known_genefusions.txt" or die $!;
while (my $line = <OM>) {
chomp($line);
$known{$line} = 1;
}
close OM;
open OM, "</project/shared/bicf_workflow_ref/human/GRCh38/clinseq_prj/panel1410.genelist.txt" or die $!;
while (my $line = <OM>) {
chomp($line);
......@@ -43,20 +45,24 @@ foreach $efile (@exonfiles) {
push @rightexons, $exonnum;
}
}
$exonnuminfo{$dir}{leftgene} = join("-",$leftexons[0],$leftexons[-1]);
$exonnuminfo{$dir}{rightgene} = join("-",$rightexons[0],$rightexons[-1]);
if ($leftexons[0] eq $leftexons[-1]) {
$exonnuminfo{$dir}{leftgene} = $leftexons[0];
}else {
$exonnuminfo{$dir}{leftgene} = join("-",$leftexons[0],$leftexons[-1]);
}
if ($rightexons[0] eq $rightexons[-1]) {
$exonnuminfo{$dir}{rightgene} = $rightexons[0];
}else {
$exonnuminfo{$dir}{rightgene} = join("-",$rightexons[0],$rightexons[-1]);
}
}
open OUT, ">$opt{prefix}\.translocations.txt" or die $!;
open OAS, ">$opt{prefix}\.translocations.answer.txt" or die $!;
open OUTIR, ">$opt{prefix}\.cbioportal.genefusions.txt" or die $!;
print OUT join("\t","FusionName","LeftGene","RightGene","LefttBreakpoint",
"RightBreakpoint","LeftStrand","RightStrand","RNAReads",
"DNAReads"),"\n";
print OAS join("\t","FusionName","LeftGene","LefttBreakpoint","LeftGeneExons","LeftStrand",
"RightGene","RightBreakpoint","RightGeneExons","RightStrand",
"RNAReads","DNAReads","FusionType","Annot"),"\n";
"RNAReads","DNAReads","FusionType","Annot",'Filter','ChrType','ChrDistance'),"\n";
print OUTIR join("\t","Hugo_Symbol","Entrez_Gene_Id","Center","Tumor_Sample_Barcode",
"Fusion","DNA_support","RNA_support","Method","Frame"),"\n";
......@@ -75,6 +81,7 @@ while (my $line = <FUSION>) {
foreach my $i (0..$#row) {
$hash{$hline[$i]} = $row[$i];
}
my @filter;
my ($left_chr,$left_pos,$left_strand) = split(/:/,$hash{LeftBreakpoint});
my ($right_chr,$right_pos,$right_strand) = split(/:/,$hash{RightBreakpoint});
$hash{LeftBreakpoint} = join(":",$left_chr,$left_pos);
......@@ -83,7 +90,9 @@ while (my $line = <FUSION>) {
$hash{RightStrand} = $right_strand;
$hash{LeftGene} = (split(/\^/,$hash{LeftGene}))[0];
$hash{RightGene} = (split(/\^/,$hash{RightGene}))[0];
next unless ($keep{$hash{LeftGene}} || $keep{$hash{RightGene}});
unless ($keep{$hash{LeftGene}} || $keep{$hash{RightGene}}) {
push @filter, 'OutsideGeneList';
}
$hash{SumRNAReads} += $hash{JunctionReadCount}+$hash{SpanningFragCount};
my $fname = join("--",$hash{LeftGene},$hash{RightGene});
my $fname2 = join("--",sort {$a cmp $b} $hash{LeftGene},$hash{RightGene});
......@@ -95,20 +104,40 @@ while (my $line = <FUSION>) {
}
$hash{PROT_FUSION_TYPE} = 'in-frame' if ($hash{PROT_FUSION_TYPE} eq 'INFRAME');
my ($dna_support,$rna_support)=("no") x 2;
if ($known{$fname2} && ($hash{SumRNAReads} >= 3)|| ($hash{SumRNAReads} >= 5)) {
$rna_support = "yes";
print OUT join("\t",$fname,$hash{LeftGene},$hash{RightGene},
$hash{LeftBreakpoint},$hash{RightBreakpoint},$hash{LeftStrand},
$hash{RightStrand},$hash{SumRNAReads},0),"\n";
print OAS join("\t",$fname,$hash{LeftGene},$hash{LeftBreakpoint},$leftexon,$hash{LeftStrand},
$hash{RightGene},$hash{RightBreakpoint},$rightexon,$hash{RightStrand},
$hash{SumRNAReads},0,lc($hash{PROT_FUSION_TYPE}),$hash{annots}),"\n";
print OUTIR join("\t",$hash{LeftGene},$entrez{$hash{LeftGene}},"UTSW",$sname,$fname." fusion",
$dna_support,$rna_support,"STAR Fusion",lc($hash{PROT_FUSION_TYPE})),"\n";
print OUTIR join("\t",$hash{RightGene},$entrez{$hash{RightGene}},"UTSW",$sname,$fname." fusion",
$dna_support,$rna_support,"STAR Fusion",lc($hash{PROT_FUSION_TYPE})),"\n";
$hash{annots} =~ s/CHROMOSOMAL\[/CHROMOSOMAL /;
$hash{annots} =~ s/\]|\[|\"//g;
@annots = split(/,/,$hash{annots});
my ($chrom_type_dist) = grep(/CHROMOSOMAL/,@annots);
my ($chrtype,$chrdist) = split(/ /,$chrom_type_dist);
@annot2 = grep(!/CHROMOSOMAL/, @annots);
$fusion_annot = '';
if (scalar(@annot2) > 0) {
$fusion_annot = join(",",@annot2);
}
if ($known{$fname2} || $fusion_annot =~ m/CCLE|Cosmic|FA_CancerSupp|Klijn_CellLines|Mitelman|YOSHIHARA_TCGA|chimer/i) {
push @filter, 'LowReadCt' if ($hash{SumRNAReads} < 3);
}else {
push @filter, 'LowReadCt' if ($hash{SumRNAReads} < 5);
}
$rna_support = "yes";
if ($left_chr eq $right_chr) {
$diff = abs($right_pos-$left_pos);
push @filter, 'ReadThrough' if ($diff < 200000);
}
my $qc ='PASS';
if (scalar(@filter) > 0) {
$qc = join(";","FailedQC",@filter);
}
print OAS join("\t",$fname,$hash{LeftGene},$hash{LeftBreakpoint},$leftexon,$hash{LeftStrand},
$hash{RightGene},$hash{RightBreakpoint},$rightexon,$hash{RightStrand},
$hash{SumRNAReads},0,lc($hash{PROT_FUSION_TYPE}),$fusion_annot,$qc,$chrtype,$chrdist),"\n";
print OUTIR join("\t",$hash{LeftGene},$entrez{$hash{LeftGene}},"UTSW",$sname,$fname." fusion",
$dna_support,$rna_support,"STAR Fusion",lc($hash{PROT_FUSION_TYPE})),"\n";
print OUTIR join("\t",$hash{RightGene},$entrez{$hash{RightGene}},"UTSW",$sname,$fname." fusion",
$dna_support,$rna_support,"STAR Fusion",lc($hash{PROT_FUSION_TYPE})),"\n";
}
close OUT;
close OAS;
close OUTIR;
......@@ -6,15 +6,17 @@ use File::Basename;
my $results= GetOptions (\%opt,'fpkm|f=s','logcpm|l=s','cnv|c=s','prefix|p=s','help|h');
open ENT_ENS, "</project/shared/bicf_workflow_ref/human/gene_info.human.txt" or die $!;
open ENT_GENE, "</project/shared/bicf_workflow_ref/human/gene_info.human.txt" or die $!;
my %entrez;
my $ent_header = <ENT_ENS>;
while (my $line = <ENT_ENS>){
my %entgene;
my $ent_header = <ENT_GENE>;
while (my $line = <ENT_GENE>){
chomp $line;
my @row = split(/\t/, $line);
$entrez{$row[2]}=$row[1];
$entgene{$row[6]}{$row[2]}=$row[1];
#$entrez{$row[2]}=$row[1];
}
close ENT_ENS;
close ENT_GENE;
open ENT_ENS, "</project/shared/bicf_workflow_ref/human/GRCh38/genenames.txt" or die $!;
my $gn_header = <ENT_ENS>;
my %ensym;
......@@ -41,12 +43,14 @@ if($opt{fpkm}){
my $fpkm_header = <FPKM>;
while(my $line = <FPKM>){
chomp $line;
my ($id,$gene,$ref,$strand,$start,$end,$coverage,$fpkm,$tpm) = split(/\t/,$line);
my $entrezid=0;
my ($id,$gene,$chr,$strand,$start,$end,$coverage,$fpkm,$tpm) = split(/\t/,$line);
$chr =~ s/^chr//g;
my $ensembl = (split(/\./,$id))[0];
if ($entrez{$ensembl}) {
$entrezid = $entrez{$ensembl};
}else {
$entrezid = $entrez{$gene};
}elsif($entgene{$chr}{$gene}){
$entrezid = $entgene{$chr}{$gene};
}
next unless ($entrezid);
print OUTF join("\t",$entrezid,$fpkm),"\n";
......@@ -68,19 +72,24 @@ if($opt{logcpm}){
chomp($line);
my @row = split(/\t/,$line);
my $gene = $row[0];
my $chrom = (split(";",$row[1]))[0];
$chrom =~ s/^chr//g;
my $ct = $row[-1];
next if($gene =~ m/^__/);
$cts{$gene}{$sample} = $ct;
$cts{$chrom}{$gene} = $ct;
$total += $ct;
}
print $total."\n";
close IN;
foreach $ens (keys %cts) {
next unless $entrez{$ens};
unless ($cts{$ens}) {
$cts{$ens} = 0;
}
$cpm = ($cts{$ens}/$total)*1e6;
print OUTL join("\t",$entrez{$ens},sprintf("%.2f",log2($cpm))),"\n";
foreach my $ens_chr (keys %cts) {
foreach my $ens (keys $cts{$ens_chr}){
next unless $entgene{$ens_chr}{$ens};#($entrez{$ens} or $entgene{$gene}{$chrom});
#unless ($cts{$ens_chr}{$ens}){
# $cts{$ens_chr}{$ens} = 0;
#}
$cpm = ($cts{$ens_chr}{$ens}/$total)*1e6;
print OUTL join("\t",$entgene{$ens_chr}{$ens},sprintf("%.2f",log2($cpm))),"\n";
}
}
close OUTL;
}
......
......@@ -15,7 +15,7 @@ while (my $line = <SYM>) {
chomp($line);
my ($chrom,$start,$end,$ensembl,$symbol,$type) = split(/\t/,$line);
$ensembl = (split(/\./,$ensembl))[0];
$names{$ensembl} = {symbol=>$symbol,type=>$type};
$names{$ensembl} = {chr=>$chrom,symbol=>$symbol,type=>$type};
}
my @files = @ARGV;
......@@ -32,8 +32,13 @@ foreach $file (@files) {
while (my $line = <IN>) {
chomp($line);
my ($ensid,$gene,$chr,$strand,$start,$end,$cov,$fpkm,$tmp) = split(/\t/,$line);
my ($ens,$version) = split(/\./,$ensid);
$cts{$ens}{$sample} = $fpkm;
my $ens = $ensid;
$ens =~ s/\.\d+//;
if ($chr eq $names{$ens}{chr}) {
$cts{$ens}{$sample} = $fpkm;
}else {
warn "unable to map to genenames\n";
}
}
close IN;
}
......
......@@ -38,7 +38,9 @@ then
SLURM_CPUS_ON_NODE=64
fi
source /etc/profile.d/modules.sh
module load subread/1.6.1 stringtie/1.3.2d-intel
module load subread/1.6.1
export PATH=/project/shared/bicf_workflow_ref/seqprg/bin:$PATH
featureCounts -s $stranded -M --fraction -J --ignoreDup -T $SLURM_CPUS_ON_NODE -p -g gene_name -a ${gtf} -o ${pair_id}.cts ${sbam}
mkdir ${pair_id}_stringtie
cd ${pair_id}_stringtie
......
......@@ -26,10 +26,14 @@ source /etc/profile.d/modules.sh
module load bedtools/2.26.0 samtools/1.6 snpeff/4.3q
if [[ $index_path == '/project/shared/bicf_workflow_ref/human/GRCh38/hisat_index' ]]
then
index_path='/project/shared/bicf_workflow_ref/human/GRCh38'
elif [[ $index_path == '/project/shared/bicf_workflow_ref/GRCh38' ]]
then
index_path='/project/shared/bicf_workflow_ref/human/GRCh38'
fi
if [[ $index_path == '/project/shared/bicf_workflow_ref/human/GRCh38' ]]
then
tabix -f ${unionvcf}
......@@ -38,7 +42,7 @@ then
bcftools annotate -Oz -a ${index_path}/oncokb_hotspot.txt.gz -o ${pair_id}.oncohotspot.vcf.gz -h ${index_path}/oncokb_hotspot.header -c CHROM,FROM,TO,OncoKB_REF,OncoKB_ALT,Gene,OncoKB_ProteinChange,OncoKB_AF,OncoTree_Tissue,OncoTree_MainType,OncoTree_Code,OncoKBHotspot ${pair_id}.gnomad.vcf.gz
tabix ${pair_id}.oncohotspot.vcf.gz
bcftools annotate -Oz -a ${index_path}/repeat_regions.bed.gz -o ${pair_id}.repeat.vcf.gz --columns CHROM,FROM,TO,RepeatType -h /project/shared/bicf_workflow_ref/RepeatType.header ${pair_id}.oncohotspot.vcf.gz
java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-downstream -no-upstream -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.repeat.vcf.gz | java -jar $SNPEFF_HOME/SnpSift.jar annotate -id ${index_path}/dbSnp.vcf.gz - | java -jar $SNPEFF_HOME/SnpSift.jar annotate -info CLNSIG,CLNDSDB,CLNDSDBID,CLNDBN,CLNREVSTAT,CLNACC ${index_path}/clinvar.vcf.gz - | java -jar $SNPEFF_HOME/SnpSift.jar annotate -info CNT ${index_path}/cosmic.vcf.gz - | java -Xmx10g -jar $SNPEFF_HOME/SnpSift.jar dbnsfp -v -db ${index_path}/dbNSFP.txt.gz - | bgzip > ${pair_id}.annot.vcf.gz
java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-downstream -no-upstream -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.repeat.vcf.gz | java -jar $SNPEFF_HOME/SnpSift.jar annotate -id ${index_path}/dbSnp.vcf.gz - | java -jar $SNPEFF_HOME/SnpSift.jar annotate -info CLNSIG,CLNDSDB,CLNDSDBID,CLNDBN,CLNREVSTAT,CLNACC ${index_path}/clinvar.vcf.gz - | java -jar $SNPEFF_HOME/SnpSift.jar annotate -info LEGACY_ID,CNT ${index_path}/cosmic.vcf.gz - | java -Xmx10g -jar $SNPEFF_HOME/SnpSift.jar dbnsfp -v -db ${index_path}/dbNSFP.txt.gz - | bgzip > ${pair_id}.annot.vcf.gz
tabix ${pair_id}.annot.vcf.gz
else
if [[ $index_path == '/project/shared/bicf_workflow_ref/mouse/GRCm38' ]]
......
......@@ -11,12 +11,13 @@ usage() {
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :b:p:n:t:c:uh opt
while getopts :b:p:n:f:t:c:uh opt
do
case $opt in
b) sbam=$OPTARG;;
p) pair_id=$OPTARG;;
n) normals=$OPTARG;;
r) index_path=$OPTARG;;
t) targets=$OPTARG;;
c) capture=$OPTARG;;
u) umi='umi';;
......@@ -25,48 +26,43 @@ do
done
shift $(($OPTIND -1))
baseDir="`dirname \"$0\"`"
index_path='/project/shared/bicf_workflow_ref/human/GRCh38/clinseq_prj'
if [[ -z $index_path ]]
then
index_path='/project/shared/bicf_workflow_ref/human/GRCh38'
fi
# Check for mandatory options
if [[ -z $pair_id ]] || [[ -z $sbam ]]; then
if [[ -z $pair_id ]] || [[ -z $sbam ]]
then
usage
fi
if [[ -z $SLURM_CPUS_ON_NODE ]]
then
SLURM_CPUS_ON_NODE=1
fi
baseDir="`dirname \"$0\"`"
if [[ $capture == "${index_path}/UTSWV2.bed" ]]
then
normals="${index_path}/UTSWV2.normals.cnn"
targets="${index_path}/UTSWV2.cnvkit_"
if [[ $umi == 'umi' ]]
then
normals="${index_path}/UTSWV2.uminormals.cnn"
fi
elif [[ $capture == "${index_path}/UTSWV2_2.panelplus.bed" ]]
then
normals="${index_path}/panelofnormals.panel1385V2_2.cnn"
targets="${index_path}/panel1385V2-2.cnvkit_"
elif [[ $capture == "${index_path}/hemepanelV3.bed" ]]
if [[ -z $normals ]] || [[ -z $targets ]]
then
normals="${index_path}/hemepanelV3.flatreference.cnn"
targets="${index_path}/hemepanelV3.cnvkit_"
usage
fi
echo "${targets}targets.bed"
echo "${targets}antitargets.bed"
echo "${normals}"
source /etc/profile.d/modules.sh
module load cnvkit/0.9.5 bedtools/2.26.0
module load cnvkit/0.9.5 bedtools/2.26.0 samtools/gcc/1.8 bcftools/gcc/1.8
unset DISPLAY
#samtools index ${sbam}
#bcftools mpileup --threads 10 -a 'INFO/AD,INFO/ADF,INFO/ADR,FORMAT/DP,FORMAT/SP,FORMAT/AD,FORMAT/ADF,FORMAT/ADR' -Ou -Q20 -d 99999 -C50 -f ${reffa} -t ${index_path}/NGSCheckMate.bed ${sbam} | bcftools call --threads 10 -vmO v -o common_variants.vcf
cnvkit.py coverage ${sbam} ${targets}targets.bed -o ${pair_id}.targetcoverage.cnn
cnvkit.py coverage ${sbam} ${targets}antitargets.bed -o ${pair_id}.antitargetcoverage.cnn
cnvkit.py fix ${pair_id}.targetcoverage.cnn ${pair_id}.antitargetcoverage.cnn ${normals} -o ${pair_id}.cnr
cnvkit.py segment ${pair_id}.cnr -o ${pair_id}.cns
cnvkit.py call ${pair_id}.cns -o ${pair_id}.call.cns
cnvkit.py call --filter cn ${pair_id}.cns -o ${pair_id}.call.cns
#cnvkit.py call --filter cn ${pair_id}.cns -v common_variants.vcf -o ${pair_id}.ballelecall.cns
cnvkit.py scatter ${pair_id}.cnr -s ${pair_id}.call.cns -t --segment-color "blue" -o ${pair_id}.cnv.scatter.pdf
cut -f 1,2,3 ${pair_id}.call.cns | grep -v chrom | bedtools intersect -wao -b /project/shared/bicf_workflow_ref/human/GRCh38/cytoBand.txt -a stdin |cut -f 1,2,3,7 > ${pair_id}.cytoband.bed
perl $baseDir/filter_cnvkit.pl *.call.cns
perl $baseDir/filter_cnvkit.pl ${pair_id}.call.cns
#!/usr/bin/perl -w
#integrate_datasets.pl
#module load vcftools/0.1.14 samtools/1.6 bedtools/2.26.0
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
my %opt = ();
my $results = GetOptions (\%opt,'td|d=s','indel|i=s','sv|s=s','tumor|t=s');
my @files = grep(/vcf.gz/,values %opt);
foreach $file (@files) {
chomp($file);
open IN, "gunzip -c $file |" or die $!;
my $outfile = $file;
$outfile =~ s/vcf.*/pass.vcf/;
my @gtheader;
open OUT, ">$outfile" or die $!;
W1:while (my $line = <IN>) {
chomp($line);
my $format = 'GT:MAF:AO';
if ($line =~ m/^#/) {
if ($line =~ m/^#CHROM/) {
print OUT "##FORMAT=<ID=AO,Number=A,Type=Integer,Description=\"Alternate allele observation count\">\n";
print OUT "##FORMAT=<ID=MAF,Number=1,Type=Integer,Description=\"Mutation Allele Frequency\">\n";
print OUT "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n";
my @header = split(/\t/,$line);
($chrom, $pos,$id,$ref,$alt,$score,
$filter,$info) = split(/\t/, $line);
print OUT join("\t",$chrom, $pos,$id,$ref,$alt,$score,
$filter,$info,'FORMAT',$opt{tumor}),"\n";
}else {
print OUT $line,"\n";
}
next;
}
my ($chrom, $pos,$id,$ref,$alt,$score,
$filter,$annot) = split(/\t/, $line);
next if ($ref =~ m/\./ || $alt =~ m/\./ || $alt=~ m/,X/);
my %hash = ();
foreach $a (split(/;/,$annot)) {
my ($key,$val) = split(/=/,$a);
$hash{$key} = $val unless ($hash{$key});
}
next unless ($hash{ANN});
next unless ($hash{ANN} =~ m/HIGH|MODERATE|LOW/);
my %gtinfo;
foreach (split(/,/,$hash{DP2})) {
$gtinfo{AO} += $_;
}
$gtinfo{MAF} = $hash{VAF};
$hash{AF} = $hash{VAF};
next if $gtinfo{AO} < 10;
next if ($hash{VAF} < 0.01);
$gt = '0/0';
$gt = '0/1' if ($hash{VAF} > 0.3);
$gt = '1/1' if ($hash{VAF} > 0.8);
my $gtinfo = join(":",$gt,$gtinfo{MAF},$gtinfo{AO});
my @nannot;
foreach $info (sort {$a cmp $b} keys %hash) {
if (defined $hash{$info}) {
push @nannot, $info."=".$hash{$info};
}else {
push @nannot, $info;
}
}
my $newannot = join(";",@nannot);
print OUT join("\t",$chrom, $pos,$id,$ref,$alt,$score,$filter,$newannot,
$format,$gtinfo),"\n";
}
close IN;
}
......@@ -6,7 +6,6 @@ use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
my %opt = ();
my $results = GetOptions (\%opt,'td|d=s','indel|i=s','sv|s=s','tumor|t=s');
my @files = grep(/vcf.gz/,values %opt);
foreach $file (@files) {
chomp($file);
......@@ -18,8 +17,8 @@ foreach $file (@files) {
W1:while (my $line = <IN>) {
chomp($line);
if ($line =~ m/^#/) {
print OUT $line,"\n";
if ($line =~ m/^#CHROM/) {
print OUT "##INFO=<ID=AF,Number=A,Type=Integer,Description=\"Alternate allele observation frequency\">\n";
my @header = split(/\t/,$line);
($chrom, $pos,$id,$ref,$alt,$score,
$filter,$info,$format,@gtheader) = split(/\t/, $line);
......@@ -32,6 +31,7 @@ foreach $file (@files) {
}
}
}
print OUT $line,"\n";
next;
}
my ($chrom, $pos,$id,$ref,$alt,$score,
......@@ -53,7 +53,7 @@ foreach $file (@files) {
my @mutallfreq = ();
foreach my $k (0..$#ainfo) {
$gtinfo{$subjid}{$deschead[$k]} = $ainfo[$k];
$hash{$deschead[$k]} = $ainfo[$k] if ($subjid eq $opt{tumor});
#$hash{$deschead[$k]} = $ainfo[$k] if ($subjid eq $opt{tumor});
}
$gtinfo{$subjid}{DP} = (split(/,/,$gtinfo{$subjid}{DP}))[0] if ($gtinfo{$subjid}{DP});
next F1 unless ($gtinfo{$subjid}{DP} && $gtinfo{$subjid}{DP} ne '.' && $gtinfo{$subjid}{DP} >= 1);
......@@ -108,7 +108,7 @@ foreach $file (@files) {
}
}
my $newannot = join(";",@nannot);
print OUT join("\t",$chrom, $pos,$id,$ref,$alt,$score,$filter,$newannot,
print OUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$newannot,
$format,@gts),"\n";
}
close IN;
......
......@@ -58,7 +58,8 @@ fi
source /etc/profile.d/modules.sh
module load gatk/4.1.2.0 samtools/gcc/1.8
samtools index -@ $SLURM_CPUS_ON_NODE ${sbam}
which samtools
/cm/shared/apps/samtools/gcc/1.8/bin/samtools index -@ $SLURM_CPUS_ON_NODE ${sbam}
if [[ $algo == 'gatkbam_rna' ]]
then
......@@ -70,12 +71,12 @@ then
gatk SplitNCigarReads -R ${reffa} -I ${pair_id}.rg_added_sorted.bam -O ${pair_id}.split.bam
gatk --java-options "-Xmx32g" BaseRecalibrator -I ${pair_id}.split.bam --known-sites ${index_path}/dbSnp.gatk4.vcf.gz -R ${reffa} -O ${pair_id}.recal_data.table --use-original-qualities
gatk --java-options "-Xmx32g" ApplyBQSR -I ${pair_id}.split.bam -R ${reffa} -O ${pair_id}.final.bam --use-original-qualities -bqsr ${pair_id}.recal_data.table
samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.final.bam
/cm/shared/apps/samtools/gcc/1.8/bin/samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.final.bam
elif [[ $algo == 'gatkbam' ]]
then
gatk --java-options "-Xmx32g" BaseRecalibrator -I ${sbam} --known-sites ${index_path}/dbSnp.gatk4.vcf.gz -R ${reffa} -O ${pair_id}.recal_data.table --use-original-qualities
gatk --java-options "-Xmx32g" ApplyBQSR -I ${sbam} -R ${reffa} -O ${pair_id}.final.bam --use-original-qualities -bqsr ${pair_id}.recal_data.table
samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.final.bam
/cm/shared/apps/samtools/gcc/1.8/bin/samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.final.bam
elif [[ $algo == 'abra2' ]]
then
......
......@@ -84,7 +84,7 @@ then
vcf-concat fb.*.vcf | vcf-sort | vcf-annotate -n --fill-type | bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.freebayes.vcf.gz -
elif [[ $algo == 'gatk' ]]
then
gatk4_dbsnp=${index_path}/clinseq_prj/dbSnp.gatk4.vcf.gz
gatk4_dbsnp=/project/shared/bicf_workflow_ref/human/GRCh38/clinseq_prj/dbSnp.gatk4.vcf.gz
user=$USER
module load gatk/4.1.2.0
gvcflist=''
......
#!/bin/bash
#svcalling.sh
usage() {
echo "-h Help documentation for gatkrunner.sh"
echo "-r --Path to Reference Genome with the file genome.fa"
echo "-p --Prefix for output file name"
echo "Example: bash svcalling.sh -p prefix -r /path/GRCh38 -a gatk"
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :r:b:l:p:h opt
do
case $opt in
r) index_path=$OPTARG;;
b) sbam=$OPTARG;;
p) pair_id=$OPTARG;;
l) itdbed=$OPTARG;;
h) usage;;
esac
done
function join_by { local IFS="$1"; shift; echo "$*"; }
shift $(($OPTIND -1))
baseDir="`dirname \"$0\"`"
# Check for mandatory options
if [[ -z $pair_id ]] || [[ -z $index_path ]]; then
usage
fi
if [[ -z $SLURM_CPUS_ON_NODE ]]
then
SLURM_CPUS_ON_NODE=1
fi
if [[ -a "${index_path}/genome.fa" ]]
then
reffa="${index_path}/genome.fa"
dict="${index_path}/genome.dict"
else
echo "Missing Fasta File: ${index_path}/genome.fa"
usage
fi
source /etc/profile.d/modules.sh
module load samtools/gcc/1.8 snpeff/4.3q vcftools/0.1.14 htslib/gcc/1.8 bcftools/gcc/1.8 bedtools/2.26.0
stexe=`which samtools`
samtools view -@ $SLURM_CPUS_ON_NODE -L ${itdbed} ${sbam} | /project/shared/bicf_workflow_ref/seqprg/itdseek-1.2/itdseek.pl --refseq ${reffa} --samtools ${stexe} --bam ${sbam} | vcf-sort | bedtools intersect -header -b ${itdbed} -a stdin | bgzip > ${pair_id}.itdseek.vcf.gz
tabix ${pair_id}.itdseek.vcf.gz
bcftools norm --fasta-ref $reffa -m - -Ov ${pair_id}.itdseek.vcf.gz | java -Xmx30g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 - |bgzip > ${pair_id}.itdseek_tandemdup.vcf.gz
......@@ -26,8 +26,18 @@ baseDir="`dirname \"$0\"`"
source /etc/profile.d/modules.sh
module load bedtools/2.26.0 samtools/1.6 bcftools/1.6 snpeff/4.3q
if [[ -a "${index_path}/genome.fa" ]]
then
reffa="${index_path}/genome.fa"
dict="${index_path}/genome.dict"
else
echo "Missing Fasta File: ${index_path}/genome.fa"
usage
fi
perl $baseDir\/uniform_vcf_gt.pl $pair_id $vcf
bgzip -f ${pair_id}.uniform.vcf
j=${pair_id}.uniform.vcf.gz
tabix -f $j
bcftools norm -m - -Oz $j -o ${pair_id}.norm.vcf.gz
bcftools norm --fasta-ref $reffa -m - -Oz $j -o ${pair_id}.norm.vcf.gz
......@@ -75,16 +75,14 @@ while (my $line = <VCF>) {
push @newgts, join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO});
}
next if ($missingGT == scalar(@gts));
if ($hash{SVTYPE} eq 'DUP:TANDEM') {
print DUP join("\t",$chrom,$pos,$id,'N','<DUP>',$score,$filter,$annot,$newformat,@newgts),"\n";
}elsif ($hash{SVTYPE} eq 'INS') {
print SV join("\t",$chrom,$pos,$id,'N','<INS>',$score,$filter,$annot,$newformat,@newgts),"\n";
print DUP join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n";
}elsif ($hash{SVTYPE} eq 'DEL' || $hash{SVTYPE} eq 'INS') {
if (abs($hash{SVLEN}) < 50) {
print SI join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n";
}else {
$newalt = "<".$hash{SVTYPE}.">";
$newalt = "<".$hash{SVTYPE}.">";
print SV join("\t",$chrom,$pos,$id,'N',$newalt,$score,$filter,$annot,$newformat,@newgts),"\n";
}
}
......
......@@ -9,11 +9,12 @@ usage() {
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :r:p:h opt
while getopts :r:l:p:h opt
do
case $opt in
r) index_path=$OPTARG;;
p) pair_id=$OPTARG;;
l) idtbed=$OPTARG;;
h) usage;;
esac
done
......@@ -46,19 +47,19 @@ source /etc/profile.d/modules.sh
genomefiledate=`find ${reffa} -maxdepth 0 -printf "%TY%Tm%Td\n"`
module load samtools/1.6 pindel/0.2.5-intel snpeff/4.3q
module load samtools/1.6 pindel/0.2.5-intel snpeff/4.3q bedtools/2.26.0
touch ${pair_id}.pindel.config
for i in *.bam; do
sname="${i%.bam}"
sname=`echo "$i" |cut -f 1 -d '.'`
echo -e "${i}\t400\t${sname}" >> ${pair_id}.pindel.config
done
pindel -T $SLURM_CPUS_ON_NODE -f ${reffa} -i ${pair_id}.pindel.config -o ${pair_id}.pindel_out --RP
pindel2vcf -P ${pair_id}.pindel_out -r ${reffa} -R HG38 -d ${genomefiledate} -v pindel.vcf
cat pindel.vcf | java -jar $SNPEFF_HOME/SnpSift.jar filter " ( GEN[*].AD[1] >= 10 )" | bgzip > pindel.vcf.gz
perl $baseDir/parse_pindel.pl ${pair_id} pindel.vcf.gz
java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.indel.vcf |bgzip > indel.vcf.gz
perl $baseDir/norm_annot.sh -r ${index_path} -p pindel_indel -v indel.vcf.gz
mv pindel_indel.norm.vcf.gz ${pair_id}.pindel_indel.vcf.gz
java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.dup.vcf |bgzip > ${pair_id}.pindel_tandemdup.vcf.gz
java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.sv.vcf |bgzip > ${pair_id}.pindel_sv.vcf.gz
cat pindel.vcf | java -jar $SNPEFF_HOME/SnpSift.jar filter "( GEN[*].AD[1] >= 10 )" | bgzip > pindel.vcf.gz
tabix pindel.vcf.gz
bash $baseDir/norm_annot.sh -r ${index_path} -p pindel -v pindel.vcf.gz
perl $baseDir/parse_pindel.pl ${pair_id} pindel.norm.vcf.gz
java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.indel.vcf |bgzip > ${pair_id}.pindel_indel.vcf.gz
java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.dup.vcf | bedtools intersect -header -b ${idtbed} -a stdin | bgzip > ${pair_id}.pindel_tandemdup.vcf.gz
java -Xmx10g -jar $SNPEFF_HOME/snpEff.jar -no-intergenic -lof -c $SNPEFF_HOME/snpEff.config GRCh38.86 ${pair_id}.sv.vcf | bgzip > ${pair_id}.pindel_sv.vcf.gz
......@@ -15,7 +15,6 @@ do
case $opt in
r) index_path=$OPTARG;;
p) pair_id=$OPTARG;;
v) vcf=$OPTARG;;
h) usage;;
esac
......@@ -24,6 +23,16 @@ function join_by { local IFS="$1"; shift; echo "$*"; }
shift $(($OPTIND -1))
baseDir="`dirname \"$0\"`"
if [[ -a "${index_path}/genome.fa" ]]
then
reffa="${index_path}/genome.fa"
dict="${index_path}/genome.dict"
else
echo "Missing Fasta File: ${index_path}/genome.fa"
usage
fi
source /etc/profile.d/modules.sh
module load bedtools/2.26.0 samtools/1.6 bcftools/1.6 snpeff/4.3q
......@@ -32,7 +41,7 @@ mv ${vcf} ${pair_id}.ori.vcf.gz
bgzip -f ${pair_id}.uniform.vcf
j=${pair_id}.uniform.vcf.gz
tabix -f $j
bcftools norm -m - -Oz $j -o ${pair_id}.norm.vcf.gz
bcftools norm --fasta-ref $reffa -m - -Oz $j -o ${pair_id}.norm.vcf.gz
bash $baseDir/annotvcf.sh -p ${pair_id} -r $index_path -v ${pair_id}.norm.vcf.gz
/project/shared/bicf_workflow_ref/seqprg/vt/vt decompose_blocksub ${pair_id}.annot.vcf.gz -p -a -o ${pair_id}.vcf
bgzip -f ${pair_id}.vcf
......@@ -59,6 +59,16 @@ while (my $line = <VCF>) {
foreach (split(',',$gtdata{AO})) {
$gtdata{DP} += $_;
}
} elsif ($gtdata{TIR}) {
$gtdata{GT} = '0/0';
$gtdata{AO} = (split(/,/,$gtdata{TIR}))[0];
$gtdata{RO} = $gtdata{DP} - $gtdata{AO};
$gtdata{AD} = join(',',$gtdata{RO},$gtdata{AO});
} elsif ($gtdata{$ref."U"} && $gtdata{$alt."U"}) {
$gtdata{GT} = '0/0';
$gtdata{AO} = (split(/,/,$gtdata{$alt."U"}))[0];
$gtdata{RO} = (split(/,/,$gtdata{$ref."U"}))[0];
$gtdata{AD} = join(',',$gtdata{RO},$gtdata{AO});
}
if ($gtdata{DP} && $gtdata{DP} < 5) {
$missingGT ++;
......