Skip to content
Snippets Groups Projects
Commit 2b3b3cfe authored by Brandi Cantarel's avatar Brandi Cantarel
Browse files

adding cnv and umicode

parent 11a5dfe5
Branches
Tags
No related merge requests found
......@@ -48,10 +48,10 @@ if [[ $nuctype == 'dna' ]]; then
samtools flagstat ${pair_id}.ontarget.bam > ${pair_id}.ontarget.flagstat.txt
samtools view -b -q 1 ${pair_id}.ontarget.bam | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt
samtools view -b -F 1024 ${pair_id}.ontarget.bam | bedtools coverage -sorted -g ${index_path}/genomefile.txt -a ${bed} -b stdin -hist | grep ^all > ${pair_id}.dedupcov.txt
java -Xmx32g -jar $PICARD/picard.jar CollectAlignmentSummaryMetrics R=${index_path}/genome.fa I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt
java -Xmx64g -jar $PICARD/picard.jar CollectAlignmentSummaryMetrics R=${index_path}/genome.fa I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt
java -Xmx64g -jar $PICARD/picard.jar EstimateLibraryComplexity I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.libcomplex.txt
samtools view -F 1024 ${pair_id}.ontarget.bam | awk '{sum+=$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt
java -Xmx4g -jar $PICARD/picard.jar CollectInsertSizeMetrics INPUT=${sbam} HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${index_path}/genome.fa OUTPUT=${pair_id}.hist.txt
java -Xmx64g -jar $PICARD/picard.jar CollectInsertSizeMetrics INPUT=${sbam} HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${index_path}/genome.fa OUTPUT=${pair_id}.hist.txt
samtools view -b -q 1 ${pair_id}.ontarget.bam | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt
bedtools coverage -sorted -g ${index_path}/genomefile.txt -a ${bed} -b ${sbam} -hist > ${pair_id}.covhist.txt
perl $baseDir/calculate_depthcov.pl ${pair_id}.covhist.txt
......
......@@ -36,7 +36,7 @@ then
SLURM_CPUS_ON_NODE=1
fi
module load bwakit/0.7.15 bwa/intel/0.7.15 samtools/1.6 bcftools/1.6 htslib/1.6 picard/2.10.3
module load bwakit/0.7.15 bwa/intel/0.7.15 samtools/1.6 picard/2.10.3
baseDir="`dirname \"$0\"`"
......
......@@ -48,10 +48,10 @@ then
samtools markdup -s --output-fmt BAM -@ $SLURM_CPUS_ON_NODE sort.bam ${pair_id}.dedup.bam
elif [ $algo == 'picard' ]
then
java -Djava.io.tmpdir=./ -Xmx4g -jar $PICARD/picard.jar MarkDuplicates I=${sbam} O=${pair_id}.dedup.bam M=${pair_id}.dedup.stat.txt
java -Djava.io.tmpdir=./ -Xmx16g -jar $PICARD/picard.jar MarkDuplicates I=${sbam} O=${pair_id}.dedup.bam M=${pair_id}.dedup.stat.txt
elif [ $algo == 'picard_umi' ]
then
java -Djava.io.tmpdir=./ -Xmx4g -jar $PICARD/picard.jar MarkDuplicates BARCODE_TAG=RX I=${sbam} O=${pair_id}.dedup.bam M=${pair_id}.dedup.stat.txt
java -Djava.io.tmpdir=./ -Xmx16g -jar $PICARD/picard.jar MarkDuplicates BARCODE_TAG=RX I=${sbam} O=${pair_id}.dedup.bam M=${pair_id}.dedup.stat.txt
elif [ $algo == 'fgbio_umi' ]
then
module load fgbio bwa/intel/0.7.15
......
......@@ -9,11 +9,12 @@ usage() {
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :b:p:h opt
while getopts :b:p:uh opt
do
case $opt in
b) sbam=$OPTARG;;
p) pair_id=$OPTARG;;
u) umi='umi';;
h) usage;;
esac
done
......@@ -32,7 +33,14 @@ fi
module load cnvkit/0.9.0
cnvkit.py coverage ${sbam} /project/shared/bicf_workflow_ref/GRCh38/UTSWV2.cnvkit_targets.bed -o ${pair_id}.targetcoverage.cnn
cnvkit.py coverage ${sbam} /project/shared/bicf_workflow_ref/GRCh38/UTSWV2.cnvkit_antitargets.bed -o ${pair_id}.antitargetcoverage.cnn
if [[ $umi == 'umi' ]]
then
cnvkit.py fix ${pair_id}.targetcoverage.cnn ${pair_id}.antitargetcoverage.cnn /project/shared/bicf_workflow_ref/GRCh38/UTSWV2.uminormals.cnn -o ${pair_id}.cnr
else
cnvkit.py fix ${pair_id}.targetcoverage.cnn ${pair_id}.antitargetcoverage.cnn /project/shared/bicf_workflow_ref/GRCh38/UTSWV2.normals.cnn -o ${pair_id}.cnr
fi
cnvkit.py segment ${pair_id}.cnr -o ${pair_id}.cns
cnvkit.py call ${pair_id}.cns -o ${pair_id}.call.cns
cnvkit.py diagram ${pair_id}.cnr -s ${pair_id}.cns -o ${pair_id}.cnv.pdf
perl $baseDir/filter_cnvkit.pl *.call.cns
#!/usr/bin/perl -w
#parse_cnvkit_table.pl
my $refdir = '/project/shared/bicf_workflow_ref/GRCh38/';
open OM, "<$refdir\/panel1385.genelist.txt" or die $!;
while (my $line = <OM>) {
chomp($line);
$keep{$line} = 1;
}
#my @keep = ("AKT2","ATM","AURKA","BAP1","BCL2L1","BCL6","BIRC3","BRCA2","CCND1","CCNE1","CD79B","CDK8","CDKN2A","CDKN2B","CEBPA","EGFR","ERBB2","FGF10","FGF14","FGF19","FGF3","FGF4","FLT1","FLT3","FOXP1","GATA6","GNA13","GNAS","IKZF1","IL7R","IRS2","KDM4C","KLHL6","KMT2A","KRAS","LYN","MCL1","MITF","MYC","NOTCH2","PIK3CA","PRKAR1A","PRKDC","PTEN","RB1","RICTOR","RUNX1T1","SDHA","TFDP1","TP53","ZNF217","CDK4","MDM4","MYCN","BTG2","BCL2L2","RAF1");
#my %keep = map {$_ => 1} @keep;
my @cvncalls = @ARGV;
foreach my $file (@ARGV) {
open IN, "<$file" or die $!;
my $out = $file;
$out =~ s/call.cns/cnvcalls.txt/;
open OUT, ">$out" or die $!;
print OUT join("\t","Gene","Chromosome","Start","End","Abberation Type","CN","Score"),"\n";
my $header = <IN>;
while (my $line = <IN>) {
chomp($line);
my ($chr,$start,$end,$geneids,$log2,$cn,$depth,
$probes,$weight) = split(/\t/,$line);
my %genes;
my @ids = split(/;|,/,$geneids);
foreach my $gid (@ids) {
my ($key,$value) = split(/=/,$gid);
if ($key eq 'ensembl_gn' || $key eq 'identifier') {
$genes{$value} = 1 if $keep{$value};
}
}
my $newgeneids = join(";", keys %genes);
my $len = sprintf("%.1f",($end-$start)/1000);
next if ($cn == 2) || scalar(keys %genes) < 1;
my $abtype = 'amplification';
$abtype = 'loss' if ($cn < 2);
print OUT join("\t",$newgeneids,$chr,$start,$end,$abtype,$cn,$weight),"\n";
}
close IN;
close OUT;
}
......@@ -100,4 +100,17 @@ then
vcf-sort platypus.vcf |vcf-annotate -n --fill-type -n |bgzip > platypus.vcf.gz
tabix platypus.vcf.gz
bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.platypus.vcf.gz platypus.vcf.gz
elif [[ $algo == 'strelka2' ]]
then
module load strelka/2.8.3 samtools/1.6 manta/1.2.0 snpeff/4.3q vcftools/0.1.14
mkdir manta strelka
gvcflist=''
for i in *.bam; do
gvcflist="$gvcflist --bam ${i}"
done
configManta.py $gvcflist --referenceFasta ${reffa} --exome --runDir manta
manta/runWorkflow.py -m local -j 8
configureStrelkaGermlineWorkflow.py $gvcflist --referenceFasta ${reffa} --targeted --indelCandidates manta/results/variants/candidateSmallIndels.vcf.gz --runDir strelka
strelka/runWorkflow.py -m local -j 8
bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.strelka2.vcf.gz strelka/results/variants/variants.vcf.gz
fi
......@@ -53,7 +53,8 @@ if [ $algo == 'strelka2' ]
/cm/shared/apps/manta/1.2.0/bin/configManta.py --normalBam ${normal}.final.bam --tumorBam ${tumor}.final.bam --referenceFasta ${genome_reference} --runDir ${manta_analysisPath}
${manta_analysisPath}/runWorkflow.py -m local -j 8
/cm/shared/apps/strelka/2.8.3/bin/configureStrelkaSomaticWorkflow.py --normalBam ${normal}.final.bam --tumorBam ${tumor}.final.bam --referenceFasta ${genome_reference} --targeted --indelCandidates ${manta_analysisPath}/results/variants/candidateSmallIndels.vcf.gz --runDir ${strelka_analysisPath}
${strelka_analysisPath}/runWorkflow.py -m local -j 8
${strelka_analysisPath}/runWorkflow.py -m local -j 8
bcftools norm -c s -f ${reffa} -w 10 -O z -o ${pair_id}.strelka2.vcf.gz strelka/results/variants/variants.vcf.gz
fi
if [ $algo == 'virmid' ]
......
......@@ -82,13 +82,12 @@ if [ $algo == 'strelka2' ]
then
module load strelka/2.8.3 samtools/1.6 manta/1.2.0 snpeff/4.3q vcftools/0.1.14
mkdir manta strelka
configManta.py --normalBam ${normal} --tumorBam ${tumor} --referenceFasta ${reffa} --runDir manta
configManta.py --normalBam ${mnormal} --tumorBam ${mtumor} --referenceFasta ${reffa} --runDir manta
manta/runWorkflow.py -m local -j 8
configureStrelkaSomaticWorkflow.py --normalBam ${mnormal} --tumorBam ${mtumor} --referenceFasta ${reffa} --targeted --indelCandidates manta/results/variants/candidateSmallIndels.vcf.gz --runDir strelka
configureStrelkaSomaticWorkflow.py --normalBam ${normal} --tumorBam ${tumor} --referenceFasta ${reffa} --targeted --indelCandidates manta/results/variants/candidateSmallIndels.vcf.gz --runDir strelka
strelka/runWorkflow.py -m local -j 8
vcf-concat strelka/results/variants/*.vcf.gz | vcf-annotate -n --fill-type -n |vcf-sort |java -jar $SNPEFF_HOME/SnpSift.jar filter "((FILTER = 'PASS') & (GEN[*].DP >= 10))" | perl -pe "s/TUMOR/${tid}/g" | perl -pe "s/NORMAL/${nid}/g" |bgzip > ${pair_id}.strelka.vcf.gz
vcf-concat strelka/results/variants/*.vcf.gz | vcf-annotate -n --fill-type -n |vcf-sort |java -jar $SNPEFF_HOME/SnpSift.jar filter "(GEN[*].DP >= 10)" | perl -pe "s/TUMOR/${tid}/g" | perl -pe "s/NORMAL/${nid}/g" |bgzip > ${pair_id}.strelka2.vcf.gz
fi
if [ $algo == 'virmid' ]
then
module load snpeff/4.3q virmid/1.2 samtools/1.6 vcftools/0.1.14
......@@ -108,7 +107,12 @@ fi
if [ $algo == 'mutect2' ]
then
module load parallel gatk/3.7 snpeff/4.3q samtools/1.6 vcftools/0.1.14
cut -f 1 ${index_path}/genomefile.5M.txt | parallel --delay 2 -j 10 "java -Xmx20g -jar \$GATK_JAR -R ${reffa} -D ${dbsnp} -T MuTect2 -stand_call_conf 10 -A FisherStrand -A QualByDepth -A VariantType -A DepthPerAlleleBySample -A HaplotypeScore -A AlleleBalance -I:tumor ${tumor} -I:normal ${normal} --cosmic ${cosmic} -o ${tid}.{}.mutect.vcf -L {}"
if [ -z ${tbed} ]
then
cut -f 1 ${index_path}/genomefile.5M.txt | parallel --delay 2 -j 10 "java -Xmx20g -jar \$GATK_JAR -R ${reffa} -D ${dbsnp} -T MuTect2 -stand_call_conf 10 -A FisherStrand -A QualByDepth -A VariantType -A DepthPerAlleleBySample -A HaplotypeScore -A AlleleBalance -I:tumor ${tumor} -I:normal ${normal} --cosmic ${cosmic} -o ${tid}.{}.mutect.vcf -L {}"
else
awk '{print $1":"$2"-"$3}' ${tbed} | parallel --delay 2 -j 10 "java -Xmx20g -jar \$GATK_JAR -R ${reffa} -D ${dbsnp} -T MuTect2 -stand_call_conf 10 -A FisherStrand -A QualByDepth -A VariantType -A DepthPerAlleleBySample -A HaplotypeScore -A AlleleBalance -I:tumor ${tumor} -I:normal ${normal} --cosmic ${cosmic} -o ${tid}.{}.mutect.vcf -L {}"
fi
vcf-concat ${tid}*mutect.vcf | vcf-sort | vcf-annotate -n --fill-type | java -jar \$SNPEFF_HOME/SnpSift.jar filter -p '((FS <= 60) & GEN[*].DP >= 10)' | perl -pe "s/TUMOR/${tid}/" | perl -pe "s/NORMAL/${nid}/g" |bgzip > ${pair_id}.pmutect.vcf.gz
fi
......
......@@ -31,8 +31,8 @@ calllist=''
for i in *.vcf.gz; do
if [[ $i == $HS ]]
then
bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a $i -b stdin |bgzip > hotspot.nooverlap.vcf.gz
list2="$list2 hotspot.nooverlap.vcf.gz"
bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a $i -b stdin |bgzip > nooverlap.hotspot.vcf.gz
list2="$list2 nooverlap.hotspot.vcf.gz"
fi
done
......
......@@ -7,99 +7,102 @@ my @vcffiles = @ARGV;
open HEADER, "<$headerfile" or die $!;
open OUT, ">int.vcf" or die $!;
while (my $line = <HEADER>) {
chomp($line);
print OUT $line,"\n";;
chomp($line);
print OUT $line,"\n";;
}
close HEADER;
my @sampleorder;
my %headerlines;
foreach $vcf (@vcffiles) {
$caller = (split(/\./,$vcf))[-3];
open VCF, "gunzip -c $vcf|" or die $!;
my @sampleids;
while (my $line = <VCF>) {
chomp($line);
if ($line =~ m/#/) {
if ($line =~ m/#CHROM/) {
($chromhd, $posd,$idhd,$refhd,$althd,$scorehd,
$filterhd,$annothd,$formathd,@sampleids) = split(/\t/, $line);
foreach $j (0..$#sampleids) {
$sampleids[$j] =~ s/\.final//g;
}
unless (@sampleorder) {
@sampleorder = @sampleids;
print OUT $line,"\n";
}
next;
}
next;
$caller = (split(/\./,$vcf))[-3];
open VCF, "gunzip -c $vcf|" or die $!;
my @sampleids;
while (my $line = <VCF>) {
chomp($line);
if ($line =~ m/#/) {
if ($line =~ m/#CHROM/) {
($chromhd, $posd,$idhd,$refhd,$althd,$scorehd,
$filterhd,$annothd,$formathd,@sampleids) = split(/\t/, $line);
foreach $j (0..$#sampleids) {
$sampleids[$j] =~ s/\.final//g;
}
my ($chrom, $pos,$id,$ref,$alt,$score,
$filter,$annot,$format,@gts) = split(/\t/, $line);
my %hash = ();
foreach $a (split(/;/,$annot)) {
my ($key,$val) = split(/=/,$a);
$hash{$key} = $val;
unless (@sampleorder) {
@sampleorder = @sampleids;
print OUT $line,"\n";
}
my @deschead = split(/:/,$format);
my $newformat = 'GT:DP:AD:AO:RO';
my %newgts;
my $missingGT = 0;
FG:foreach my $i (0..$#gts) {
my $allele_info = $gts[$i];
my $sid = $sampleids[$i];
my @gtinfo = split(/:/,$allele_info);
my %gtdata;
if ($allele_info eq '.') {
$newgts{$sid} = '.:.:.:.:.';
$missingGT ++;
next FG;
}
foreach my $i (0..$#deschead) {
$gtdata{$deschead[$i]} = $gtinfo[$i];
}
if ($gtdata{GT} eq './.') {
$newgts{$sid} = '.:.:.:.:.';
$missingGT ++;
next FG;
}
if ($gtdata{AD}){
($gtdata{RO},@alts) = split(/,/,$gtdata{AD});
$gtdata{AO} = join(",",@alts);
$gtdata{DP} = $gtdata{RO};
foreach (@alts) {
$gtdata{DP} += $_;
}
} elsif (exists $gtdata{NR} && exists $gtdata{NV}) {
$gtdata{DP} = $gtdata{NR};
$gtdata{AO} = $gtdata{NV};
$gtdata{RO} = $gtdata{DP} - $gtdata{AO};
$gtdata{AD} = join(",",$gtdata{RO},$gtdata{AO})
} elsif (exists $gtdata{AO} && exists $gtdata{RO}) {
$gtdata{AD} = join(',',$gtdata{RO},$gtdata{AO});
$gtdata{DP} = $gtdata{RO};
foreach (split(',',$gtdata{AO})) {
$gtdata{DP} += $_;
}
}
if ($gtdata{DP} && $gtdata{DP} < 3) {
$missingGT ++;
}
$newgts{$sid} = join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO});
next;
}
next;
}
my ($chrom, $pos,$id,$ref,$alt,$score,
$filter,$annot,$format,@gts) = split(/\t/, $line);
if ($pos == 48303944) {
warn "Debugging\n";
}
my %hash = ();
foreach $a (split(/;/,$annot)) {
my ($key,$val) = split(/=/,$a);
$hash{$key} = $val;
}
my @deschead = split(/:/,$format);
my $newformat = 'GT:DP:AD:AO:RO';
my %newgts;
my $missingGT = 0;
FG:foreach my $i (0..$#gts) {
my $allele_info = $gts[$i];
my $sid = $sampleids[$i];
my @gtinfo = split(/:/,$allele_info);
my %gtdata;
if ($allele_info eq '.') {
$newgts{$sid} = '.:.:.:.:.';
$missingGT ++;
next FG;
}
next if ($missingGT == scalar(@gts));
my @newgts;
foreach $id (@sampleorder) {
push @newgts, $newgts{$id};
foreach my $i (0..$#deschead) {
$gtdata{$deschead[$i]} = $gtinfo[$i];
}
if ($gtdata{GT} eq './.') {
$newgts{$sid} = '.:.:.:.:.';
$missingGT ++;
next FG;
}
if ($gtdata{AD}){
($gtdata{RO},@alts) = split(/,/,$gtdata{AD});
$gtdata{AO} = join(",",@alts);
$gtdata{DP} = $gtdata{RO};
foreach (@alts) {
$gtdata{DP} += $_;
}
} elsif (exists $gtdata{NR} && exists $gtdata{NV}) {
$gtdata{DP} = $gtdata{NR};
$gtdata{AO} = $gtdata{NV};
$gtdata{RO} = $gtdata{DP} - $gtdata{AO};
$gtdata{AD} = join(",",$gtdata{RO},$gtdata{AO})
} elsif (exists $gtdata{AO} && exists $gtdata{RO}) {
$gtdata{AD} = join(',',$gtdata{RO},$gtdata{AO});
$gtdata{DP} = $gtdata{RO};
foreach (split(',',$gtdata{AO})) {
$gtdata{DP} += $_;
}
$lines{$chrom}{$pos}{$caller} = join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n";
}
if ($gtdata{DP} && $gtdata{DP} < 3) {
$missingGT ++;
}
$newgts{$sid} = join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO});
}
next if ($missingGT == scalar(@gts));
my @newgts;
foreach $id (@sampleorder) {
push @newgts, $newgts{$id};
}
close VCF;
$lines{$chrom}{$pos}{$caller} = join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n";
}
close VCF;
}
my @callers = ('ssvar','platypus','sam','gatk','hotspot');
my @callers = ('ssvar','platypus','sam','gatk','strelka2','hotspot');
if (grep(/mutect/,@vcffiles)) {
@callers = ('sssom','pmutect','shimmer','stelka','varscan','virmid');
@callers = ('sssom','pmutect','shimmer','strelka2','varscan','virmid');
}
F1:foreach $chr (sort {$a cmp $b} keys %lines) {
F2:foreach $pos (sort {$a <=> $b} keys %{$lines{$chr}}) {
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment