Skip to content
Snippets Groups Projects
Commit 28fc369f authored by Brandi Cantarel's avatar Brandi Cantarel
Browse files

updating union

parent d58e9577
Branches
No related merge requests found
#!/bin/bash
#union.sh
usage() {
echo "-h Help documentation for gatkrunner.sh"
echo "-r --Reference Genome: GRCh38 or GRCm38"
echo "-p --Prefix for output file name"
echo "Example: bash hisat.sh -p prefix -r /path/GRCh38"
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :r:p:h opt
do
case $opt in
r) index_path=$OPTARG;;
p) pair_id=$OPTARG;;
h) usage;;
esac
done
function join_by { local IFS="$1"; shift; echo "$*"; }
shift $(($OPTIND -1))
baseDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
module load gatk/3.7 python/2.7.x-anaconda bedtools/2.26.0 snpeff/4.3q samtools/1.6 vcftools/0.1.14
HS=*.hotspot.vcf.gz
list1=`ls *vcf.gz|grep -v hotspot`
list2=`ls *vcf.gz|grep -v hotspot`
varlist=''
calllist=''
for i in *.vcf.gz; do
EXT="${i#*.}"
CALL="${EXT%%.*}"
calllist="$calllist $CALL"
tabix $i
if [[ $i == $HS ]]
then
bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a $i -b stdin |bgzip > hotspot.nooverlap.vcf.gz
tabix hotspot.nooverlap.vcf.gz
list2="$list2 hotspot.nooverlap.vcf.gz"
varlist="$varlist --variant:$CALL hotspot.nooverlap.vcf.gz"
else
varlist="$varlist --variant:$CALL $i"
fi
done
bedtools multiinter -i $list2 -names $calllist | cut -f 1,2,3,5 | bedtools sort -i stdin | bedtools merge -c 4 -o distinct > ${pair_id}_integrate.bed
priority='ssvar'
if [[ *.platypus.vcf.gz ]]
then
priority="$priority,platypus"
fi
priority="$priority,sam,gatk"
if [[ -f *.hotspot.vcf.gz ]]
then
priority="$priority,hotspot"
fi
java -Xmx32g -jar $GATK_JAR -R ${index_path}/genome.fa -T CombineVariants --filteredrecordsmergetype KEEP_UNCONDITIONAL $varlist -genotypeMergeOptions PRIORITIZE -priority $priority -o ${pair_id}.int.vcf
perl $baseDir/uniform_integrated_vcf.pl ${pair_id}.int.vcf
bgzip ${pair_id}_integrate.bed
tabix ${pair_id}_integrate.bed.gz
bgzip ${pair_id}.uniform.vcf
tabix ${pair_id}.uniform.vcf.gz
bcftools annotate -a ${pair_id}_integrate.bed.gz --columns CHROM,FROM,TO,CallSet -h ${index_path}/CallSet.header ${pair_id}.uniform.vcf.gz | bgzip > ${pair_id}.union.vcf.gz
#!/usr/bin/perl -w
#check_designfile.pl
my $pe = shift @ARGV;
my $dfile = shift @ARGV;
open OUT, ">design.valid.txt" or die $!;
#open OUT2, ">tochannel.csv" or die $!;
open DFILE, "<$dfile" or die $!;
my $head = <DFILE>;
chomp($head);
$head =~ s/FullPathTo//g;
my @colnames = split(/\t/,$head);
my %newcols = map {$_=> 1} @colnames;
unless (grep(/FqR1/,@colnames)) {
die "Missing Sequence Files in Design File: FqR1\n";
}
unless (grep(/SampleID/,@colnames)) {
die "Missing SampleID in Design File\n";
}
if ($pe eq 'pe') {
unless (grep(/FqR2/,@colnames)) {
die "Missing Sequence Files in Design File: FqR2\n";
}
}else {
delete $newcols{FqR2};
}
my @cols = sort {$a cmp $b} keys %newcols;
print OUT join("\t","SampleID","FqR1","FqR2"),"\n";
my @grp = ('a','b');
my $lnct = 0;
while (my $line = <DFILE>) {
chomp($line);
$line =~ s/FullPathTo//g;
my @row = split(/\t/,$line);
my %hash;
foreach my $i (0..$#row) {
next unless ($newcols{$colnames[$i]});
$hash{$colnames[$i]} = $row[$i];
}
if ($hash{SampleID} =~ m/^\d/) {
$hash{SampleID} =~ s/^/S/;
}
$hash{SampleName} = $hash{SampleID} unless ($hash{SampleName});
$hash{SubjectID} = $hash{SampleID} unless ($hash{SubjectID});
unless ($hash{SampleGroup}) {
$j = $lnct %% 2;
$hash{SampleGroup} = $grp[$j];
}
$hash{SampleGroup} =~ s/_/./g;
my @line;
foreach $f (@cols) {
push @line, $hash{$f};
}
print OUT join("\t",@line),"\n";
$hash{FqR2} = 'na' unless ($hash{FqR2});
print join(",",$hash{SampleID},$hash{FqR1},$hash{FqR2}),"\n";
$lnct ++;
}
......@@ -21,7 +21,7 @@ do
done
function join_by { local IFS="$1"; shift; echo "$*"; }
shift $(($OPTIND -1))
module load python/2.7.x-anaconda bedtools/2.26.0 snpeff/4.3q
module load python/2.7.x-anaconda bedtools/2.26.0 samtools/1.6 snpeff/4.3q
if [[ $index_path == '/project/shared/bicf_workflow_ref/GRCh38' ]]
then
tabix ${unionvcf}
......
......@@ -63,8 +63,9 @@ if [[ $algo == 'gatkbam_rna' ]]
then
module load picard/2.10.3
java -Xmx4g -jar $PICARD/picard.jar CleanSam INPUT=${sbam} OUTPUT=${pair_id}.clean.bam
java -Xmx4g -jar $PICARD/picard.jar ReorderSam I=${pair_id}.clean.bam O=${pair_id}.sort.bam R=${reffa} CREATE_INDEX=TRUE
#java -Xmx4g -jar $PICARD/picard.jar AddOrReplaceReadGroups INPUT=${pair_id}.clean.bam O=${pair_id}.rg_added_sorted.bam SO=coordinate RGID=${pair_id} RGLB=tx RGPL=illumina RGPU=barcode RGSM=${pair_id}
samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.clean.bam
#samtools index -@ $SLURM_CPUS_ON_NODE ${pair_id}.clean.bam
java -Xmx4g -jar $GATK_JAR -T SplitNCigarReads -R ${reffa} -I ${pair_id}.clean.bam -o ${pair_id}.split.bam -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS
java -Xmx32g -jar $GATK_JAR -T RealignerTargetCreator -known ${knownindel} -R ${reffa} -o ${pair_id}.bam.list -I ${pair_id}.split.bam -nt $SLURM_CPUS_ON_NODE -nct 1
java -Xmx32g -jar $GATK_JAR -I ${pair_id}.split.bam -R ${reffa} --filter_mismatching_base_and_quals -T IndelRealigner -targetIntervals ${pair_id}.bam.list -o ${pair_id}.realigned.bam
......
#!/usr/bin/perl
#migrate_db.pl
my $vcf = shift @ARGV;
my $outfile = $vcf;
$outfile =~ s/vcf.gz/uniform.vcf/;
open OUT, ">$outfile" or die $!;
open VCF, "gunzip -c $vcf|" or die $!;
while (my $line = <VCF>) {
chomp($line);
if ($line =~ m/#/) {
print OUT $line,"\n";
next;
}
my ($chrom, $pos,$id,$ref,$alt,$score,
$filter,$annot,$format,@gts) = split(/\t/, $line);
my %hash = ();
foreach $a (split(/;/,$annot)) {
my ($key,$val) = split(/=/,$a);
$hash{$key} = $val;
}
my @deschead = split(/:/,$format);
my $newformat = 'GT:DP:AD:AO:RO';
my @newgts = ();
my $missingGT = 0;
FG:foreach my $allele_info (@gts) {
my @gtinfo = split(/:/,$allele_info);
my %gtdata;
if ($allele_info eq '.') {
push @newgts, '.:.:.:.:.';
$missingGT ++;
next FG;
}
foreach my $i (0..$#deschead) {
$gtdata{$deschead[$i]} = $gtinfo[$i];
}
if ($gtdata{DP} == 0 || $gtdata{GT} eq './.') {
push @newgts, '.:.:.:.:.';
$missingGT ++;
next FG;
}
if ($gtdata{AD}){
($gtdata{RO},@alts) = split(/,/,$gtdata{AD});
$gtdata{AO} = join(",",@alts);
$gtdata{DP} = $gtdata{RO};
foreach (@alts) {
$gtdata{DP} += $_;
}
} elsif (exists $gtdata{NR} && exists $gtdata{NV}) {
$gtdata{DP} = $gtdata{NR};
$gtdata{AO} = $gtdata{NV};
$gtdata{RO} = $gtdata{DP} - $gtdata{AO};
} elsif (exists $gtdata{AO} && exists $gtdata{RO}) {
$gtdata{AD} = join(',',$gtdata{RO},$gtdata{AO});
$gtdata{DP} = $gtdata{RO};
foreach (split(',',$gtdata{AO})) {
$gtdata{DP} += $_;
}
}
if ($gtdata{DP} && $gtdata{DP} < 5) {
$missingGT ++;
}
push @newgts, join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO});
}
next if ($missingGT == scalar(@gts));
print OUT join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n";
}
close VCF;
......@@ -28,9 +28,6 @@ list2=`ls *vcf.gz|grep -v hotspot`
varlist=''
calllist=''
for i in *.vcf.gz; do
EXT="${i#*.}"
CALL="${EXT%%.*}"
calllist="$calllist $CALL"
if [[ $i == $HS ]]
then
bedtools multiinter -i $list1 |cut -f 1,2,3 |bedtools intersect -header -v -a $i -b stdin |bgzip > hotspot.nooverlap.vcf.gz
......@@ -38,6 +35,6 @@ for i in *.vcf.gz; do
fi
done
perl $baseDir/unionvcf.pl $list2
perl $baseDir/unionvcf.pl ${index_path}/union.header.vcf $list2
perl $baseDir/vcfsorter.pl ${index_path}/genome.dict int.vcf |bgzip > ${pair_id}.union.vcf.gz
......@@ -3,23 +3,33 @@
my $headerfile = shift @ARGV;
my @vcffiles = @ARGV;
my $outfile = $vcf;
$outfile =~ s/vcf.gz/uniform.vcf/;
open HEADER, "<$headerfile" or die $!;
open OUT, ">union.vcf" or die $!;
open OUT, ">int.vcf" or die $!;
while (my $line = <HEADER>) {
print OUT $line;
chomp($line);
print OUT $line,"\n";;
}
close HEADER;
my @sampleorder;
my %headerlines;
foreach $vcf (@vcffiles) {
$caller = (split(/\./,$vcf))[1];
open VCF, "gunzip -c $vcf|" or die $!;
my @sampleids;
while (my $line = <VCF>) {
chomp($line);
if ($line =~ m/#/) {
$headerlines{$line} = 1;
next;
if ($line =~ m/#CHROM/) {
($chromhd, $posd,$idhd,$refhd,$althd,$scorehd,
$filterhd,$annothd,$formathd,@sampleids) = split(/\t/, $line);
unless (@sampleorder) {
@sampleorder = @sampleids;
print OUT $line,"\n";
}
next;
}
}
my ($chrom, $pos,$id,$ref,$alt,$score,
$filter,$annot,$format,@gts) = split(/\t/, $line);
......@@ -30,13 +40,15 @@ foreach $vcf (@vcffiles) {
}
my @deschead = split(/:/,$format);
my $newformat = 'GT:DP:AD:AO:RO';
my @newgts = ();
my %newgts;
my $missingGT = 0;
FG:foreach my $allele_info (@gts) {
FG:foreach my $i (0..$#gts) {
my $allele_info = $gts[$i];
my $sid = $sampleids[$i];
my @gtinfo = split(/:/,$allele_info);
my %gtdata;
if ($allele_info eq '.') {
push @newgts, '.:.:.:.:.';
$newgts{$sid} = '.:.:.:.:.';
$missingGT ++;
next FG;
}
......@@ -44,7 +56,7 @@ foreach $vcf (@vcffiles) {
$gtdata{$deschead[$i]} = $gtinfo[$i];
}
if ($gtdata{DP} == 0 || $gtdata{GT} eq './.') {
push @newgts, '.:.:.:.:.';
$newgts{$sid} = '.:.:.:.:.';
$missingGT ++;
next FG;
}
......@@ -69,9 +81,13 @@ foreach $vcf (@vcffiles) {
if ($gtdata{DP} && $gtdata{DP} < 3) {
$missingGT ++;
}
push @newgts, join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO});
$newgts{$sid} = join(":",$gtdata{GT},$gtdata{DP},$gtdata{AD},$gtdata{AO},$gtdata{RO});
}
next if ($missingGT == scalar(@gts));
my @newgts;
foreach $id (@sampleorder) {
push @newgts, $newgts{$id};
}
$lines{$chrom}{$pos}{$caller} = join("\t",$chrom,$pos,$id,$ref,$alt,$score,$filter,$annot,$newformat,@newgts),"\n";
}
close VCF;
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment