Skip to content
Snippets Groups Projects
Commit 34b23838 authored by Brandi Cantarel's avatar Brandi Cantarel
Browse files

update adding aggregate qc

parent 16061b5a
Branches
Tags
No related merge requests found
......@@ -13,7 +13,7 @@ usage() {
exit 1
}
OPTIND=1 # Reset OPTIND
while getopts :r:b:c:n:p:s:d:h opt
while getopts :r:b:c:n:p:e:s:d:h opt
do
case $opt in
r) index_path=$OPTARG;;
......@@ -22,6 +22,7 @@ do
n) nuctype=$OPTARG;;
p) pair_id=$OPTARG;;
d) dedup=$OPTARG;;
e) version=$OPTARG;;
s) skiplc=1;;
h) usage;;
esac
......@@ -63,9 +64,12 @@ if [[ $nuctype == 'dna' ]]; then
samtools index -@ $NPROC ${pair_id}.ontarget.bam
samtools flagstat ${pair_id}.ontarget.bam > ${pair_id}.ontarget.flagstat.txt
samtools view -@ $NPROC -b -q 1 ${sbam} | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt
java -Xmx64g -Djava.io.tmpdir=${tmpdir} -XX:ParallelGCThreads=$NPROC -jar $PICARD/picard.jar EstimateLibraryComplexity BARCODE_TAG=RG I=${sbam} OUTPUT=${pair_id}.libcomplex.txt TMP_DIR=${tmpdir}
#java -Xmx64g -Djava.io.tmpdir=${tmpdir} -jar $PICARD/picard.jar CollectAlignmentSummaryMetrics R=${index_path}/genome.fa I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt TMP_DIR=${tmpdir}
#java -Xmx64g -Djava.io.tmpdir=${tmpdir} -XX:ParallelGCThreads=$NPROC -jar $PICARD/picard.jar EstimateLibraryComplexity I=${sbam} OUTPUT=${pair_id}.libcomplex.txt TMP_DIR=${tmpdir}
#samtools view -@ $NPROC ${sbam} | awk '{sum+=$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt
fi
#java -Xmx64g -Djava.io.tmpdir=${tmpdir} -jar $PICARD/picard.jar CollectInsertSizeMetrics INPUT=${sbam} HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${index_path}/genome.fa OUTPUT=${pair_id}.hist.txt TMP_DIR=${tmpdir}
perl $baseDir/sequenceqc_dna.pl -e ${version} -r $index_path ${pair_id}.genomecov.txt
else
perl $baseDir/sequenceqc_rna.pl -e ${version} -r $index_path ${pair_id}.flagstat.txt
fi
#!/usr/bin/perl -w
#sequenceqc_alignment.p
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
my %opt = ();
my $results = GetOptions (\%opt,'refdir|r=s','help|h','gitdir|e=s','user|u=s');
my @statfiles = @ARGV;
my $fileowner = $opt{user};
my $gittag = 'v5';
if ($opt{gitdir}) {
$gittag = $opt{gitdir};
}elsif($ENV{'gitv'}) {
$gittag = $ENV{'gitv'};
}
foreach $sfile (@statfiles) {
$sfile =~ m/(\S+)\.genomecov.txt/;
my $prefix = $1;
my %hash;
open FLAG, "<$prefix\.trimreport.txt";
while (my $line = <FLAG>) {
chomp($line);
my ($file,$raw,$trim) = split(/\t/,$line);
$hash{rawct} += $raw;
}
open FLAG, "<$prefix\.flagstat.txt" or die $!;
while (my $line = <FLAG>) {
chomp($line);
if ($line =~ m/(\d+) \+ \d+ in total/) {
$hash{total} = $1;
}elsif ($line =~ m/(\d+) \+ \d+ read1/) {
$hash{pairs} = $1;
}elsif ($line =~ m/(\d+) \+ \d+ mapped/) {
$hash{maprate} = 100*sprintf("%.4f",$1/$hash{total});
}elsif ($line =~ m/(\d+) \+ \d+ properly paired/) {
$hash{propair} = 100*sprintf("%.4f",$1/$hash{total});
}
}
close FLAG;
open FLAG, "<$prefix\.ontarget.flagstat.txt" or die $!;
while (my $line = <FLAG>) {
chomp($line);
if ($line =~ m/(\d+) \+ \d+ in total/) {
$hash{ontarget} = $1;
}
}
unless ($hash{rawct}) {
$hash{rawct} = $hash{total};
}
my %lc;
open DUP, "<$prefix.libcomplex.txt" or die $!;
while (my $line = <DUP>) {
chomp($line);
if ($line =~ m/## METRICS/) {
$header = <DUP>;
$nums = <DUP>;
chomp($header);
chomp($nums);
my @stats = split(/\t/,$header);
my @row = split(/\t/,$nums);
my %info;
foreach my $i (0..$#stats) {
$info{$stats[$i]} = $row[$i];
}
$lc{TOTREADSLC} += $info{UNPAIRED_READS_EXAMINED} + $info{READ_PAIRS_EXAMINED};
$hash{libsize} = $info{ESTIMATED_LIBRARY_SIZE};
$lc{TOTDUPLC} += $info{UNPAIRED_READ_DUPLICATES} + $info{READ_PAIR_DUPLICATES};
}
}
close DUP;
$hash{percdups} = sprintf("%.4f",$lc{TOTDUPLC}/$lc{TOTREADSLC});
my %cov;
open COV, "<$prefix\.genomecov.txt" or die $!;
my $sumdepth;
my $totalbases;
while (my $line = <COV>) {
chomp($line);
my ($all,$depth,$bp,$total,$percent) = split(/\t/,$line);
$cov{$depth} = $percent;
$sumdepth += $depth*$bp;
$totalbases = $total unless ($totalbases);
}
my $avgdepth = sprintf("%.0f",$sumdepth/$totalbases);
my @depths = sort {$a <=> $b} keys %cov;
my @perc = @cov{@depths};
my @cum_sum = cumsum(@perc);
my $median = 0;
foreach my $i (0..$#cum_sum) {
if ($cum_sum[$i] < 0.5) {
$median = $i;
}
}
$hash{'perc100x'} = 100*sprintf("%.4f",1-$cum_sum[100]);
$hash{'perc200x'} = 100*sprintf("%.4f",1-$cum_sum[200]);
$hash{'perc500x'} = 100*sprintf("%.4f",1-$cum_sum[500]);
#### Begin File Information ########
my $status = 'PASS';
$status = 'FAIL' if ($hash{maprate} < 0.90 && $hash{'perc100x'} < 0.90);
my @stats = stat("$prefix\.flagstat.txt");
my ($day,$month,$year) = (localtime($stats[9]))[3,4,5];
$year += 1900;
$month++;
$date = join("-",$year,sprintf("%02s",$month),sprintf("%02s",$day));
$fileowner = 's'.$stats[4] unless $fileowner;
$hash{status} = $status;
$hash{date}=$date;
$hash{fileowner} = $fileowner;
##### End File Information ########
##### START separateFilesPerSample ######
open OUT, ">".$prefix.".sequence.stats.txt" or die $!;
print OUT join("\n", "Sample\t".$prefix,"Total_Raw_Count\t".$hash{rawct},
"On_Target\t".$hash{ontarget},"Map_Rate\t".$hash{maprate},
"Properly_Paired\t".$hash{propair},
"Percent_Duplicates\t".sprintf("%.2f",100*$hash{percdups}),
"Percent_on_Target\t".sprintf("%.2f",100*$hash{ontarget}/$hash{rawct}),
"All_Average_Depth\t".$avgdepth,"All_Median_Depth\t".$median,
"Percent_over_100x\t".$hash{'perc100x'},
"Percent_over_200x\t".$hash{'perc200x'},
"Percent_over_500x\t".$hash{'perc500x'},
"Alignment_Status\t".$hash{status},"Alignment_Date\t".$hash{date},
"File_Owner\t".$hash{fileowner},"Workflow_Version\t".$gittag),"\n";
close OUT;
system(qq{cat $opt{refdir}\/reference_info.txt >> $prefix\.sequence.stats.txt});
##### END separateFilesPerSample ######
}
sub cumsum {
my @nums = @_;
my @cumsum = ();
my $mid = 0;
for my $i (0..$#nums) {
$mid += $nums[$i];
push(@cumsum,$mid);
}
return @cumsum;
}
#!/usr/bin/perl -w
#sequenceqc_rnaseq.pl
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
my %opt = ();
my $results = GetOptions (\%opt,'refdir|r=s','help|h','gitdir|e=s','user|u=s');
my @files = @ARGV;
chomp(@files);
my $fileowner = $opt{user};
my $gittag = 'v5';
if ($opt{gitdir}) {
$gittag = $opt{gitdir};
}elsif($ENV{'gitv'}) {
$gittag = $ENV{'gitv'};
}
foreach my $file (@files) {
chomp($file);
$file =~ m/(\S+)\.flagstat.txt/;
my @directory = split(/\//, $file);
$prefix = $directory[-1];
my $sample = (split(/\./,$prefix))[0];
open OUT, ">".$sample.".sequence.stats.txt" or die;#separateFilesPerSample
$hisatfile = $file;
$hisatfile =~ s/flagstat/alignerout/;
open HISAT, "<$hisatfile";
my ($total, $pairs,$read2ct,$maprate,$concorrate);
while (my $line = <HISAT>) {
chomp($line);
if ($line =~ m/(\d+) reads; of these:/) {
$total = $1;
}elsif ($line =~ m/Number of input reads\s+\|\s+(\d+)/) {
$total = $1;
}elsif ($line =~ m/(\d+) \S+ were paired; of these:/) {
$pairs = $1;
$total = $pairs*2 if ($total == $pairs);
}elsif ($line =~ m/(\d+) pairs aligned concordantly 0 times; of these:/) {
$concorrate = sprintf("%.4f",1-($1/$pairs));
}elsif ($line =~ m/(\S+)% overall alignment rate/) {
$maprate = $1/100;
}
}
close HISAT;
open IN, "<$file" or die $!;
while ($line = <IN>) {
chomp($line);
if ($line =~ m/(\d+) \+ \d+ in total/) {
$total = $1 unless $total;
}elsif ($line =~ m/(\d+) \+ \d+ read1/) {
$pairs = $1;
}elsif ($line =~ m/(\d+) \+ \d+ read2/) {
$read2ct = $1;
}elsif ($line =~ m/(\d+) \+ \d+ mapped\s+\((\S+)%.+\)/) {
$maprate = sprintf("%.2f",$2/100) unless ($maprate);
}elsif ($line =~ m/(\d+) \+ \d+ properly paired\s+\((\S+)%.+\)/) {
$concorrate = sprintf("%.2f",$2/100) unless ($concorrate);
}
}
close IN;
my @stats=stat($file);
my ($day,$month,$year) = (localtime($stats[9]))[3,4,5];
$year += 1900;
$month++;
$date = join("-",$year,sprintf("%02s",$month),sprintf("%02s",$day));
$fileowner = 's'.$stats[4] unless $fileowner;
$total = $pairs*2 if ($total == $pairs);
my $status = 'PASS';
$status = 'FAIL' if ($maprate < 0.90);
$status = 'FAIL' if ($total < 6000000);
print OUT join("\n","Sample\t".$sample,"Total_Raw_Count\t".$total, "Read1_Map\t".$pairs,"Read2_Map\t".$read2ct,
"Map_Rate\t".$maprate,"Concordant_Rate\t".$concorrate,"Alignment_Status\t".$status,"Alignment_Date\t".$date,
"File_Owner\t".$fileowner,"Workflow_Version\t".$gittag),"\n";
system(qq{cat $opt{refdir}\/reference_info.txt >> $prefix\.sequence.stats.txt});
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment