Skip to content
Snippets Groups Projects
Commit 34b23838 authored by Brandi Cantarel's avatar Brandi Cantarel
Browse files

update adding aggregate qc

parent 16061b5a
No related merge requests found
...@@ -13,7 +13,7 @@ usage() { ...@@ -13,7 +13,7 @@ usage() {
exit 1 exit 1
} }
OPTIND=1 # Reset OPTIND OPTIND=1 # Reset OPTIND
while getopts :r:b:c:n:p:s:d:h opt while getopts :r:b:c:n:p:e:s:d:h opt
do do
case $opt in case $opt in
r) index_path=$OPTARG;; r) index_path=$OPTARG;;
...@@ -22,6 +22,7 @@ do ...@@ -22,6 +22,7 @@ do
n) nuctype=$OPTARG;; n) nuctype=$OPTARG;;
p) pair_id=$OPTARG;; p) pair_id=$OPTARG;;
d) dedup=$OPTARG;; d) dedup=$OPTARG;;
e) version=$OPTARG;;
s) skiplc=1;; s) skiplc=1;;
h) usage;; h) usage;;
esac esac
...@@ -63,9 +64,12 @@ if [[ $nuctype == 'dna' ]]; then ...@@ -63,9 +64,12 @@ if [[ $nuctype == 'dna' ]]; then
samtools index -@ $NPROC ${pair_id}.ontarget.bam samtools index -@ $NPROC ${pair_id}.ontarget.bam
samtools flagstat ${pair_id}.ontarget.bam > ${pair_id}.ontarget.flagstat.txt samtools flagstat ${pair_id}.ontarget.bam > ${pair_id}.ontarget.flagstat.txt
samtools view -@ $NPROC -b -q 1 ${sbam} | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt samtools view -@ $NPROC -b -q 1 ${sbam} | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt
java -Xmx64g -Djava.io.tmpdir=${tmpdir} -XX:ParallelGCThreads=$NPROC -jar $PICARD/picard.jar EstimateLibraryComplexity BARCODE_TAG=RG I=${sbam} OUTPUT=${pair_id}.libcomplex.txt TMP_DIR=${tmpdir}
#java -Xmx64g -Djava.io.tmpdir=${tmpdir} -jar $PICARD/picard.jar CollectAlignmentSummaryMetrics R=${index_path}/genome.fa I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt TMP_DIR=${tmpdir} #java -Xmx64g -Djava.io.tmpdir=${tmpdir} -jar $PICARD/picard.jar CollectAlignmentSummaryMetrics R=${index_path}/genome.fa I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt TMP_DIR=${tmpdir}
#java -Xmx64g -Djava.io.tmpdir=${tmpdir} -XX:ParallelGCThreads=$NPROC -jar $PICARD/picard.jar EstimateLibraryComplexity I=${sbam} OUTPUT=${pair_id}.libcomplex.txt TMP_DIR=${tmpdir}
#samtools view -@ $NPROC ${sbam} | awk '{sum+=$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt #samtools view -@ $NPROC ${sbam} | awk '{sum+=$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt
fi fi
#java -Xmx64g -Djava.io.tmpdir=${tmpdir} -jar $PICARD/picard.jar CollectInsertSizeMetrics INPUT=${sbam} HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${index_path}/genome.fa OUTPUT=${pair_id}.hist.txt TMP_DIR=${tmpdir} #java -Xmx64g -Djava.io.tmpdir=${tmpdir} -jar $PICARD/picard.jar CollectInsertSizeMetrics INPUT=${sbam} HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${index_path}/genome.fa OUTPUT=${pair_id}.hist.txt TMP_DIR=${tmpdir}
perl $baseDir/sequenceqc_dna.pl -e ${version} -r $index_path ${pair_id}.genomecov.txt
else
perl $baseDir/sequenceqc_rna.pl -e ${version} -r $index_path ${pair_id}.flagstat.txt
fi fi
#!/usr/bin/perl -w
#sequenceqc_alignment.p
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
my %opt = ();
my $results = GetOptions (\%opt,'refdir|r=s','help|h','gitdir|e=s','user|u=s');
my @statfiles = @ARGV;
my $fileowner = $opt{user};
my $gittag = 'v5';
if ($opt{gitdir}) {
$gittag = $opt{gitdir};
}elsif($ENV{'gitv'}) {
$gittag = $ENV{'gitv'};
}
foreach $sfile (@statfiles) {
$sfile =~ m/(\S+)\.genomecov.txt/;
my $prefix = $1;
my %hash;
open FLAG, "<$prefix\.trimreport.txt";
while (my $line = <FLAG>) {
chomp($line);
my ($file,$raw,$trim) = split(/\t/,$line);
$hash{rawct} += $raw;
}
open FLAG, "<$prefix\.flagstat.txt" or die $!;
while (my $line = <FLAG>) {
chomp($line);
if ($line =~ m/(\d+) \+ \d+ in total/) {
$hash{total} = $1;
}elsif ($line =~ m/(\d+) \+ \d+ read1/) {
$hash{pairs} = $1;
}elsif ($line =~ m/(\d+) \+ \d+ mapped/) {
$hash{maprate} = 100*sprintf("%.4f",$1/$hash{total});
}elsif ($line =~ m/(\d+) \+ \d+ properly paired/) {
$hash{propair} = 100*sprintf("%.4f",$1/$hash{total});
}
}
close FLAG;
open FLAG, "<$prefix\.ontarget.flagstat.txt" or die $!;
while (my $line = <FLAG>) {
chomp($line);
if ($line =~ m/(\d+) \+ \d+ in total/) {
$hash{ontarget} = $1;
}
}
unless ($hash{rawct}) {
$hash{rawct} = $hash{total};
}
my %lc;
open DUP, "<$prefix.libcomplex.txt" or die $!;
while (my $line = <DUP>) {
chomp($line);
if ($line =~ m/## METRICS/) {
$header = <DUP>;
$nums = <DUP>;
chomp($header);
chomp($nums);
my @stats = split(/\t/,$header);
my @row = split(/\t/,$nums);
my %info;
foreach my $i (0..$#stats) {
$info{$stats[$i]} = $row[$i];
}
$lc{TOTREADSLC} += $info{UNPAIRED_READS_EXAMINED} + $info{READ_PAIRS_EXAMINED};
$hash{libsize} = $info{ESTIMATED_LIBRARY_SIZE};
$lc{TOTDUPLC} += $info{UNPAIRED_READ_DUPLICATES} + $info{READ_PAIR_DUPLICATES};
}
}
close DUP;
$hash{percdups} = sprintf("%.4f",$lc{TOTDUPLC}/$lc{TOTREADSLC});
my %cov;
open COV, "<$prefix\.genomecov.txt" or die $!;
my $sumdepth;
my $totalbases;
while (my $line = <COV>) {
chomp($line);
my ($all,$depth,$bp,$total,$percent) = split(/\t/,$line);
$cov{$depth} = $percent;
$sumdepth += $depth*$bp;
$totalbases = $total unless ($totalbases);
}
my $avgdepth = sprintf("%.0f",$sumdepth/$totalbases);
my @depths = sort {$a <=> $b} keys %cov;
my @perc = @cov{@depths};
my @cum_sum = cumsum(@perc);
my $median = 0;
foreach my $i (0..$#cum_sum) {
if ($cum_sum[$i] < 0.5) {
$median = $i;
}
}
$hash{'perc100x'} = 100*sprintf("%.4f",1-$cum_sum[100]);
$hash{'perc200x'} = 100*sprintf("%.4f",1-$cum_sum[200]);
$hash{'perc500x'} = 100*sprintf("%.4f",1-$cum_sum[500]);
#### Begin File Information ########
my $status = 'PASS';
$status = 'FAIL' if ($hash{maprate} < 0.90 && $hash{'perc100x'} < 0.90);
my @stats = stat("$prefix\.flagstat.txt");
my ($day,$month,$year) = (localtime($stats[9]))[3,4,5];
$year += 1900;
$month++;
$date = join("-",$year,sprintf("%02s",$month),sprintf("%02s",$day));
$fileowner = 's'.$stats[4] unless $fileowner;
$hash{status} = $status;
$hash{date}=$date;
$hash{fileowner} = $fileowner;
##### End File Information ########
##### START separateFilesPerSample ######
open OUT, ">".$prefix.".sequence.stats.txt" or die $!;
print OUT join("\n", "Sample\t".$prefix,"Total_Raw_Count\t".$hash{rawct},
"On_Target\t".$hash{ontarget},"Map_Rate\t".$hash{maprate},
"Properly_Paired\t".$hash{propair},
"Percent_Duplicates\t".sprintf("%.2f",100*$hash{percdups}),
"Percent_on_Target\t".sprintf("%.2f",100*$hash{ontarget}/$hash{rawct}),
"All_Average_Depth\t".$avgdepth,"All_Median_Depth\t".$median,
"Percent_over_100x\t".$hash{'perc100x'},
"Percent_over_200x\t".$hash{'perc200x'},
"Percent_over_500x\t".$hash{'perc500x'},
"Alignment_Status\t".$hash{status},"Alignment_Date\t".$hash{date},
"File_Owner\t".$hash{fileowner},"Workflow_Version\t".$gittag),"\n";
close OUT;
system(qq{cat $opt{refdir}\/reference_info.txt >> $prefix\.sequence.stats.txt});
##### END separateFilesPerSample ######
}
sub cumsum {
my @nums = @_;
my @cumsum = ();
my $mid = 0;
for my $i (0..$#nums) {
$mid += $nums[$i];
push(@cumsum,$mid);
}
return @cumsum;
}
#!/usr/bin/perl -w
#sequenceqc_rnaseq.pl
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
my %opt = ();
my $results = GetOptions (\%opt,'refdir|r=s','help|h','gitdir|e=s','user|u=s');
my @files = @ARGV;
chomp(@files);
my $fileowner = $opt{user};
my $gittag = 'v5';
if ($opt{gitdir}) {
$gittag = $opt{gitdir};
}elsif($ENV{'gitv'}) {
$gittag = $ENV{'gitv'};
}
foreach my $file (@files) {
chomp($file);
$file =~ m/(\S+)\.flagstat.txt/;
my @directory = split(/\//, $file);
$prefix = $directory[-1];
my $sample = (split(/\./,$prefix))[0];
open OUT, ">".$sample.".sequence.stats.txt" or die;#separateFilesPerSample
$hisatfile = $file;
$hisatfile =~ s/flagstat/alignerout/;
open HISAT, "<$hisatfile";
my ($total, $pairs,$read2ct,$maprate,$concorrate);
while (my $line = <HISAT>) {
chomp($line);
if ($line =~ m/(\d+) reads; of these:/) {
$total = $1;
}elsif ($line =~ m/Number of input reads\s+\|\s+(\d+)/) {
$total = $1;
}elsif ($line =~ m/(\d+) \S+ were paired; of these:/) {
$pairs = $1;
$total = $pairs*2 if ($total == $pairs);
}elsif ($line =~ m/(\d+) pairs aligned concordantly 0 times; of these:/) {
$concorrate = sprintf("%.4f",1-($1/$pairs));
}elsif ($line =~ m/(\S+)% overall alignment rate/) {
$maprate = $1/100;
}
}
close HISAT;
open IN, "<$file" or die $!;
while ($line = <IN>) {
chomp($line);
if ($line =~ m/(\d+) \+ \d+ in total/) {
$total = $1 unless $total;
}elsif ($line =~ m/(\d+) \+ \d+ read1/) {
$pairs = $1;
}elsif ($line =~ m/(\d+) \+ \d+ read2/) {
$read2ct = $1;
}elsif ($line =~ m/(\d+) \+ \d+ mapped\s+\((\S+)%.+\)/) {
$maprate = sprintf("%.2f",$2/100) unless ($maprate);
}elsif ($line =~ m/(\d+) \+ \d+ properly paired\s+\((\S+)%.+\)/) {
$concorrate = sprintf("%.2f",$2/100) unless ($concorrate);
}
}
close IN;
my @stats=stat($file);
my ($day,$month,$year) = (localtime($stats[9]))[3,4,5];
$year += 1900;
$month++;
$date = join("-",$year,sprintf("%02s",$month),sprintf("%02s",$day));
$fileowner = 's'.$stats[4] unless $fileowner;
$total = $pairs*2 if ($total == $pairs);
my $status = 'PASS';
$status = 'FAIL' if ($maprate < 0.90);
$status = 'FAIL' if ($total < 6000000);
print OUT join("\n","Sample\t".$sample,"Total_Raw_Count\t".$total, "Read1_Map\t".$pairs,"Read2_Map\t".$read2ct,
"Map_Rate\t".$maprate,"Concordant_Rate\t".$concorrate,"Alignment_Status\t".$status,"Alignment_Date\t".$date,
"File_Owner\t".$fileowner,"Workflow_Version\t".$gittag),"\n";
system(qq{cat $opt{refdir}\/reference_info.txt >> $prefix\.sequence.stats.txt});
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment