update adding aggregate qc

34b23838 · Brandi Cantarel · 16061b5a · 34b23838 · 34b23838 · 34b23838
Commit 34b23838 authored 4 years ago by Brandi Cantarel
--- a/alignment/bamqc.sh
+++ b/alignment/bamqc.sh
@@ -13,7 +13,7 @@ usage() {
  exit 1
 }
 OPTIND=1 # Reset OPTIND
-while getopts :r:b:c:n:p:s:d:h opt
+while getopts :r:b:c:n:p:e:s:d:h opt
 do
    case $opt in
        r) index_path=$OPTARG;;
@@ -22,6 +22,7 @@ do
        n) nuctype=$OPTARG;;
        p) pair_id=$OPTARG;;
 	d) dedup=$OPTARG;;
+	e) version=$OPTARG;;
 	s) skiplc=1;;
        h) usage;;
    esac
@@ -63,9 +64,12 @@ if [[ $nuctype == 'dna' ]]; then
 	samtools index -@ $NPROC ${pair_id}.ontarget.bam
 	samtools flagstat  ${pair_id}.ontarget.bam > ${pair_id}.ontarget.flagstat.txt
 	samtools view  -@ $NPROC -b -q 1 ${sbam} | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt
+	java -Xmx64g -Djava.io.tmpdir=${tmpdir} -XX:ParallelGCThreads=$NPROC -jar $PICARD/picard.jar EstimateLibraryComplexity BARCODE_TAG=RG I=${sbam} OUTPUT=${pair_id}.libcomplex.txt TMP_DIR=${tmpdir}
 	#java -Xmx64g -Djava.io.tmpdir=${tmpdir} -jar $PICARD/picard.jar CollectAlignmentSummaryMetrics R=${index_path}/genome.fa I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt TMP_DIR=${tmpdir}
-	#java -Xmx64g -Djava.io.tmpdir=${tmpdir} -XX:ParallelGCThreads=$NPROC -jar $PICARD/picard.jar EstimateLibraryComplexity I=${sbam} OUTPUT=${pair_id}.libcomplex.txt TMP_DIR=${tmpdir}
 	#samtools view  -@ $NPROC ${sbam} | awk '{sum+=$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt
    fi
    #java -Xmx64g -Djava.io.tmpdir=${tmpdir} -jar $PICARD/picard.jar CollectInsertSizeMetrics INPUT=${sbam} HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${index_path}/genome.fa OUTPUT=${pair_id}.hist.txt TMP_DIR=${tmpdir}
+    perl $baseDir/sequenceqc_dna.pl -e ${version} -r $index_path ${pair_id}.genomecov.txt
+else
+    perl $baseDir/sequenceqc_rna.pl -e ${version} -r $index_path ${pair_id}.flagstat.txt
 fi
--- a/alignment/sequenceqc_dna.pl
+++ b/alignment/sequenceqc_dna.pl
+#!/usr/bin/perl -w
+#sequenceqc_alignment.p
+use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+my %opt = ();
+my $results = GetOptions (\%opt,'refdir|r=s','help|h','gitdir|e=s','user|u=s');
+my @statfiles = @ARGV;
+my $fileowner = $opt{user};
+my $gittag = 'v5';
+if ($opt{gitdir}) {
+    $gittag = $opt{gitdir};
+}elsif($ENV{'gitv'}) {
+    $gittag = $ENV{'gitv'};
+}
+foreach $sfile (@statfiles) {
+  $sfile =~ m/(\S+)\.genomecov.txt/;
+  my $prefix = $1;
+  my %hash;
+  open FLAG, "<$prefix\.trimreport.txt";
+  while (my $line = <FLAG>) {
+    chomp($line);
+    my ($file,$raw,$trim) = split(/\t/,$line);
+    $hash{rawct} += $raw;
+  }
+  open FLAG, "<$prefix\.flagstat.txt" or die $!;
+  while (my $line = <FLAG>) {
+    chomp($line);
+    if ($line =~ m/(\d+) \+ \d+ in total/) {
+      $hash{total} = $1;
+    }elsif ($line =~ m/(\d+) \+ \d+ read1/) {
+      $hash{pairs} = $1;
+    }elsif ($line =~ m/(\d+) \+ \d+ mapped/) {
+      $hash{maprate} = 100*sprintf("%.4f",$1/$hash{total});
+    }elsif ($line =~ m/(\d+) \+ \d+ properly paired/) {
+      $hash{propair} = 100*sprintf("%.4f",$1/$hash{total});
+    }
+  }
+  close FLAG;
+  open FLAG, "<$prefix\.ontarget.flagstat.txt" or die $!;
+  while (my $line = <FLAG>) {
+    chomp($line);
+    if ($line =~ m/(\d+) \+ \d+ in total/) {
+      $hash{ontarget} = $1;
+    }
+  }
+  unless ($hash{rawct}) {
+    $hash{rawct} = $hash{total};
+  }
+  my %lc;
+  open DUP, "<$prefix.libcomplex.txt" or die $!;
+  while (my $line = <DUP>) {
+    chomp($line);
+    if ($line =~ m/## METRICS/) {
+      $header = <DUP>;
+      $nums = <DUP>;
+      chomp($header);
+      chomp($nums);
+      my @stats = split(/\t/,$header);
+      my @row = split(/\t/,$nums);
+      my %info;
+      foreach my $i (0..$#stats) {
+	$info{$stats[$i]} = $row[$i];
+      }
+      $lc{TOTREADSLC} += $info{UNPAIRED_READS_EXAMINED} + $info{READ_PAIRS_EXAMINED};
+      $hash{libsize} = $info{ESTIMATED_LIBRARY_SIZE};
+      $lc{TOTDUPLC} +=  $info{UNPAIRED_READ_DUPLICATES} + $info{READ_PAIR_DUPLICATES};
+    }
+  }
+  close DUP;
+  $hash{percdups} = sprintf("%.4f",$lc{TOTDUPLC}/$lc{TOTREADSLC});
+  my %cov;
+  open COV, "<$prefix\.genomecov.txt" or die $!;
+  my $sumdepth;
+  my $totalbases;
+  while (my $line = <COV>) {
+    chomp($line);
+    my ($all,$depth,$bp,$total,$percent) = split(/\t/,$line);
+    $cov{$depth} = $percent;
+    $sumdepth += $depth*$bp;
+    $totalbases = $total unless ($totalbases);
+  }
+  my $avgdepth = sprintf("%.0f",$sumdepth/$totalbases);
+  my @depths = sort {$a <=> $b} keys %cov;
+  my @perc = @cov{@depths};
+  my @cum_sum = cumsum(@perc);
+  my $median = 0;
+  foreach my $i (0..$#cum_sum) {
+    if ($cum_sum[$i] < 0.5) {
+      $median = $i;
+    }
+  }
+  $hash{'perc100x'} = 100*sprintf("%.4f",1-$cum_sum[100]);
+  $hash{'perc200x'} = 100*sprintf("%.4f",1-$cum_sum[200]);
+  $hash{'perc500x'} = 100*sprintf("%.4f",1-$cum_sum[500]);
+  #### Begin File Information ########
+  my $status = 'PASS';
+  $status = 'FAIL' if ($hash{maprate} < 0.90 && $hash{'perc100x'} < 0.90);
+  my @stats = stat("$prefix\.flagstat.txt");
+  my ($day,$month,$year) = (localtime($stats[9]))[3,4,5];
+  $year += 1900;
+  $month++;
+  $date = join("-",$year,sprintf("%02s",$month),sprintf("%02s",$day));
+  $fileowner = 's'.$stats[4] unless $fileowner;
+  $hash{status} = $status;
+  $hash{date}=$date;
+  $hash{fileowner} = $fileowner;
+  ##### End File Information ########
+  ##### START separateFilesPerSample ######
+  open OUT, ">".$prefix.".sequence.stats.txt" or die $!;
+  print OUT join("\n", "Sample\t".$prefix,"Total_Raw_Count\t".$hash{rawct},
+		 "On_Target\t".$hash{ontarget},"Map_Rate\t".$hash{maprate},
+		 "Properly_Paired\t".$hash{propair},
+		 "Percent_Duplicates\t".sprintf("%.2f",100*$hash{percdups}),
+		 "Percent_on_Target\t".sprintf("%.2f",100*$hash{ontarget}/$hash{rawct}),
+		 "All_Average_Depth\t".$avgdepth,"All_Median_Depth\t".$median,
+		 "Percent_over_100x\t".$hash{'perc100x'},
+		 "Percent_over_200x\t".$hash{'perc200x'},
+		 "Percent_over_500x\t".$hash{'perc500x'},
+		 "Alignment_Status\t".$hash{status},"Alignment_Date\t".$hash{date},
+		 "File_Owner\t".$hash{fileowner},"Workflow_Version\t".$gittag),"\n";
+  close OUT;
+  system(qq{cat $opt{refdir}\/reference_info.txt >> $prefix\.sequence.stats.txt});
+  ##### END separateFilesPerSample ######
+}
+sub cumsum {
+  my @nums = @_;
+  my @cumsum = ();
+  my $mid = 0;
+  for my $i (0..$#nums) {
+    $mid += $nums[$i];
+    push(@cumsum,$mid);
+  }
+  return @cumsum;
+}
--- a/alignment/sequenceqc_rna.pl
+++ b/alignment/sequenceqc_rna.pl
+#!/usr/bin/perl -w
+#sequenceqc_rnaseq.pl
+use Getopt::Long qw(:config no_ignore_case no_auto_abbrev);
+my %opt = ();
+my $results = GetOptions (\%opt,'refdir|r=s','help|h','gitdir|e=s','user|u=s');
+my @files = @ARGV;
+chomp(@files);
+my $fileowner = $opt{user};
+my $gittag = 'v5';
+if ($opt{gitdir}) {
+    $gittag = $opt{gitdir};
+}elsif($ENV{'gitv'}) {
+    $gittag = $ENV{'gitv'};
+}
+foreach my $file (@files) {
+  chomp($file);
+  $file =~ m/(\S+)\.flagstat.txt/;
+  my @directory = split(/\//, $file);
+  $prefix = $directory[-1];
+  my $sample = (split(/\./,$prefix))[0];
+  open OUT, ">".$sample.".sequence.stats.txt" or die;#separateFilesPerSample
+  $hisatfile = $file;
+  $hisatfile =~ s/flagstat/alignerout/;
+  open HISAT, "<$hisatfile";
+  my ($total, $pairs,$read2ct,$maprate,$concorrate);
+  while (my $line = <HISAT>) {
+    chomp($line);
+    if ($line =~ m/(\d+) reads; of these:/) {
+      $total = $1;
+    }elsif ($line =~ m/Number of input reads\s+\|\s+(\d+)/) {
+	$total = $1;
+    }elsif ($line =~ m/(\d+) \S+ were paired; of these:/) {
+      $pairs = $1;
+      $total = $pairs*2 if ($total == $pairs);
+    }elsif ($line =~ m/(\d+) pairs aligned concordantly 0 times; of these:/) {
+      $concorrate = sprintf("%.4f",1-($1/$pairs));
+    }elsif ($line =~ m/(\S+)% overall alignment rate/) {
+      $maprate = $1/100;
+    }
+  }
+  close HISAT;
+  open IN, "<$file" or die $!;
+  while ($line = <IN>) {
+    chomp($line);
+    if ($line =~ m/(\d+) \+ \d+ in total/) {
+      $total = $1 unless $total;
+    }elsif ($line =~ m/(\d+) \+ \d+ read1/) {
+      $pairs = $1;
+    }elsif ($line =~ m/(\d+) \+ \d+ read2/) {
+      $read2ct = $1;
+    }elsif ($line =~ m/(\d+) \+ \d+ mapped\s+\((\S+)%.+\)/) {
+      $maprate = sprintf("%.2f",$2/100) unless ($maprate);
+    }elsif ($line =~ m/(\d+) \+ \d+ properly paired\s+\((\S+)%.+\)/) {
+      $concorrate = sprintf("%.2f",$2/100) unless ($concorrate);
+    }
+  }
+  close IN;
+  my @stats=stat($file);
+  my ($day,$month,$year) = (localtime($stats[9]))[3,4,5];
+  $year += 1900;
+  $month++;
+  $date = join("-",$year,sprintf("%02s",$month),sprintf("%02s",$day));
+  $fileowner = 's'.$stats[4] unless $fileowner;
+  $total = $pairs*2 if ($total == $pairs);
+  my $status = 'PASS';
+  $status = 'FAIL' if ($maprate < 0.90);
+  $status = 'FAIL' if ($total < 6000000);  
+  print OUT join("\n","Sample\t".$sample,"Total_Raw_Count\t".$total, "Read1_Map\t".$pairs,"Read2_Map\t".$read2ct,
+    "Map_Rate\t".$maprate,"Concordant_Rate\t".$concorrate,"Alignment_Status\t".$status,"Alignment_Date\t".$date,
+    "File_Owner\t".$fileowner,"Workflow_Version\t".$gittag),"\n";
+  system(qq{cat $opt{refdir}\/reference_info.txt >> $prefix\.sequence.stats.txt});
+}