diff --git a/alignment/bamqc.sh b/alignment/bamqc.sh index 52f904e385e2e536a3f8e8858d0e2dbb179536b1..79db6ad4b87fd88e9c63064e89c4fbb874710ebc 100644 --- a/alignment/bamqc.sh +++ b/alignment/bamqc.sh @@ -13,7 +13,7 @@ usage() { exit 1 } OPTIND=1 # Reset OPTIND -while getopts :r:b:c:n:p:s:d:h opt +while getopts :r:b:c:n:p:e:s:d:h opt do case $opt in r) index_path=$OPTARG;; @@ -22,6 +22,7 @@ do n) nuctype=$OPTARG;; p) pair_id=$OPTARG;; d) dedup=$OPTARG;; + e) version=$OPTARG;; s) skiplc=1;; h) usage;; esac @@ -63,9 +64,12 @@ if [[ $nuctype == 'dna' ]]; then samtools index -@ $NPROC ${pair_id}.ontarget.bam samtools flagstat ${pair_id}.ontarget.bam > ${pair_id}.ontarget.flagstat.txt samtools view -@ $NPROC -b -q 1 ${sbam} | bedtools coverage -sorted -hist -g ${index_path}/genomefile.txt -b stdin -a ${bed} > ${pair_id}.mapqualcov.txt + java -Xmx64g -Djava.io.tmpdir=${tmpdir} -XX:ParallelGCThreads=$NPROC -jar $PICARD/picard.jar EstimateLibraryComplexity BARCODE_TAG=RG I=${sbam} OUTPUT=${pair_id}.libcomplex.txt TMP_DIR=${tmpdir} #java -Xmx64g -Djava.io.tmpdir=${tmpdir} -jar $PICARD/picard.jar CollectAlignmentSummaryMetrics R=${index_path}/genome.fa I=${pair_id}.ontarget.bam OUTPUT=${pair_id}.alignmentsummarymetrics.txt TMP_DIR=${tmpdir} - #java -Xmx64g -Djava.io.tmpdir=${tmpdir} -XX:ParallelGCThreads=$NPROC -jar $PICARD/picard.jar EstimateLibraryComplexity I=${sbam} OUTPUT=${pair_id}.libcomplex.txt TMP_DIR=${tmpdir} #samtools view -@ $NPROC ${sbam} | awk '{sum+=$5} END { print "Mean MAPQ =",sum/NR}' > ${pair_id}.meanmap.txt fi #java -Xmx64g -Djava.io.tmpdir=${tmpdir} -jar $PICARD/picard.jar CollectInsertSizeMetrics INPUT=${sbam} HISTOGRAM_FILE=${pair_id}.hist.ps REFERENCE_SEQUENCE=${index_path}/genome.fa OUTPUT=${pair_id}.hist.txt TMP_DIR=${tmpdir} + perl $baseDir/sequenceqc_dna.pl -e ${version} -r $index_path ${pair_id}.genomecov.txt +else + perl $baseDir/sequenceqc_rna.pl -e ${version} -r $index_path ${pair_id}.flagstat.txt fi diff --git a/alignment/sequenceqc_dna.pl b/alignment/sequenceqc_dna.pl new file mode 100755 index 0000000000000000000000000000000000000000..9d573b89a2843cbf3e777b7779adddb509bcf333 --- /dev/null +++ b/alignment/sequenceqc_dna.pl @@ -0,0 +1,139 @@ +#!/usr/bin/perl -w +#sequenceqc_alignment.p + +use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); +my %opt = (); +my $results = GetOptions (\%opt,'refdir|r=s','help|h','gitdir|e=s','user|u=s'); + +my @statfiles = @ARGV; + +my $fileowner = $opt{user}; +my $gittag = 'v5'; +if ($opt{gitdir}) { + $gittag = $opt{gitdir}; +}elsif($ENV{'gitv'}) { + $gittag = $ENV{'gitv'}; +} +foreach $sfile (@statfiles) { + $sfile =~ m/(\S+)\.genomecov.txt/; + my $prefix = $1; + my %hash; + open FLAG, "<$prefix\.trimreport.txt"; + while (my $line = <FLAG>) { + chomp($line); + my ($file,$raw,$trim) = split(/\t/,$line); + $hash{rawct} += $raw; + } + open FLAG, "<$prefix\.flagstat.txt" or die $!; + while (my $line = <FLAG>) { + chomp($line); + if ($line =~ m/(\d+) \+ \d+ in total/) { + $hash{total} = $1; + }elsif ($line =~ m/(\d+) \+ \d+ read1/) { + $hash{pairs} = $1; + }elsif ($line =~ m/(\d+) \+ \d+ mapped/) { + $hash{maprate} = 100*sprintf("%.4f",$1/$hash{total}); + }elsif ($line =~ m/(\d+) \+ \d+ properly paired/) { + $hash{propair} = 100*sprintf("%.4f",$1/$hash{total}); + } + } + close FLAG; + open FLAG, "<$prefix\.ontarget.flagstat.txt" or die $!; + while (my $line = <FLAG>) { + chomp($line); + if ($line =~ m/(\d+) \+ \d+ in total/) { + $hash{ontarget} = $1; + } + } + unless ($hash{rawct}) { + $hash{rawct} = $hash{total}; + } + my %lc; + open DUP, "<$prefix.libcomplex.txt" or die $!; + while (my $line = <DUP>) { + chomp($line); + if ($line =~ m/## METRICS/) { + $header = <DUP>; + $nums = <DUP>; + chomp($header); + chomp($nums); + my @stats = split(/\t/,$header); + my @row = split(/\t/,$nums); + my %info; + foreach my $i (0..$#stats) { + $info{$stats[$i]} = $row[$i]; + } + $lc{TOTREADSLC} += $info{UNPAIRED_READS_EXAMINED} + $info{READ_PAIRS_EXAMINED}; + $hash{libsize} = $info{ESTIMATED_LIBRARY_SIZE}; + $lc{TOTDUPLC} += $info{UNPAIRED_READ_DUPLICATES} + $info{READ_PAIR_DUPLICATES}; + } + } + close DUP; + $hash{percdups} = sprintf("%.4f",$lc{TOTDUPLC}/$lc{TOTREADSLC}); + my %cov; + open COV, "<$prefix\.genomecov.txt" or die $!; + my $sumdepth; + my $totalbases; + while (my $line = <COV>) { + chomp($line); + my ($all,$depth,$bp,$total,$percent) = split(/\t/,$line); + $cov{$depth} = $percent; + $sumdepth += $depth*$bp; + $totalbases = $total unless ($totalbases); + } + my $avgdepth = sprintf("%.0f",$sumdepth/$totalbases); + my @depths = sort {$a <=> $b} keys %cov; + my @perc = @cov{@depths}; + my @cum_sum = cumsum(@perc); + my $median = 0; + foreach my $i (0..$#cum_sum) { + if ($cum_sum[$i] < 0.5) { + $median = $i; + } + } + $hash{'perc100x'} = 100*sprintf("%.4f",1-$cum_sum[100]); + $hash{'perc200x'} = 100*sprintf("%.4f",1-$cum_sum[200]); + $hash{'perc500x'} = 100*sprintf("%.4f",1-$cum_sum[500]); + + #### Begin File Information ######## + my $status = 'PASS'; + $status = 'FAIL' if ($hash{maprate} < 0.90 && $hash{'perc100x'} < 0.90); + my @stats = stat("$prefix\.flagstat.txt"); + my ($day,$month,$year) = (localtime($stats[9]))[3,4,5]; + $year += 1900; + $month++; + $date = join("-",$year,sprintf("%02s",$month),sprintf("%02s",$day)); + $fileowner = 's'.$stats[4] unless $fileowner; + $hash{status} = $status; + $hash{date}=$date; + $hash{fileowner} = $fileowner; + ##### End File Information ######## + ##### START separateFilesPerSample ###### + open OUT, ">".$prefix.".sequence.stats.txt" or die $!; + print OUT join("\n", "Sample\t".$prefix,"Total_Raw_Count\t".$hash{rawct}, + "On_Target\t".$hash{ontarget},"Map_Rate\t".$hash{maprate}, + "Properly_Paired\t".$hash{propair}, + "Percent_Duplicates\t".sprintf("%.2f",100*$hash{percdups}), + "Percent_on_Target\t".sprintf("%.2f",100*$hash{ontarget}/$hash{rawct}), + "All_Average_Depth\t".$avgdepth,"All_Median_Depth\t".$median, + "Percent_over_100x\t".$hash{'perc100x'}, + "Percent_over_200x\t".$hash{'perc200x'}, + "Percent_over_500x\t".$hash{'perc500x'}, + "Alignment_Status\t".$hash{status},"Alignment_Date\t".$hash{date}, + "File_Owner\t".$hash{fileowner},"Workflow_Version\t".$gittag),"\n"; + close OUT; + system(qq{cat $opt{refdir}\/reference_info.txt >> $prefix\.sequence.stats.txt}); + ##### END separateFilesPerSample ###### +} + + +sub cumsum { + my @nums = @_; + my @cumsum = (); + my $mid = 0; + for my $i (0..$#nums) { + $mid += $nums[$i]; + push(@cumsum,$mid); + } + return @cumsum; +} diff --git a/alignment/sequenceqc_rna.pl b/alignment/sequenceqc_rna.pl new file mode 100755 index 0000000000000000000000000000000000000000..906b802404a6ba82cd463ca09ebe92f1d4d6c931 --- /dev/null +++ b/alignment/sequenceqc_rna.pl @@ -0,0 +1,78 @@ +#!/usr/bin/perl -w +#sequenceqc_rnaseq.pl + +use Getopt::Long qw(:config no_ignore_case no_auto_abbrev); +my %opt = (); +my $results = GetOptions (\%opt,'refdir|r=s','help|h','gitdir|e=s','user|u=s'); + +my @files = @ARGV; +chomp(@files); + +my $fileowner = $opt{user}; +my $gittag = 'v5'; +if ($opt{gitdir}) { + $gittag = $opt{gitdir}; +}elsif($ENV{'gitv'}) { + $gittag = $ENV{'gitv'}; +} + +foreach my $file (@files) { + chomp($file); + $file =~ m/(\S+)\.flagstat.txt/; + my @directory = split(/\//, $file); + $prefix = $directory[-1]; + my $sample = (split(/\./,$prefix))[0]; + open OUT, ">".$sample.".sequence.stats.txt" or die;#separateFilesPerSample + $hisatfile = $file; + $hisatfile =~ s/flagstat/alignerout/; + open HISAT, "<$hisatfile"; + my ($total, $pairs,$read2ct,$maprate,$concorrate); + while (my $line = <HISAT>) { + chomp($line); + if ($line =~ m/(\d+) reads; of these:/) { + $total = $1; + }elsif ($line =~ m/Number of input reads\s+\|\s+(\d+)/) { + $total = $1; + }elsif ($line =~ m/(\d+) \S+ were paired; of these:/) { + $pairs = $1; + $total = $pairs*2 if ($total == $pairs); + }elsif ($line =~ m/(\d+) pairs aligned concordantly 0 times; of these:/) { + $concorrate = sprintf("%.4f",1-($1/$pairs)); + }elsif ($line =~ m/(\S+)% overall alignment rate/) { + $maprate = $1/100; + } + } + close HISAT; + open IN, "<$file" or die $!; + while ($line = <IN>) { + chomp($line); + if ($line =~ m/(\d+) \+ \d+ in total/) { + $total = $1 unless $total; + }elsif ($line =~ m/(\d+) \+ \d+ read1/) { + $pairs = $1; + }elsif ($line =~ m/(\d+) \+ \d+ read2/) { + $read2ct = $1; + }elsif ($line =~ m/(\d+) \+ \d+ mapped\s+\((\S+)%.+\)/) { + $maprate = sprintf("%.2f",$2/100) unless ($maprate); + }elsif ($line =~ m/(\d+) \+ \d+ properly paired\s+\((\S+)%.+\)/) { + $concorrate = sprintf("%.2f",$2/100) unless ($concorrate); + } + } + close IN; + + my @stats=stat($file); + my ($day,$month,$year) = (localtime($stats[9]))[3,4,5]; + $year += 1900; + $month++; + $date = join("-",$year,sprintf("%02s",$month),sprintf("%02s",$day)); + $fileowner = 's'.$stats[4] unless $fileowner; + $total = $pairs*2 if ($total == $pairs); + my $status = 'PASS'; + $status = 'FAIL' if ($maprate < 0.90); + $status = 'FAIL' if ($total < 6000000); + + print OUT join("\n","Sample\t".$sample,"Total_Raw_Count\t".$total, "Read1_Map\t".$pairs,"Read2_Map\t".$read2ct, + "Map_Rate\t".$maprate,"Concordant_Rate\t".$concorrate,"Alignment_Status\t".$status,"Alignment_Date\t".$date, + "File_Owner\t".$fileowner,"Workflow_Version\t".$gittag),"\n"; + system(qq{cat $opt{refdir}\/reference_info.txt >> $prefix\.sequence.stats.txt}); +}