FMAP_assembly_centrifuge.pl, FMAP_assembly_heatmap.pl

a93a59fc · jiwoongbio · 0fcc75fa · a93a59fc · a93a59fc · a93a59fc
Commit a93a59fc authored 7 years ago by jiwoongbio
--- a/FMAP_assembly_centrifuge.pl
+++ b/FMAP_assembly_centrifuge.pl
+# Author: Jiwoong Kim (jiwoongbio@gmail.com)
+use strict;
+use warnings;
+local $SIG{__WARN__} = sub { die "ERROR in $0: ", $_[0] };
+
+use Getopt::Long qw(:config no_ignore_case);
+
+GetOptions('h' => \(my $help = ''),
+	'p=i' => \(my $threads = 1),
+);
+if($help || scalar(@ARGV) == 0) {
+	die <<EOF;
+
+Usage:   perl FMAP_assembly_taxon.pl [options] FMAP_assembly.region.txt assembly.fasta centrifuge.index
+
+Options: -h       display this help message
+         -p INT   number of threads [$threads]
+
+EOF
+}
+my ($regionFile, $assemblyFastaFile, $centrifugeIndex) = @ARGV;
+my %assemblySequenceHash = ();
+{
+	open(my $reader, $assemblyFastaFile);
+	my $contig = '';
+	while(my $line = <$reader>) {
+		chomp($line);
+		if($line =~ /^>(\S*)/) {
+			$contig = $1;
+		} else {
+			$assemblySequenceHash{$contig} .= $line;
+		}
+	}
+	close($reader);
+}
+(my $regionFastaFile = $regionFile) =~ s/(\.txt)?$/\.fasta/;
+{
+	open(my $reader, $regionFile);
+	open(my $writer, "> $regionFastaFile");
+	chomp(my $line = <$reader>);
+	my @columnList = split(/\t/, $line);
+	while(my $line = <$reader>) {
+		chomp($line);
+		my %tokenHash = ();
+		@tokenHash{@columnList} = split(/\t/, $line);
+		my $region = join('|', my ($contig, $start, $end, $strand) = @tokenHash{'contig', 'start', 'end', 'strand'});
+		my $sequence = substr($assemblySequenceHash{$contig}, $start - 1, $end - ($start - 1));
+		$sequence = uc($sequence);
+		($sequence = reverse($sequence)) =~ tr/ACGT/TGCA/ if($strand eq '-');
+		print $writer ">$region\n";
+		print $writer "$sequence\n";
+	}
+	close($reader);
+	close($writer);
+}
+my %regionTaxonReferenceHash = ();
+{
+	open(my $reader, "centrifuge -f -p $threads -x $centrifugeIndex -U $regionFastaFile --report-file /dev/null |");
+	chomp(my $line = <$reader>);
+	my @columnList = split(/\t/, $line);
+	while(my $line = <$reader>) {
+		my %tokenHash = ();
+		@tokenHash{@columnList} = split(/\t/, $line);
+		$regionTaxonReferenceHash{$tokenHash{'readID'}} = [@tokenHash{'taxID', 'seqID'}];
+	}
+	close($reader);
+}
+{
+	open(my $reader, $regionFile);
+	chomp(my $line = <$reader>);
+	my @columnList = split(/\t/, $line);
+	print join("\t", @columnList, 'taxon', 'reference'), "\n";
+	while(my $line = <$reader>) {
+		chomp($line);
+		my %tokenHash = ();
+		@tokenHash{@columnList} = split(/\t/, $line);
+		my $region = join('|', @tokenHash{'contig', 'start', 'end', 'strand'});
+		@tokenHash{'taxon', 'reference'} = defined($_ = $regionTaxonReferenceHash{$region}) ? @$_ : ('', '');
+		print join("\t", @tokenHash{@columnList, 'taxon', 'reference'}), "\n";
+	}
+	close($reader);
+}
--- a/FMAP_assembly_heatmap.pl
+++ b/FMAP_assembly_heatmap.pl
+# Author: Jiwoong Kim (jiwoongbio@gmail.com)
+use strict;
+use warnings;
+local $SIG{__WARN__} = sub { die "ERROR in $0: ", $_[0] };
+
+use Cwd 'abs_path';
+use Getopt::Long qw(:config no_ignore_case);
+use Statistics::R;
+use Bio::DB::Taxonomy;
+
+(my $fmapPath = abs_path($0)) =~ s/\/[^\/]*$//;
+my $database = loadDefaultDatabase();
+my $databasePrefix = "$fmapPath/FMAP_data/$database";
+my $dataPath = "$fmapPath/FMAP_data";
+system("mkdir -p $dataPath");
+
+if(not -r "$dataPath/nodes.dmp" or not -r "$dataPath/names.dmp") {
+	my $URL = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz";
+	my $file = "$dataPath/taxdump.tar.gz";
+	system("wget --no-verbose -O $file $URL") if(not -r $file);
+	die "ERROR in $0: '$file' has zero size.\n" if(-z $file);
+	system("cd $dataPath; tar -zxf taxdump.tar.gz nodes.dmp");
+	system("cd $dataPath; tar -zxf taxdump.tar.gz names.dmp");
+	system("rm -f $dataPath/$_") foreach('nodes', 'parents', 'names2id', 'id2names');
+}
+my $db = Bio::DB::Taxonomy->new(-source => 'flatfile', -directory => $dataPath, -nodesfile => "$dataPath/nodes.dmp", -namesfile => "$dataPath/names.dmp");
+
+GetOptions('h' => \(my $help = ''),
+	'c=s' => \(my $comparisonFile = ''),
+	'f' => \(my $fontSize = 15),
+	'w' => \(my $tdWidth = 60),
+	'orthology2definition=s' => \(my $orthologyDefinitionFile = ''),
+	'd=s' => \$databasePrefix);
+
+if($help || scalar(@ARGV) == 0) {
+	die <<EOF;
+
+Usage:   perl FMAP_assembly_heatmap.pl [options] [name=]FMAP_assembly.abundance.txt [...] > FMAP_assembly_heatmap.html
+
+Options: -h       display this help message
+         -c FILE  comparison output file including orthology and filter columns
+         -f INT   HTML font size
+         -w INT   HTML table cell width
+
+EOF
+}
+
+my %orthologyFilterHash = ();
+if($comparisonFile ne '') {
+	die "ERROR in $0: '$comparisonFile' is not readable.\n" unless(-r $comparisonFile);
+	open(my $reader, $comparisonFile);
+	chomp(my $line = <$reader>);
+	my @columnList = split(/\t/, $line);
+	while(my $line = <$reader>) {
+		chomp($line);
+		my %tokenHash = ();
+		@tokenHash{@columnList} = split(/\t/, $line);
+		my ($orthology, $filter) = @tokenHash{'orthology', 'filter'};
+		$orthologyFilterHash{$orthology} = $filter;
+	}
+	close($reader);
+}
+
+if($databasePrefix eq "$fmapPath/FMAP_data/$database") {
+	if($database =~ /^orthology_uniref/) {
+		$orthologyDefinitionFile = "$fmapPath/FMAP_data/KEGG_orthology.txt" if($orthologyDefinitionFile eq '');
+	} elsif($database =~ /^ARDB/ || $database =~ /^betalactamases/) {
+		$orthologyDefinitionFile = "$databasePrefix.definition.txt" if($orthologyDefinitionFile eq '');
+	}
+}
+
+my %orthologyDefinitionHash = ();
+if($orthologyDefinitionFile ne '') {
+	die "ERROR in $0: '$orthologyDefinitionFile' is not readable.\n" unless(-r $orthologyDefinitionFile);
+	open(my $reader, $orthologyDefinitionFile);
+	while(my $line = <$reader>) {
+		chomp($line);
+		my ($orthology, $definition) = split(/\t/, $line);
+		$orthologyDefinitionHash{$orthology} = $definition;
+	}
+	close($reader);
+}
+
+my @sampleFileList = @ARGV;
+my @sampleList = ();
+my %orthologySampleAbundanceHash = ();
+my %orthologySampleTaxonAbundanceHash = ();
+foreach my $sampleFile (@sampleFileList) {
+	($sampleFile, my $sample) = reverse(split(/=/, $sampleFile, 2));
+	$sample = $sampleFile unless(defined($sample));
+	push(@sampleList, $sample);
+	open(my $reader, $sampleFile);
+	chomp(my $line = <$reader>);
+	my @columnList = split(/\t/, $line);
+	while(my $line = <$reader>) {
+		chomp($line);
+		my %tokenHash = ();
+		@tokenHash{@columnList} = split(/\t/, $line);
+		my $orthology = $tokenHash{'orthology'};
+		my $abundance = $tokenHash{'meanDepth/genome'};
+		next if(defined($_ = $orthologyFilterHash{$orthology}) && $_ eq 'fail');
+		$orthologySampleAbundanceHash{$orthology}->{$sample} += $abundance;
+		$orthologySampleTaxonAbundanceHash{$orthology}->{$sample}->{$_} += $abundance if(defined($_ = $tokenHash{'taxon'}));
+		$orthologyDefinitionHash{$orthology} = $_ if(defined($_ = $tokenHash{'definition'}));
+	}
+	close($reader);
+}
+
+my $R = Statistics::R->new();
+$R->run('table <- data.frame()');
+foreach my $orthology (keys %orthologySampleAbundanceHash) {
+	my $abundances = join(',', map {defined($_) ? $_ : 0} map {$orthologySampleAbundanceHash{$orthology}->{$_}} @sampleList);
+	$R->run("table <- rbind(table, $orthology = c($abundances))");
+}
+$R->run('hc <- hclust(dist(table))');
+my $orthologyList = $R->get('hc$labels[hc$order]');
+$R->stop();
+
+#print <<EOF;
+#Content-Type: text/html; charset=utf-8
+#
+#EOF
+
+print <<EOF;
+<!DOCTYPE html>
+<html>
+<head>
+<style>
+* {
+	font-size: ${fontSize}px;
+	line-height: ${fontSize}px;
+}
+table.heatmap {
+	border-spacing: 1px;
+}
+table.heatmap td {
+	padding: 5px;
+	white-space: nowrap;
+	border: 2px solid #FFFFFF;
+}
+table.heatmap td div.popup {
+	text-align: left;
+	display: none;
+	position: absolute;
+	background-color: #FFFFFF;
+	padding: 5px;
+	border: 2px solid #000000;
+}
+table.heatmap td:hover div.popup {
+	display: block;
+}
+</style>
+</head>
+<body>
+<form method="post">
+EOF
+
+print "<p><table class=\"heatmap\">\n";
+{
+	my @tdList = ("<td></td>");
+	foreach my $sample (@sampleList) {
+		push(@tdList, "<td style=\"text-align: center; width: ${tdWidth}px;\">$sample</td>");
+	}
+	print join('', "<tr>", @tdList, "</tr>"), "\n";
+}
+foreach my $orthology (@$orthologyList) {
+	my @tdList = ();
+	if(defined($_ = $orthologyDefinitionHash{$orthology}) && $_ ne '') {
+		(my $description = $_) =~ s/;/<br>/g;
+		push(@tdList, "<td>$orthology<div class=\"popup\">$description</div></td>");
+	} else {
+		push(@tdList, "<td>$orthology</td>");
+	}
+	foreach my $sample (@sampleList) {
+		my $abundance = defined($_ = $orthologySampleAbundanceHash{$orthology}->{$sample}) ? $_ : 0;
+		my $color = '#FFFFFF';
+		if($abundance > 1) {
+			$color = color(1, 1 / $abundance, 0);
+		} else {
+			$color = color($abundance, 1, 0);
+		}
+		if(defined($_ = $orthologySampleTaxonAbundanceHash{$orthology}->{$sample})) {
+			my %taxonAbundanceHash = %$_;
+			my @taxonAbundanceList = sort {$b->[1] <=> $a->[1]} map {[$_ =~ /^[0-9]+$/ ? $db->get_taxon(-taxonid => $_)->scientific_name : $_, $taxonAbundanceHash{$_}]} keys %taxonAbundanceHash;
+			my $taxonAbundances = join('<br>', map {sprintf('%s %.3f', @$_)} @taxonAbundanceList);
+			my @taxonFontSizeList = map {[substr($_->[0], 0, 4), $fontSize * ($_->[1] / $abundance)]} @taxonAbundanceList;
+			my $taxons = join('', map {sprintf('<div style="font-size: %fpx; line-height: %fpx;">%s</div>', $_->[1], $_->[1], $_->[0])} @taxonFontSizeList);
+			push(@tdList, "<td style=\"text-align: center; background-color: $color;\">$taxons<div class=\"popup\">$taxonAbundances</div></td>");
+		} else {
+			push(@tdList, "<td style=\"text-align: center; background-color: $color;\"></td>");
+		}
+	}
+	print join('', "<tr>", @tdList, "</tr>"), "\n";
+}
+print "</table></p>\n";
+
+print "<p><b>Color gradation for relative depth</b></p>\n";
+print "<p><table class=\"heatmap\">\n";
+for(my $depth = 10; $depth > 0; $depth -= 2) {
+	my $color = color(1, 1 / $depth, 0);
+	print "<tr><td style=\"text-align: center; width: ${tdWidth}px; background-color: $color;\">$depth</td></tr>\n";
+}
+for(my $depth = 1; $depth >= 0; $depth = sprintf('%.1f', $depth - 0.2)) {
+	my $color = color($depth, 1, 0);
+	print "<tr><td style=\"text-align: center; width: ${tdWidth}px; background-color: $color;\">$depth</td></tr>\n";
+}
+print "</table></p>\n";
+
+print <<EOF;
+</form>
+</body>
+</html>
+EOF
+
+sub loadDefaultDatabase {
+	open(my $reader, "$fmapPath/FMAP_data/database");
+	chomp(my $line = <$reader>);
+	close($reader);
+	return $1 if($line =~ /^([A-Za-z0-9_.]+)/);
+}
+
+sub color {
+	my ($red, $green, $blue) = @_;
+	(my $color = sprintf('#%02x%02x%02x', int($red * 255), int($green * 255), int($blue * 255))) =~ tr/a-z/A-Z/;
+	return $color;
+}
--- a/README.md
+++ b/README.md
@@ -119,6 +119,33 @@ Options: -h       display this help message
         -P STR   contig prefix used for abundance estimation
 ```

+* **FMAP_assembly_centrifuge.pl**
+  * Require [Centrifuge](https://ccb.jhu.edu/software/centrifuge/).
+  * Input
+    1. FMAP_assembly.region.txt (ORF regions mapping to KEGG orthologies generated by FMAP_assembly)
+    2. De novo assembled sequences in FASTA format
+    3. Centrifuge index filename prefix (minus trailing .X.cf)
+  * Output: FMAP_assembly.region.taxon.txt (FMAP_assembly.region.txt including a column of [NCBI taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy) IDs (integer))
+```
+Usage:   perl FMAP_assembly_taxon.pl [options] FMAP_assembly.region.txt assembly.fasta centrifuge.index
+
+Options: -h       display this help message
+         -p INT   number of threads [1]
+```
+
+* **FMAP_assembly_heatmap.pl**
+  * Require [Bio::DB::Taxonomy](http://search.cpan.org/dist/BioPerl/Bio/DB/Taxonomy.pm).
+  * Input: FMAP_assembly.abundance.txt (abundances generated by FMAP_assembly)
+  * Output: HTML format of abundance heatmap table
+```
+Usage:   perl FMAP_assembly_heatmap.pl [options] [name=]FMAP_assembly.abundance.txt [...] > FMAP_assembly_heatmap.html
+
+Options: -h       display this help message
+         -c FILE  comparison output file including orthology and filter columns
+         -f INT   HTML font size
+         -w INT   HTML table cell width
+```
+
 * **FMAP_assembly_operon.pl**
  * Input: FMAP_assembly.region.txt (ORF regions mapping to KEGG orthologies generated by FMAP_assembly)
  * Output: FMAP_assembly_operon.txt ([ODB (v3)](http://operondb.jp) known operons consisting of orthologies located together on an assembled contig/scaffold/transcript)