#!/usr/local/bin/perl -w # ./calc_node_values probe_length # This program calculates the function value at each tree node. use strict; use Storable; my ($i); my ($treeFile, $treeString, @treeArray, $char); my ($ifQuoted, @stringArray, $leafString); my (@stack, $element); my ($popTimes); my ($nodeNumber); my ($leafRef1, $leafRef2, $leafNumber); my ($rootRef, @rootRef, $nodeRef, $currentNode); # %validSeqName is a set of the names of the prokaryotes whose sequences are valid. The keys are the names and # the corresponding values are undef. %validSeqNameAndSeq is the hash, whose keys are the names and the values # are the sequneces. my ($validSeqFile, %validSeqName, %validSeqNameAndSeq); my ($probeLength, %probes); my ($binaryTreeFile); # used to store the tree structure in binary format externally. my ($binaryProbeHashFile); # used to store the hash structure of the probes. my ($binaryProbeHashCalcFile); if (@ARGV != 1) { print "Usage: ./calc_node_values probe_length\n"; exit; } ($probeLength) = @ARGV; $binaryTreeFile = 'SSU_Prok.treeMarkedTrimmed.bin'; $binaryProbeHashFile = "hashForProbeOfLength$probeLength.bin"; $binaryProbeHashCalcFile = "hashForProbeOfLength${probeLength}Calc.bin"; if (-e $binaryTreeFile) # If the tree data file exists, just retrieve the tree from it. { print scalar localtime(), "\nRetrieving the tree from $binaryTreeFile ...\n"; $nodeRef = retrieve ($binaryTreeFile); print "Done.\n"; } else { print "\nFile $binaryTreeFile does not exist. Exit.\nUse \"tree_parse\" to generate the file $binaryTreeFile befor running this program.\n"; exit; } if (-e $binaryProbeHashFile) { print "\nRetrieving the hash from $binaryProbeHashFile ... \n"; %probes = %{ retrieve ($binaryProbeHashFile) }; print "Done.\n"; } else { print "\nFile $binaryProbeHashFile does not exist. Exit.\nUse \"probes_hash_table_generator $probeLength\" to generate the file $binaryProbeHashFile befor running this program.\n"; exit; } $rootRef = $nodeRef; # set the $rootRef. calTreeNodeValuesForEachProbe(); # # # sub calTreeNodeValuesForEachProbe { my ($probe, $hashRef); my (%tempHash); # record all the calculation results from calTreeNodeValues(). Keys: nodeNumber and values: nodeValues my ($tempHashRef) = (\%tempHash); my ($i, $j, $number); $j = 0; while (($probe, $hashRef) = each (%probes)) { print $j++, "\n"; #if ($hashRef->{matchingTimes} > 1) # only consider multiple-matching probes. if (scalar keys %{ $hashRef->{matchedOrg} } > 1) # use the number of the matched org instead. { %tempHash = (); # reset it. $rootRef = $nodeRef; # reset the $rootRef. markMatchedSeq ($rootRef, $hashRef); $rootRef = $nodeRef; # reset the $rootRef. calTreeNodeValues ($rootRef, $tempHashRef, scalar keys %{ $hashRef->{matchedOrg} }); $i = 0; foreach $number (sort { $tempHash{$b} <=> $tempHash{$a} } keys %tempHash) # sort the keys into descending order of the values. { $hashRef->{treeNodeValues}{$number} = $tempHash{$number}; $i++; last if ($i >= 5); # only take the 5 highest values. } } } print "\nStoring the calculated hash table to $binaryProbeHashCalcFile ...\n"; store (\%probes, $binaryProbeHashCalcFile); } # # # sub calTreeNodeValues { my ($tree, $tempHashRef, $numOfMatchedSeq) = @_; my ($nodeValue); return unless $tree; calTreeNodeValues ($tree->{left}, $tempHashRef, $numOfMatchedSeq); calTreeNodeValues ($tree->{right}, $tempHashRef, $numOfMatchedSeq); if (exists $tree->{shortName}) # this is a leaf node. { $tree->{isMatched} = 0; # undiscriminatory reset. } else # this is the branch node. { if ($tree->{numValidLeaves} != 0 and $nodeRef->{numMatchedLeaves} != 0) # $nodeRef is the root. { # Nt (total valid seqs), Nm (total matched seqs), Ngt (total valid seqs in the group), Ngm (total matched seqs in the group) # F2 = Pin * (1 - Pout) # = Ngm/Ngt * (1 - (Nm - Ngm)/Nm) # = (Ngm * Ngm) / (Ngt * Nm) # F2: Using the total number of matched seqs as Nm. Multi-matching in a seq is not considered. $nodeValue = ($tree->{numMatchedLeaves} * $tree->{numMatchedLeaves}) / ($tree->{numValidLeaves} * $numOfMatchedSeq); # F2: Use $nodeRef->{numMatchedLeaves} as Nm. It is the total number of matching times of this probe. # $nodeValue = ($tree->{numMatchedLeaves} * $tree->{numMatchedLeaves}) / ($tree->{numValidLeaves} * $nodeRef->{numMatchedLeaves}); # F1: $nodeValue = $tree->{numMatchedLeaves}/$tree->{numValidLeaves} * (1 - ($nodeRef->{numMatchedLeaves}-$tree->{numMatchedLeaves})/$nodeRef->{numMatchedLeaves}); $tempHashRef->{$tree->{nodeNumber}} = $nodeValue; } $tree->{numMatchedLeaves} = 0; # reset this value for the next probe. } } # # # sub markMatchedSeq { my ($tree, $hashRef) = @_; my $ref; return unless $tree; markMatchedSeq ($tree->{left}, $hashRef); markMatchedSeq ($tree->{right}, $hashRef); if (exists $tree->{shortName}) # this is a leaf node. { if (exists $hashRef->{matchedOrg}{ $tree->{shortName} }) # The 16S of this organism is matched by this probe. { $tree->{isMatched} = 1; # set it to true. # propagate this to all the parent nodes by increment the matched leaves number. for ($ref = $tree->{parentRef}; $ref; $ref = $ref->{parentRef}) { $ref->{numMatchedLeaves}++; } } } }