Sleipnir
src/clusthierarchical.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "clusthierarchical.h"
00024 #include "dat.h"
00025 #include "measure.h"
00026 
00027 namespace Sleipnir {
00028 
00052 CHierarchy::CHierarchy( size_t iID, float dSimilarity, const CHierarchy* pLeft,
00053     const CHierarchy* pRight ) {
00054 
00055     assert( ( pLeft && pRight ) || !( pLeft || pRight ) );
00056 
00057     m_iID = iID;
00058     m_dScore = dSimilarity;
00059     m_pLeft = pLeft;
00060     m_pRight = pRight;
00061     m_iWeight = ( m_pLeft && m_pRight ) ? ( m_pLeft->m_iWeight + m_pRight->m_iWeight ) : 1; }
00062 
00063 CHierarchyImpl::~CHierarchyImpl( ) {
00064 
00065     if( m_pLeft )
00066         delete m_pLeft;
00067     if( m_pRight )
00068         delete m_pRight; }
00069 
00070 bool CHierarchyImpl::Save( std::ostream& ostm, size_t iNode,
00071     const std::vector<std::string>* pvecstrGenes ) const {
00072 
00073     if( IsGene( ) )
00074         return false;
00075 
00076     if( iNode == m_iID ) {
00077         ostm << GetSave( ) << '\t' << m_pLeft->GetSave( pvecstrGenes ) << '\t' <<
00078             m_pRight->GetSave( pvecstrGenes ) << '\t' << m_dScore << endl;
00079         return true; }
00080 
00081     return ( ((const CHierarchyImpl*)m_pLeft)->Save( ostm, iNode, pvecstrGenes ) ||
00082         ((const CHierarchyImpl*)m_pRight)->Save( ostm, iNode, pvecstrGenes ) ); }
00083 
00084 string CHierarchyImpl::GetSave( const std::vector<std::string>* pvecstrGenes ) const {
00085     string  strRet;
00086     char    achBuf[ 16 ];
00087 
00088     if( !( pvecstrGenes && IsGene( ) ) ) {
00089         strRet = IsGene( ) ? "GENE" : "NODE";
00090         sprintf_s( achBuf, "%d", m_iID );
00091         strRet += achBuf; }
00092     else
00093         strRet = (*pvecstrGenes)[ m_iID ];
00094 
00095     return strRet; }
00096 
00111 void CHierarchy::GetGenes( vector<size_t>& veciGenes ) const {
00112 
00113     if( IsGene( ) )
00114         veciGenes.push_back( m_iID );
00115     else {
00116         m_pLeft->GetGenes( veciGenes );
00117         m_pRight->GetGenes( veciGenes ); } }
00118 
00138 float CHierarchy::SortChildren( const vector<float>& vecdScores ) {
00139     float               dLeft, dRight, dRet;
00140     const CHierarchy*   pTemp;
00141 
00142     if( IsGene( ) )
00143         return vecdScores[ m_iID ];
00144 
00145     dLeft = ((CHierarchy*)m_pLeft)->SortChildren( vecdScores );
00146     dRight = ((CHierarchy*)m_pRight)->SortChildren( vecdScores );
00147     dRet = ( ( dLeft * m_pLeft->m_iWeight ) + ( dRight * m_pRight->m_iWeight ) ) /
00148         ( m_pRight->m_iWeight + m_pLeft->m_iWeight );
00149     if( dLeft < dRight ) {
00150         pTemp = m_pLeft;
00151         m_pLeft = m_pRight;
00152         m_pRight = pTemp; }
00153 
00154     return dRet; }
00155 
00182 CHierarchy* CClustHierarchical::Cluster( const CDistanceMatrix& MatSimilarities,
00183     const vector<bool>& vecfIncluded ) {
00184 
00185     return CClustHierarchicalImpl::Cluster( MatSimilarities, &vecfIncluded ); }
00186 
00206 CHierarchy* CClustHierarchical::Cluster( const CDistanceMatrix& MatSimilarities ) {
00207 
00208     return CClustHierarchicalImpl::Cluster( MatSimilarities ); }
00209 
00210 // Implementation inspired by TIGR MeV
00211 
00212 CHierarchy* CClustHierarchicalImpl::Cluster( const CDistanceMatrix& Dist, const vector<bool>* pvecfGenes ) {
00213     CDistanceMatrix Sim;
00214     size_t          i, j, k, iAssigned, iParentless, iOne, iTwo;
00215     float           d, dTotal, dMin;
00216     vector<float>   vecdHeight, vecdMax;
00217     vector<size_t>  veciChild1, veciChild2, veciChildren, veciMax, veciOwner;
00218 
00219     if( !Dist.GetSize( ) )
00220         return NULL;
00221 
00222     dMin = FLT_MAX;
00223     if( pvecfGenes ) {
00224         for( i = j = 0; i < pvecfGenes->size( ); ++i )
00225             if( (*pvecfGenes)[ i ] )
00226                 j++;
00227         Sim.Initialize( j );
00228         for( iOne = i = 0; i < Dist.GetSize( ); ++i )
00229             if( (*pvecfGenes)[ i ] ) {
00230                 for( iTwo = 0,j = 0; j < Dist.GetSize( ); ++j )
00231                     if( (*pvecfGenes)[ j ] )
00232                         Sim.Set( iOne, iTwo++, Dist.Get( i, j ) );
00233                 iOne++; } }
00234     else {
00235         Sim.Initialize( Dist.GetSize( ) );
00236         for( i = 0; i < Sim.GetSize( ); ++i )
00237             Sim.Set( i, Dist.Get( i ) ); }
00238     for( i = 0; i < Sim.GetSize( ); ++i )
00239         for( j = ( i + 1 ); j < Sim.GetSize( ); ++j )
00240             if( ( ( d = Sim.Get( i, j ) ) == -FLT_MAX ) || CMeta::IsNaN( d ) ) {
00241                 g_CatSleipnir( ).error( "CClustHierarchicalImpl::Cluster( ) illegal input value at %d, %d", i,
00242                     j );
00243                 return NULL; }
00244     iAssigned = iParentless = Sim.GetSize( );
00245     dTotal = FLT_MAX;
00246 
00247     vecdHeight.resize( Sim.GetSize( ) );
00248     veciChild1.resize( Sim.GetSize( ) );
00249     veciChild2.resize( Sim.GetSize( ) );
00250     veciChildren.resize( Sim.GetSize( ) * 2 );
00251     for( i = 0; i < veciChild1.size( ); ++i ) {
00252         veciChild1[ i ] = veciChild2[ i ] = -1;
00253         veciChildren[ i ] = 1; }
00254 
00255     vecdMax.resize( Sim.GetSize( ) );
00256     veciMax.resize( Sim.GetSize( ) );
00257     vecdMax[ 0 ] = -FLT_MAX;
00258     veciMax[ 0 ] = -1;
00259     for( i = 1; i < Sim.GetSize( ); ++i ) {
00260         size_t  iMin;
00261 
00262         iMin = 0;
00263         dMin = Sim.Get( 0, i );
00264         for( j = 1; j < i; ++j )
00265             if( ( d = Sim.Get( j, i ) ) > dMin ) {
00266                 dMin = d;
00267                 iMin = j; }
00268         vecdMax[ i ] = dMin;
00269         veciMax[ i ] = iMin; }
00270 
00271     veciOwner.resize( Sim.GetSize( ) );
00272     for( i = 0; i < veciOwner.size( ); ++i )
00273         veciOwner[ i ] = i;
00274     while( iParentless > 1 ) {
00275         float   dHeight;
00276 
00277         if( !( iParentless % 500 ) )
00278             g_CatSleipnir( ).notice( "CClustHierarchical::Cluster( ) %d/%d nodes remaining", iParentless,
00279                 Sim.GetSize( ) );
00280         dHeight = -FLT_MAX;
00281         for( k = 0; k < Sim.GetSize( ); ++k )
00282             if( vecdMax[ k ] > dHeight )
00283                 dHeight = vecdMax[ i = k ];
00284         j = veciMax[ i ];
00285 
00286         if( ( vecdHeight[ ( k = iAssigned++ ) - Sim.GetSize( ) ] = dHeight ) < dTotal )
00287             dTotal = dHeight;
00288         iParentless--;
00289 
00290         UpdateDistances( i, j, Sim, veciChildren[ veciOwner[ i ] ], veciChildren[ veciOwner[ j ] ], vecdMax,
00291             veciMax );
00292         AssertParentage( veciChildren, veciChild1, veciChild2, veciOwner[ i ], k );
00293         AssertParentage( veciChildren, veciChild1, veciChild2, veciOwner[ j ], k );
00294         veciOwner[ i ] = k;
00295         veciOwner[ j ] = -1; }
00296 
00297     return ConstructHierarchy( veciChild1, veciChild2, vecdHeight, ( 2 * Sim.GetSize( ) ) - 2 ); }
00298 
00299 void CClustHierarchicalImpl::UpdateDistances( size_t iOne, size_t iTwo, CDistanceMatrix& Sim, size_t iWOne,
00300     size_t iWTwo, vector<float>& vecdMax, vector<size_t>& veciMax ) {
00301     float   d, dOne, dTwo;
00302     size_t  i, j;
00303 
00304     vecdMax[ iOne ] = -FLT_MAX;
00305     veciMax[ iOne ] = -1;
00306     // Update row iOne across
00307     for( i = 0; i < iOne; ++i )
00308         if( !CMeta::IsNaN( dOne = Sim.Get( i, iOne ) ) && !CMeta::IsNaN( dTwo = Sim.Get( i, iTwo ) ) ) {
00309             Sim.Set( i, iOne, d = ( ( iWOne * dOne ) + ( iWTwo * dTwo ) ) / ( iWOne + iWTwo ) );
00310             if( d > vecdMax[ iOne ] ) {
00311                 vecdMax[ iOne ] = d;
00312                 veciMax[ iOne ] = i; } }
00313     // Update row iOne down
00314     for( i = ( iOne + 1 ); i < Sim.GetSize( ); ++i )
00315         if( !CMeta::IsNaN( dOne = Sim.Get( iOne, i ) ) && !CMeta::IsNaN( dTwo = Sim.Get( iTwo, i ) ) ) {
00316             Sim.Set( iOne, i, d = ( ( iWOne * dOne ) + ( iWTwo * dTwo ) ) / ( iWOne + iWTwo ) );
00317             if( veciMax[ i ] == iOne ) {
00318                 vecdMax[ i ] = -FLT_MAX;
00319                 veciMax[ i ] = 0;
00320                 for( j = 0; j < i; ++j )
00321                     if( !CMeta::IsNaN( d = Sim.Get( j, i ) ) && ( d > vecdMax[ i ] ) ) {
00322                         vecdMax[ i ] = d;
00323                         veciMax[ i ] = j; } }
00324             else if( d > vecdMax[ i ] ) {
00325                 vecdMax[ i ] = d;
00326                 veciMax[ i ] = iOne; } }
00327     // Delete row iTwo across
00328     for( i = 0; i < iTwo; ++i )
00329         Sim.Set( i, iTwo, CMeta::GetNaN( ) );
00330     vecdMax[ iTwo ] = -FLT_MAX;
00331     veciMax[ iTwo ] = -1;
00332     // Delete row iTwo down
00333     for( i = ( iTwo + 1 ); i < Sim.GetSize( ); ++i ) {
00334         Sim.Set( iTwo, i, CMeta::GetNaN( ) );
00335         if( veciMax[ i ] == iTwo ) {
00336             vecdMax[ i ] = -FLT_MAX;
00337             veciMax[ i ] = 0;
00338             for( j = 0; j < i; ++j )
00339                 if( !CMeta::IsNaN( d = Sim.Get( j, i ) ) && ( d > vecdMax[ i ] ) ) {
00340                     vecdMax[ i ] = d;
00341                     veciMax[ i ] = j; } } } }
00342 
00343 CHierarchy* CClustHierarchicalImpl::ConstructHierarchy( const vector<size_t>& veciChild1,
00344     const vector<size_t>& veciChild2, const vector<float>& vecdHeight, size_t iID ) {
00345     bool    fNode;
00346 
00347     if( fNode = ( iID >= veciChild1.size( ) ) )
00348         iID -= veciChild1.size( );
00349     return new CHierarchy( iID, vecdHeight[ iID ],
00350             fNode ? ConstructHierarchy( veciChild1, veciChild2, vecdHeight, veciChild1[ iID ] ) : NULL,
00351             fNode ? ConstructHierarchy( veciChild1, veciChild2, vecdHeight, veciChild2[ iID ] ) : NULL ); }
00352 
00353 }