Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "clusthierarchical.h" 00024 #include "dat.h" 00025 #include "measure.h" 00026 00027 namespace Sleipnir { 00028 00052 CHierarchy::CHierarchy( size_t iID, float dSimilarity, const CHierarchy* pLeft, 00053 const CHierarchy* pRight ) { 00054 00055 assert( ( pLeft && pRight ) || !( pLeft || pRight ) ); 00056 00057 m_iID = iID; 00058 m_dScore = dSimilarity; 00059 m_pLeft = pLeft; 00060 m_pRight = pRight; 00061 m_iWeight = ( m_pLeft && m_pRight ) ? ( m_pLeft->m_iWeight + m_pRight->m_iWeight ) : 1; } 00062 00063 CHierarchyImpl::~CHierarchyImpl( ) { 00064 00065 if( m_pLeft ) 00066 delete m_pLeft; 00067 if( m_pRight ) 00068 delete m_pRight; } 00069 00070 bool CHierarchyImpl::Save( std::ostream& ostm, size_t iNode, 00071 const std::vector<std::string>* pvecstrGenes ) const { 00072 00073 if( IsGene( ) ) 00074 return false; 00075 00076 if( iNode == m_iID ) { 00077 ostm << GetSave( ) << '\t' << m_pLeft->GetSave( pvecstrGenes ) << '\t' << 00078 m_pRight->GetSave( pvecstrGenes ) << '\t' << m_dScore << endl; 00079 return true; } 00080 00081 return ( ((const CHierarchyImpl*)m_pLeft)->Save( ostm, iNode, pvecstrGenes ) || 00082 ((const CHierarchyImpl*)m_pRight)->Save( ostm, iNode, pvecstrGenes ) ); } 00083 00084 string CHierarchyImpl::GetSave( const std::vector<std::string>* pvecstrGenes ) const { 00085 string strRet; 00086 char achBuf[ 16 ]; 00087 00088 if( !( pvecstrGenes && IsGene( ) ) ) { 00089 strRet = IsGene( ) ? "GENE" : "NODE"; 00090 sprintf_s( achBuf, "%d", m_iID ); 00091 strRet += achBuf; } 00092 else 00093 strRet = (*pvecstrGenes)[ m_iID ]; 00094 00095 return strRet; } 00096 00111 void CHierarchy::GetGenes( vector<size_t>& veciGenes ) const { 00112 00113 if( IsGene( ) ) 00114 veciGenes.push_back( m_iID ); 00115 else { 00116 m_pLeft->GetGenes( veciGenes ); 00117 m_pRight->GetGenes( veciGenes ); } } 00118 00138 float CHierarchy::SortChildren( const vector<float>& vecdScores ) { 00139 float dLeft, dRight, dRet; 00140 const CHierarchy* pTemp; 00141 00142 if( IsGene( ) ) 00143 return vecdScores[ m_iID ]; 00144 00145 dLeft = ((CHierarchy*)m_pLeft)->SortChildren( vecdScores ); 00146 dRight = ((CHierarchy*)m_pRight)->SortChildren( vecdScores ); 00147 dRet = ( ( dLeft * m_pLeft->m_iWeight ) + ( dRight * m_pRight->m_iWeight ) ) / 00148 ( m_pRight->m_iWeight + m_pLeft->m_iWeight ); 00149 if( dLeft < dRight ) { 00150 pTemp = m_pLeft; 00151 m_pLeft = m_pRight; 00152 m_pRight = pTemp; } 00153 00154 return dRet; } 00155 00182 CHierarchy* CClustHierarchical::Cluster( const CDistanceMatrix& MatSimilarities, 00183 const vector<bool>& vecfIncluded ) { 00184 00185 return CClustHierarchicalImpl::Cluster( MatSimilarities, &vecfIncluded ); } 00186 00206 CHierarchy* CClustHierarchical::Cluster( const CDistanceMatrix& MatSimilarities ) { 00207 00208 return CClustHierarchicalImpl::Cluster( MatSimilarities ); } 00209 00210 // Implementation inspired by TIGR MeV 00211 00212 CHierarchy* CClustHierarchicalImpl::Cluster( const CDistanceMatrix& Dist, const vector<bool>* pvecfGenes ) { 00213 CDistanceMatrix Sim; 00214 size_t i, j, k, iAssigned, iParentless, iOne, iTwo; 00215 float d, dTotal, dMin; 00216 vector<float> vecdHeight, vecdMax; 00217 vector<size_t> veciChild1, veciChild2, veciChildren, veciMax, veciOwner; 00218 00219 if( !Dist.GetSize( ) ) 00220 return NULL; 00221 00222 dMin = FLT_MAX; 00223 if( pvecfGenes ) { 00224 for( i = j = 0; i < pvecfGenes->size( ); ++i ) 00225 if( (*pvecfGenes)[ i ] ) 00226 j++; 00227 Sim.Initialize( j ); 00228 for( iOne = i = 0; i < Dist.GetSize( ); ++i ) 00229 if( (*pvecfGenes)[ i ] ) { 00230 for( iTwo = 0,j = 0; j < Dist.GetSize( ); ++j ) 00231 if( (*pvecfGenes)[ j ] ) 00232 Sim.Set( iOne, iTwo++, Dist.Get( i, j ) ); 00233 iOne++; } } 00234 else { 00235 Sim.Initialize( Dist.GetSize( ) ); 00236 for( i = 0; i < Sim.GetSize( ); ++i ) 00237 Sim.Set( i, Dist.Get( i ) ); } 00238 for( i = 0; i < Sim.GetSize( ); ++i ) 00239 for( j = ( i + 1 ); j < Sim.GetSize( ); ++j ) 00240 if( ( ( d = Sim.Get( i, j ) ) == -FLT_MAX ) || CMeta::IsNaN( d ) ) { 00241 g_CatSleipnir( ).error( "CClustHierarchicalImpl::Cluster( ) illegal input value at %d, %d", i, 00242 j ); 00243 return NULL; } 00244 iAssigned = iParentless = Sim.GetSize( ); 00245 dTotal = FLT_MAX; 00246 00247 vecdHeight.resize( Sim.GetSize( ) ); 00248 veciChild1.resize( Sim.GetSize( ) ); 00249 veciChild2.resize( Sim.GetSize( ) ); 00250 veciChildren.resize( Sim.GetSize( ) * 2 ); 00251 for( i = 0; i < veciChild1.size( ); ++i ) { 00252 veciChild1[ i ] = veciChild2[ i ] = -1; 00253 veciChildren[ i ] = 1; } 00254 00255 vecdMax.resize( Sim.GetSize( ) ); 00256 veciMax.resize( Sim.GetSize( ) ); 00257 vecdMax[ 0 ] = -FLT_MAX; 00258 veciMax[ 0 ] = -1; 00259 for( i = 1; i < Sim.GetSize( ); ++i ) { 00260 size_t iMin; 00261 00262 iMin = 0; 00263 dMin = Sim.Get( 0, i ); 00264 for( j = 1; j < i; ++j ) 00265 if( ( d = Sim.Get( j, i ) ) > dMin ) { 00266 dMin = d; 00267 iMin = j; } 00268 vecdMax[ i ] = dMin; 00269 veciMax[ i ] = iMin; } 00270 00271 veciOwner.resize( Sim.GetSize( ) ); 00272 for( i = 0; i < veciOwner.size( ); ++i ) 00273 veciOwner[ i ] = i; 00274 while( iParentless > 1 ) { 00275 float dHeight; 00276 00277 if( !( iParentless % 500 ) ) 00278 g_CatSleipnir( ).notice( "CClustHierarchical::Cluster( ) %d/%d nodes remaining", iParentless, 00279 Sim.GetSize( ) ); 00280 dHeight = -FLT_MAX; 00281 for( k = 0; k < Sim.GetSize( ); ++k ) 00282 if( vecdMax[ k ] > dHeight ) 00283 dHeight = vecdMax[ i = k ]; 00284 j = veciMax[ i ]; 00285 00286 if( ( vecdHeight[ ( k = iAssigned++ ) - Sim.GetSize( ) ] = dHeight ) < dTotal ) 00287 dTotal = dHeight; 00288 iParentless--; 00289 00290 UpdateDistances( i, j, Sim, veciChildren[ veciOwner[ i ] ], veciChildren[ veciOwner[ j ] ], vecdMax, 00291 veciMax ); 00292 AssertParentage( veciChildren, veciChild1, veciChild2, veciOwner[ i ], k ); 00293 AssertParentage( veciChildren, veciChild1, veciChild2, veciOwner[ j ], k ); 00294 veciOwner[ i ] = k; 00295 veciOwner[ j ] = -1; } 00296 00297 return ConstructHierarchy( veciChild1, veciChild2, vecdHeight, ( 2 * Sim.GetSize( ) ) - 2 ); } 00298 00299 void CClustHierarchicalImpl::UpdateDistances( size_t iOne, size_t iTwo, CDistanceMatrix& Sim, size_t iWOne, 00300 size_t iWTwo, vector<float>& vecdMax, vector<size_t>& veciMax ) { 00301 float d, dOne, dTwo; 00302 size_t i, j; 00303 00304 vecdMax[ iOne ] = -FLT_MAX; 00305 veciMax[ iOne ] = -1; 00306 // Update row iOne across 00307 for( i = 0; i < iOne; ++i ) 00308 if( !CMeta::IsNaN( dOne = Sim.Get( i, iOne ) ) && !CMeta::IsNaN( dTwo = Sim.Get( i, iTwo ) ) ) { 00309 Sim.Set( i, iOne, d = ( ( iWOne * dOne ) + ( iWTwo * dTwo ) ) / ( iWOne + iWTwo ) ); 00310 if( d > vecdMax[ iOne ] ) { 00311 vecdMax[ iOne ] = d; 00312 veciMax[ iOne ] = i; } } 00313 // Update row iOne down 00314 for( i = ( iOne + 1 ); i < Sim.GetSize( ); ++i ) 00315 if( !CMeta::IsNaN( dOne = Sim.Get( iOne, i ) ) && !CMeta::IsNaN( dTwo = Sim.Get( iTwo, i ) ) ) { 00316 Sim.Set( iOne, i, d = ( ( iWOne * dOne ) + ( iWTwo * dTwo ) ) / ( iWOne + iWTwo ) ); 00317 if( veciMax[ i ] == iOne ) { 00318 vecdMax[ i ] = -FLT_MAX; 00319 veciMax[ i ] = 0; 00320 for( j = 0; j < i; ++j ) 00321 if( !CMeta::IsNaN( d = Sim.Get( j, i ) ) && ( d > vecdMax[ i ] ) ) { 00322 vecdMax[ i ] = d; 00323 veciMax[ i ] = j; } } 00324 else if( d > vecdMax[ i ] ) { 00325 vecdMax[ i ] = d; 00326 veciMax[ i ] = iOne; } } 00327 // Delete row iTwo across 00328 for( i = 0; i < iTwo; ++i ) 00329 Sim.Set( i, iTwo, CMeta::GetNaN( ) ); 00330 vecdMax[ iTwo ] = -FLT_MAX; 00331 veciMax[ iTwo ] = -1; 00332 // Delete row iTwo down 00333 for( i = ( iTwo + 1 ); i < Sim.GetSize( ); ++i ) { 00334 Sim.Set( iTwo, i, CMeta::GetNaN( ) ); 00335 if( veciMax[ i ] == iTwo ) { 00336 vecdMax[ i ] = -FLT_MAX; 00337 veciMax[ i ] = 0; 00338 for( j = 0; j < i; ++j ) 00339 if( !CMeta::IsNaN( d = Sim.Get( j, i ) ) && ( d > vecdMax[ i ] ) ) { 00340 vecdMax[ i ] = d; 00341 veciMax[ i ] = j; } } } } 00342 00343 CHierarchy* CClustHierarchicalImpl::ConstructHierarchy( const vector<size_t>& veciChild1, 00344 const vector<size_t>& veciChild2, const vector<float>& vecdHeight, size_t iID ) { 00345 bool fNode; 00346 00347 if( fNode = ( iID >= veciChild1.size( ) ) ) 00348 iID -= veciChild1.size( ); 00349 return new CHierarchy( iID, vecdHeight[ iID ], 00350 fNode ? ConstructHierarchy( veciChild1, veciChild2, vecdHeight, veciChild1[ iID ] ) : NULL, 00351 fNode ? ConstructHierarchy( veciChild1, veciChild2, vecdHeight, veciChild2[ iID ] ) : NULL ); } 00352 00353 }