Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "clustkmeans.h" 00024 #include "measure.h" 00025 #include "meta.h" 00026 00027 namespace Sleipnir { 00028 00067 bool CClustKMeans::Cluster( const CDataMatrix& MatData, const IMeasure* pMeasure, size_t iK, 00068 vector<uint16_t>& vecsClusters, const CDataMatrix* pMatWeights ) { 00069 size_t i, j, iIteration; 00070 float d; 00071 CDataMatrix MatMeans; 00072 bool fDone; 00073 vector<size_t> veciCounts; 00074 uint16_t sMax; 00075 00076 if( !pMeasure || ( MatData.GetRows( ) < iK ) ) 00077 return false; 00078 00079 MatMeans.Initialize( iK, MatData.GetColumns( ) ); 00080 for( i = 0; i < MatMeans.GetRows( ); ++i ) 00081 Randomize( MatMeans, i, MatData ); 00082 00083 vecsClusters.resize( MatData.GetRows( ) ); 00084 fill( vecsClusters.begin( ), vecsClusters.end( ), -1 ); 00085 veciCounts.resize( iK ); 00086 for( iIteration = 0,fDone = false; !fDone; ++iIteration ) { 00087 if( !( iIteration % 10 ) ) 00088 g_CatSleipnir( ).notice( "CClustKMeans::Cluster( %d ) iteration %d", iK, iIteration ); 00089 fill( veciCounts.begin( ), veciCounts.end( ), 0 ); 00090 fDone = true; 00091 for( i = 0; i < vecsClusters.size( ); ++i ) { 00092 float dMax; 00093 00094 dMax = -FLT_MAX; 00095 for( sMax = j = 0; j < MatMeans.GetRows( ); ++j ) { 00096 d = (float)pMeasure->Measure( MatData.Get( i ), MatData.GetColumns( ), MatMeans.Get( j ), 00097 MatMeans.GetColumns( ), IMeasure::EMapNone, pMatWeights ? pMatWeights->Get( i ) : NULL ); 00098 if( CMeta::IsNaN( d ) ) { 00099 g_CatSleipnir( ).error( "CClustKMeans::Cluster( %d ) got invalid measure for genes: %d, %d", 00100 iK, i, j ); 00101 return false; } 00102 if( d > dMax ) { 00103 dMax = d; 00104 sMax = j; } } 00105 veciCounts[ sMax ]++; 00106 if( vecsClusters[ i ] != sMax ) { 00107 fDone = false; 00108 vecsClusters[ i ] = sMax; } } 00109 00110 MatMeans.Clear( ); 00111 for( i = 0; i < vecsClusters.size( ); ++i ) 00112 for( j = 0; j < MatMeans.GetColumns( ); ++j ) 00113 MatMeans.Get( vecsClusters[ i ], j ) += MatData.Get( i, j ); 00114 for( i = 0; i < MatMeans.GetRows( ); ++i ) 00115 if( veciCounts[ i ] ) 00116 for( j = 0; j < MatMeans.GetColumns( ); ++j ) 00117 MatMeans.Get( i, j ) /= veciCounts[ i ]; 00118 else 00119 Randomize( MatMeans, i, MatData ); } 00120 00121 return true; } 00122 00152 bool CClustKMeans::Cluster( const CDistanceMatrix& MatSimilarities, size_t iK, 00153 vector<uint16_t>& vecsClusters ) { 00154 size_t i, j, iOne, iIteration, iChanged, iState; 00155 float d, dMax, dMin; 00156 CDataMatrix MatPrev, MatNext; 00157 vector<size_t> veciPrev, veciNext; 00158 uint16_t sMax; 00159 set<size_t> setiStates; 00160 00161 if( MatSimilarities.GetSize( ) < iK ) 00162 return false; 00163 00164 dMax = -( dMin = FLT_MAX ); 00165 for( i = 0; i < MatSimilarities.GetSize( ); ++i ) 00166 for( j = ( i + 1 ); j < MatSimilarities.GetSize( ); ++j ) 00167 if( !CMeta::IsNaN( d = MatSimilarities.Get( i, j ) ) ) { 00168 if( d > dMax ) 00169 dMax = d; 00170 if( d < dMin ) 00171 dMin = d; } 00172 if( dMin == FLT_MAX ) 00173 return false; 00174 dMax++; 00175 dMin--; 00176 MatPrev.Initialize( MatSimilarities.GetSize( ), iK ); 00177 for( i = 0; i < MatPrev.GetColumns( ); ++i ) { 00178 iOne = rand( ) % MatSimilarities.GetSize( ); 00179 for( j = 0; j < MatPrev.GetRows( ); ++j ) 00180 MatPrev.Set( j, i, GetClean( iOne, j, dMin, dMax, MatSimilarities ) ); } 00181 MatNext.Initialize( MatPrev.GetRows( ), MatPrev.GetColumns( ) ); 00182 MatNext.Clear( ); 00183 00184 vecsClusters.resize( MatSimilarities.GetSize( ) ); 00185 fill( vecsClusters.begin( ), vecsClusters.end( ), iK ); 00186 veciPrev.resize( iK ); 00187 fill( veciPrev.begin( ), veciPrev.end( ), 1 ); 00188 veciNext.resize( veciPrev.size( ) ); 00189 for( iChanged = MatSimilarities.GetSize( ),iIteration = 0; iChanged > 2; ++iIteration ) { 00190 g_CatSleipnir( ).log( ( iIteration % 10 ) ? Priority::DEBUG : Priority::NOTICE, 00191 "CClustKMeans::Cluster( %d ) iteration %d", iK, iIteration ); 00192 for( iChanged = i = 0; i < vecsClusters.size( ); ++i ) { 00193 float dMax; 00194 00195 dMax = -FLT_MAX; 00196 for( sMax = j = 0; j < MatPrev.GetColumns( ); ++j ) { 00197 if( CMeta::IsNaN( d = MatPrev.Get( i, j ) ) ) { 00198 g_CatSleipnir( ).error( "CClustKMeans::Cluster( %d ) failed on gene %d, cluster %d", i, j ); 00199 return false; } 00200 d /= veciPrev[ j ]; 00201 if( d > dMax ) { 00202 dMax = d; 00203 sMax = j; } } 00204 if( vecsClusters[ i ] != sMax ) { 00205 iChanged++; 00206 if( vecsClusters[ i ] != iK ) 00207 veciNext[ vecsClusters[ i ] ]--; 00208 veciNext[ sMax ]++; 00209 for( j = 0; j < MatSimilarities.GetSize( ); ++j ) { 00210 d = GetClean( i, j, dMin, dMax, MatSimilarities ); 00211 if( vecsClusters[ i ] != iK ) 00212 MatNext.Get( j, vecsClusters[ i ] ) -= d; 00213 MatNext.Get( j, sMax ) += d; } 00214 vecsClusters[ i ] = sMax; } } 00215 00216 for( i = 0; i < veciNext.size( ); ++i ) 00217 if( !veciNext[ i ] ) { 00218 do 00219 iOne = rand( ) % vecsClusters.size( ); 00220 while( veciNext[ vecsClusters[ iOne ] ] < 2 ); 00221 g_CatSleipnir( ).info( "CClustKMeans::Cluster( %d ) moving gene %d into empty cluster %d", iK, 00222 iOne, i ); 00223 veciNext[ vecsClusters[ iOne ] ]--; 00224 for( j = 0; j < MatNext.GetRows( ); ++j ) { 00225 d = GetClean( iOne, j, dMin, dMax, MatSimilarities ); 00226 MatNext.Get( j, vecsClusters[ iOne ] ) -= d; 00227 MatNext.Set( j, i, d ); } 00228 veciNext[ i ]++; } 00229 00230 // This calculates a simple hash for the current cluster assignments 00231 for( iState = i = 0; i < vecsClusters.size( ); ++i ) { 00232 j = iState & (size_t)-32768; 00233 iState <<= 15; 00234 iState ^= j >> ( ( sizeof(size_t) * 8 ) - 15 ); 00235 iState ^= vecsClusters[ i ] + 1; } 00236 if( setiStates.find( iState ) != setiStates.end( ) ) { 00237 g_CatSleipnir( ).info( "CClustKMeans::Cluster( %d ) found redundant state, terminating", iK ); 00238 break; } 00239 setiStates.insert( iState ); 00240 00241 g_CatSleipnir( ).notice( "CClustKMeans::Cluster( %d ) updated %d genes", iK, iChanged ); 00242 if( g_CatSleipnir( ).isDebugEnabled( ) ) 00243 for( i = 0; i < MatPrev.GetRows( ); ++i ) { 00244 ostringstream ossm; 00245 00246 for( j = 0; j < MatPrev.GetColumns( ); ++j ) 00247 ossm << ( j ? "\t" : "" ) << MatPrev.Get( i, j ); 00248 g_CatSleipnir( ).debug( "CClustKMeans::Cluster( %d ) object %d: %s", iK, i, 00249 ossm.str( ).c_str( ) ); } 00250 copy( veciNext.begin( ), veciNext.end( ), veciPrev.begin( ) ); 00251 for( i = 0; i < MatPrev.GetRows( ); ++i ) 00252 memcpy( MatPrev.Get( i ), MatNext.Get( i ), MatPrev.GetColumns( ) * sizeof(*MatPrev.Get( i )) ); } 00253 00254 return true; } 00255 00256 void CClustKMeansImpl::Randomize( CDataMatrix& MatMeans, size_t iRow, const CDataMatrix& MatData ) { 00257 00258 MatMeans.Set( iRow, MatData.Get( rand( ) % MatData.GetRows( ) ) ); } 00259 00260 }