Sleipnir
src/clustkmeans.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "clustkmeans.h"
00024 #include "measure.h"
00025 #include "meta.h"
00026 
00027 namespace Sleipnir {
00028 
00067 bool CClustKMeans::Cluster( const CDataMatrix& MatData, const IMeasure* pMeasure, size_t iK,
00068     vector<uint16_t>& vecsClusters, const CDataMatrix* pMatWeights ) {
00069     size_t          i, j, iIteration;
00070     float           d;
00071     CDataMatrix     MatMeans;
00072     bool            fDone;
00073     vector<size_t>  veciCounts;
00074     uint16_t        sMax;
00075 
00076     if( !pMeasure || ( MatData.GetRows( ) < iK ) )
00077         return false;
00078 
00079     MatMeans.Initialize( iK, MatData.GetColumns( ) );
00080     for( i = 0; i < MatMeans.GetRows( ); ++i )
00081         Randomize( MatMeans, i, MatData );
00082 
00083     vecsClusters.resize( MatData.GetRows( ) );
00084     fill( vecsClusters.begin( ), vecsClusters.end( ), -1 );
00085     veciCounts.resize( iK );
00086     for( iIteration = 0,fDone = false; !fDone; ++iIteration ) {
00087         if( !( iIteration % 10 ) )
00088             g_CatSleipnir( ).notice( "CClustKMeans::Cluster( %d ) iteration %d", iK, iIteration );
00089         fill( veciCounts.begin( ), veciCounts.end( ), 0 );
00090         fDone = true;
00091         for( i = 0; i < vecsClusters.size( ); ++i ) {
00092             float   dMax;
00093 
00094             dMax = -FLT_MAX;
00095             for( sMax = j = 0; j < MatMeans.GetRows( ); ++j ) {
00096                 d = (float)pMeasure->Measure( MatData.Get( i ), MatData.GetColumns( ), MatMeans.Get( j ),
00097                     MatMeans.GetColumns( ), IMeasure::EMapNone, pMatWeights ? pMatWeights->Get( i ) : NULL );
00098                 if( CMeta::IsNaN( d ) ) {
00099                     g_CatSleipnir( ).error( "CClustKMeans::Cluster( %d ) got invalid measure for genes: %d, %d",
00100                         iK, i, j );
00101                     return false; }
00102                 if( d > dMax ) {
00103                     dMax = d;
00104                     sMax = j; } }
00105             veciCounts[ sMax ]++;
00106             if( vecsClusters[ i ] != sMax ) {
00107                 fDone = false;
00108                 vecsClusters[ i ] = sMax; } }
00109 
00110         MatMeans.Clear( );
00111         for( i = 0; i < vecsClusters.size( ); ++i )
00112             for( j = 0; j < MatMeans.GetColumns( ); ++j )
00113                 MatMeans.Get( vecsClusters[ i ], j ) += MatData.Get( i, j );
00114         for( i = 0; i < MatMeans.GetRows( ); ++i )
00115             if( veciCounts[ i ] )
00116                 for( j = 0; j < MatMeans.GetColumns( ); ++j )
00117                     MatMeans.Get( i, j ) /= veciCounts[ i ];
00118             else
00119                 Randomize( MatMeans, i, MatData ); }
00120 
00121     return true; }
00122 
00152 bool CClustKMeans::Cluster( const CDistanceMatrix& MatSimilarities, size_t iK,
00153     vector<uint16_t>& vecsClusters ) {
00154     size_t          i, j, iOne, iIteration, iChanged, iState;
00155     float           d, dMax, dMin;
00156     CDataMatrix     MatPrev, MatNext;
00157     vector<size_t>  veciPrev, veciNext;
00158     uint16_t        sMax;
00159     set<size_t>     setiStates;
00160 
00161     if( MatSimilarities.GetSize( ) < iK )
00162         return false;
00163 
00164     dMax = -( dMin = FLT_MAX );
00165     for( i = 0; i < MatSimilarities.GetSize( ); ++i )
00166         for( j = ( i + 1 ); j < MatSimilarities.GetSize( ); ++j )
00167             if( !CMeta::IsNaN( d = MatSimilarities.Get( i, j ) ) ) {
00168                 if( d > dMax )
00169                     dMax = d;
00170                 if( d < dMin )
00171                     dMin = d; }
00172     if( dMin == FLT_MAX )
00173         return false;
00174     dMax++;
00175     dMin--;
00176     MatPrev.Initialize( MatSimilarities.GetSize( ), iK );
00177     for( i = 0; i < MatPrev.GetColumns( ); ++i ) {
00178         iOne = rand( ) % MatSimilarities.GetSize( );
00179         for( j = 0; j < MatPrev.GetRows( ); ++j )
00180             MatPrev.Set( j, i, GetClean( iOne, j, dMin, dMax, MatSimilarities ) ); }
00181     MatNext.Initialize( MatPrev.GetRows( ), MatPrev.GetColumns( ) );
00182     MatNext.Clear( );
00183 
00184     vecsClusters.resize( MatSimilarities.GetSize( ) );
00185     fill( vecsClusters.begin( ), vecsClusters.end( ), iK );
00186     veciPrev.resize( iK );
00187     fill( veciPrev.begin( ), veciPrev.end( ), 1 );
00188     veciNext.resize( veciPrev.size( ) );
00189     for( iChanged = MatSimilarities.GetSize( ),iIteration = 0; iChanged > 2; ++iIteration ) {
00190         g_CatSleipnir( ).log( ( iIteration % 10 ) ? Priority::DEBUG : Priority::NOTICE,
00191             "CClustKMeans::Cluster( %d ) iteration %d", iK, iIteration );
00192         for( iChanged = i = 0; i < vecsClusters.size( ); ++i ) {
00193             float   dMax;
00194 
00195             dMax = -FLT_MAX;
00196             for( sMax = j = 0; j < MatPrev.GetColumns( ); ++j ) {
00197                 if( CMeta::IsNaN( d = MatPrev.Get( i, j ) ) ) {
00198                     g_CatSleipnir( ).error( "CClustKMeans::Cluster( %d ) failed on gene %d, cluster %d", i, j );
00199                     return false; }
00200                 d /= veciPrev[ j ];
00201                 if( d > dMax ) {
00202                     dMax = d;
00203                     sMax = j; } }
00204             if( vecsClusters[ i ] != sMax ) {
00205                 iChanged++;
00206                 if( vecsClusters[ i ] != iK )
00207                     veciNext[ vecsClusters[ i ] ]--;
00208                 veciNext[ sMax ]++;
00209                 for( j = 0; j < MatSimilarities.GetSize( ); ++j ) {
00210                     d = GetClean( i, j, dMin, dMax, MatSimilarities );
00211                     if( vecsClusters[ i ] != iK )
00212                         MatNext.Get( j, vecsClusters[ i ] ) -= d;
00213                     MatNext.Get( j, sMax ) += d; }
00214                 vecsClusters[ i ] = sMax; } }
00215 
00216         for( i = 0; i < veciNext.size( ); ++i )
00217             if( !veciNext[ i ] ) {
00218                 do
00219                     iOne = rand( ) % vecsClusters.size( );
00220                 while( veciNext[ vecsClusters[ iOne ] ] < 2 );
00221                 g_CatSleipnir( ).info( "CClustKMeans::Cluster( %d ) moving gene %d into empty cluster %d", iK,
00222                     iOne, i );
00223                 veciNext[ vecsClusters[ iOne ] ]--;
00224                 for( j = 0; j < MatNext.GetRows( ); ++j ) {
00225                     d = GetClean( iOne, j, dMin, dMax, MatSimilarities );
00226                     MatNext.Get( j, vecsClusters[ iOne ] ) -= d;
00227                     MatNext.Set( j, i, d ); }
00228                 veciNext[ i ]++; }
00229 
00230 // This calculates a simple hash for the current cluster assignments
00231         for( iState = i = 0; i < vecsClusters.size( ); ++i ) {
00232             j = iState & (size_t)-32768;
00233             iState <<= 15;
00234             iState ^= j >> ( ( sizeof(size_t) * 8 ) - 15 );
00235             iState ^= vecsClusters[ i ] + 1; }
00236         if( setiStates.find( iState ) != setiStates.end( ) ) {
00237             g_CatSleipnir( ).info( "CClustKMeans::Cluster( %d ) found redundant state, terminating", iK );
00238             break; }
00239         setiStates.insert( iState );
00240 
00241         g_CatSleipnir( ).notice( "CClustKMeans::Cluster( %d ) updated %d genes", iK, iChanged );
00242         if( g_CatSleipnir( ).isDebugEnabled( ) )
00243             for( i = 0; i < MatPrev.GetRows( ); ++i ) {
00244                 ostringstream   ossm;
00245 
00246                 for( j = 0; j < MatPrev.GetColumns( ); ++j )
00247                     ossm << ( j ? "\t" : "" ) << MatPrev.Get( i, j );
00248                 g_CatSleipnir( ).debug( "CClustKMeans::Cluster( %d ) object %d: %s", iK, i,
00249                     ossm.str( ).c_str( ) ); }
00250         copy( veciNext.begin( ), veciNext.end( ), veciPrev.begin( ) );
00251         for( i = 0; i < MatPrev.GetRows( ); ++i )
00252             memcpy( MatPrev.Get( i ), MatNext.Get( i ), MatPrev.GetColumns( ) * sizeof(*MatPrev.Get( i )) ); }
00253 
00254     return true; }
00255 
00256 void CClustKMeansImpl::Randomize( CDataMatrix& MatMeans, size_t iRow, const CDataMatrix& MatData ) {
00257 
00258     MatMeans.Set( iRow, MatData.Get( rand( ) % MatData.GetRows( ) ) ); }
00259 
00260 }