Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "clustqtc.h" 00024 #include "pcl.h" 00025 #include "measure.h" 00026 #include "meta.h" 00027 00028 namespace Sleipnir { 00029 00068 uint16_t CClustQTC::Cluster( const CDataMatrix& MatData, const IMeasure* pMeasure, 00069 float dDiameter, size_t iSize, vector<uint16_t>& vecsClusters, const CDataMatrix* pMatWeights ) { 00070 CDistanceMatrix Dist; 00071 00072 InitializeDistances( MatData, pMeasure, Dist, pMatWeights ); 00073 return QualityThresholdAll( MatData.GetRows( ), dDiameter, iSize, Dist, vecsClusters ); } 00074 00105 uint16_t CClustQTC::Cluster( const CDistanceMatrix& MatSimilarities, float dDiameter, size_t iSize, 00106 vector<uint16_t>& vecsClusters ) { 00107 00108 return QualityThresholdAll( MatSimilarities.GetSize( ), dDiameter, iSize, MatSimilarities, 00109 vecsClusters ); } 00110 00150 void CClustQTC::Cluster( const CDataMatrix& MatData, const IMeasure* pMeasure, 00151 float dMinDiameter, float dMaxDiameter, float dDeltaDiameter, size_t iSize, CDistanceMatrix& MatResults, 00152 const CDataMatrix* pMatWeights ) { 00153 CDistanceMatrix Dist; 00154 00155 InitializeDistances( MatData, pMeasure, Dist, pMatWeights ); 00156 return Cluster( Dist, dMinDiameter, dMaxDiameter, dDeltaDiameter, iSize, MatResults ); } 00157 00190 void CClustQTC::Cluster( const CDistanceMatrix& MatSimilarities, float dMinDiameter, float dMaxDiameter, 00191 float dDeltaDiameter, size_t iSize, CDistanceMatrix& MatResults ) { 00192 float dDiameter; 00193 vector<uint16_t> vecsClusters; 00194 uint16_t sClusters; 00195 size_t i, j; 00196 00197 for( dDiameter = dMinDiameter; dDiameter <= dMaxDiameter; dDiameter += dDeltaDiameter ) { 00198 g_CatSleipnir( ).notice( "CClustQTC::Cluster( %g, %g, %g, %d ) processing diameter %g", dMinDiameter, 00199 dMaxDiameter, dDeltaDiameter, iSize, dDiameter ); 00200 sClusters = QualityThresholdAll( MatSimilarities.GetSize( ), dDiameter, iSize, MatSimilarities, 00201 vecsClusters ); 00202 for( i = 0; i < vecsClusters.size( ); ++i ) { 00203 if( ( vecsClusters[ i ] + 1 ) == sClusters ) 00204 continue; 00205 for( j = ( i + 1 ); j < vecsClusters.size( ); ++j ) 00206 if( ( vecsClusters[ j ] == vecsClusters[ i ] ) && CMeta::IsNaN( MatResults.Get( i, j ) ) ) 00207 MatResults.Set( i, j, 1 - dDiameter ); } } } 00208 00209 uint16_t CClustQTCImpl::QualityThresholdAll( size_t iGenes, float dDiameter, size_t iSize, 00210 const CDistanceMatrix& Dist, vector<uint16_t>& vecsClusters ) { 00211 size_t i, iAssigned; 00212 uint16_t sCluster; 00213 vector<uint16_t> vecsCur; 00214 vector<bool> vecfAssigned; 00215 00216 vecfAssigned.resize( iGenes ); 00217 for( i = 0; i < vecfAssigned.size( ); ++i ) 00218 vecfAssigned[ i ] = false; 00219 00220 vecsClusters.resize( iGenes ); 00221 for( iAssigned = sCluster = 0; ; ++sCluster ) { 00222 g_CatSleipnir( ).notice( "CClustQTCImpl::QualityThresholdAll( ) cluster %d, assigned %d/%d genes", 00223 sCluster + 1, iAssigned, iGenes ); 00224 QualityThresholdLargest( iGenes, dDiameter, Dist, vecfAssigned, vecsCur ); 00225 for( i = 0; i < vecsCur.size( ); ++i ) 00226 vecsClusters[ vecsCur[ i ] ] = sCluster; 00227 00228 if( vecsCur.size( ) < iSize ) { 00229 for( i = 0; i < vecfAssigned.size( ); ++i ) 00230 if( !vecfAssigned[ i ] ) 00231 vecsClusters[ i ] = sCluster; 00232 break; } 00233 00234 if( ( iAssigned += vecsCur.size( ) ) >= iGenes ) { 00235 sCluster++; 00236 break; } 00237 for( i = 0; i < vecsCur.size( ); ++i ) 00238 vecfAssigned[ vecsCur[ i ] ] = true; } 00239 00240 return ( sCluster + 1 ); } 00241 00242 void CClustQTCImpl::QualityThresholdLargest( size_t iGenes, float dDiameter, const CDistanceMatrix& Dist, 00243 const vector<bool>& vecfAssigned, 00244 vector<uint16_t>& vecsCluster ) { 00245 vector<bool> vecfClone; 00246 size_t iGene; 00247 vector<uint16_t> vecsCur; 00248 vector<float> vecdDiameter; 00249 00250 vecsCluster.clear( ); 00251 vecfClone.resize( vecfAssigned.size( ) ); 00252 for( iGene = 0; iGene < vecfAssigned.size( ); ++iGene ) { 00253 if( !( iGene % 1000 ) ) 00254 g_CatSleipnir( ).notice( "CClustQTCImpl::QualityThresholdLargest( %g ) processing gene %d/%d", 00255 dDiameter, iGene, vecfAssigned.size( ) ); 00256 if( vecfAssigned[ iGene ] ) 00257 continue; 00258 copy( vecfAssigned.begin( ), vecfAssigned.end( ), vecfClone.begin( ) ); 00259 vecfClone[ iGene ] = true; 00260 QualityThresholdGene( iGene, iGenes, dDiameter, Dist, vecfClone, vecdDiameter, vecsCur ); 00261 if( vecsCur.size( ) > vecsCluster.size( ) ) { 00262 vecsCluster.resize( vecsCur.size( ) ); 00263 copy( vecsCur.begin( ), vecsCur.end( ), vecsCluster.begin( ) ); } } } 00264 00265 void CClustQTCImpl::QualityThresholdGene( size_t iGene, size_t iGenes, float dDiameter, 00266 const CDistanceMatrix& Dist, vector<bool>& vecfAssigned, vector<float>& vecdDiameter, 00267 vector<uint16_t>& vecsCluster ) { 00268 size_t iAdded, iLocal, iBest; 00269 float dBest; 00270 00271 vecsCluster.resize( 1 ); 00272 vecsCluster[ 0 ] = iAdded = iGene; 00273 vecdDiameter.resize( iGenes ); 00274 for( iLocal = 0; iLocal < vecfAssigned.size( ); ++iLocal ) 00275 if( !vecfAssigned[ iLocal ] ) 00276 vecdDiameter[ iLocal ] = -(float)HUGE_VAL; 00277 00278 while( true ) { 00279 iBest = -1; 00280 dBest = (float)HUGE_VAL; 00281 00282 for( iLocal = 0; iLocal < vecfAssigned.size( ); ++iLocal ) { 00283 if( vecfAssigned[ iLocal ] ) 00284 continue; 00285 vecdDiameter[ iLocal ] = max( vecdDiameter[ iLocal ], Dist.Get( iLocal, iAdded ) ); 00286 if( vecdDiameter[ iLocal ] > dDiameter ) { 00287 vecfAssigned[ iLocal ] = true; 00288 continue; } 00289 if( vecdDiameter[ iLocal ] < dBest ) { 00290 dBest = vecdDiameter[ iLocal ]; 00291 iBest = iLocal; } } 00292 00293 if( iBest == -1 ) 00294 break; 00295 vecfAssigned[ iAdded = iBest ] = true; 00296 vecsCluster.push_back( iAdded ); } } 00297 00298 void CClustQTCImpl::InitializeDistances( const CDataMatrix& Data, const IMeasure* pMeasure, 00299 CDistanceMatrix& Dist, const CDataMatrix* pWeights ) { 00300 size_t i, j; 00301 float* adA; 00302 float* adB; 00303 float* adWA; 00304 float* adWB; 00305 00306 Dist.Initialize( Data.GetRows( ) ); 00307 adA = new float[ Data.GetColumns( ) - 1 ]; 00308 adB = new float[ Data.GetColumns( ) - 1 ]; 00309 if( pWeights ) { 00310 adWA = new float[ Data.GetColumns( ) - 1 ]; 00311 adWB = new float[ Data.GetColumns( ) - 1 ]; } 00312 else 00313 adWA = adWB = NULL; 00314 for( i = 0; i < Data.GetRows( ); ++i ) { 00315 if( !( i % 10 ) ) 00316 g_CatSleipnir( ).notice( "CClustQTCImpl::InitializeDistances( %d ) initializing %d/%d genes", 00317 i, Data.GetRows( ) ); 00318 for( j = ( i + 1 ); j < Data.GetRows( ); ++j ) 00319 Dist.Set( i, j, (float)GetJackDistance( Data.Get( i ), Data.Get( j ), 00320 Data.GetColumns( ), adA, adB, pMeasure, pWeights ? pWeights->Get( i ) : NULL, 00321 pWeights ? pWeights->Get( j ) : NULL, adWA, adWB ) ); } 00322 if( pWeights ) { 00323 delete[] adWA; 00324 delete[] adWB; } 00325 delete[] adA; 00326 delete[] adB; } 00327 00328 double CClustQTCImpl::GetJackDistance( const float* adX, const float* adY, size_t iN, float* adA, float* adB, 00329 const IMeasure* pMeasure, const float* adWX, const float* adWY, 00330 float* adWA, float* adWB ) { 00331 size_t i; 00332 double dRet, dCur; 00333 00334 dRet = pMeasure->Measure( adX, iN, adY, iN, IMeasure::EMapCenter, adWX, adWY ); 00335 dRet = 1 - dRet; 00336 for( i = 0; i < iN; ++i ) { 00337 memcpy( adA, adX, i * sizeof(*adX) ); 00338 memcpy( adA + i, adX + i + 1, ( iN - 1 - i ) * sizeof(*adX) ); 00339 memcpy( adB, adY, i * sizeof(*adY) ); 00340 memcpy( adB + i, adY + i + 1, ( iN - 1 - i ) * sizeof(*adY) ); 00341 if( adWX && adWY && adWA && adWB ) { 00342 memcpy( adWA, adWX, i * sizeof(*adWX) ); 00343 memcpy( adWA + i, adWX + i + 1, ( iN - 1 - i ) * sizeof(*adWX) ); 00344 memcpy( adWB, adWY, i * sizeof(*adWY) ); 00345 memcpy( adWB + i, adWY + i + 1, ( iN - 1 - i ) * sizeof(*adWY) ); } 00346 dCur = pMeasure->Measure( adA, iN - 1, adB, iN - 1, IMeasure::EMapCenter, adWA, adWB ); 00347 if( ( dCur = ( 1 - dCur ) ) > dRet ) 00348 dRet = dCur; } 00349 00350 return dRet; } 00351 00352 }