Sleipnir
src/clustqtc.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "clustqtc.h"
00024 #include "pcl.h"
00025 #include "measure.h"
00026 #include "meta.h"
00027 
00028 namespace Sleipnir {
00029 
00068 uint16_t CClustQTC::Cluster( const CDataMatrix& MatData, const IMeasure* pMeasure,
00069     float dDiameter, size_t iSize, vector<uint16_t>& vecsClusters, const CDataMatrix* pMatWeights ) {
00070     CDistanceMatrix Dist;
00071 
00072     InitializeDistances( MatData, pMeasure, Dist, pMatWeights );
00073     return QualityThresholdAll( MatData.GetRows( ), dDiameter, iSize, Dist, vecsClusters ); }
00074 
00105 uint16_t CClustQTC::Cluster( const CDistanceMatrix& MatSimilarities, float dDiameter, size_t iSize,
00106     vector<uint16_t>& vecsClusters ) {
00107 
00108     return QualityThresholdAll( MatSimilarities.GetSize( ), dDiameter, iSize, MatSimilarities,
00109         vecsClusters ); }
00110 
00150 void CClustQTC::Cluster( const CDataMatrix& MatData, const IMeasure* pMeasure,
00151     float dMinDiameter, float dMaxDiameter, float dDeltaDiameter, size_t iSize, CDistanceMatrix& MatResults,
00152     const CDataMatrix* pMatWeights ) {
00153     CDistanceMatrix Dist;
00154 
00155     InitializeDistances( MatData, pMeasure, Dist, pMatWeights );
00156     return Cluster( Dist, dMinDiameter, dMaxDiameter, dDeltaDiameter, iSize, MatResults ); }
00157 
00190 void CClustQTC::Cluster( const CDistanceMatrix& MatSimilarities, float dMinDiameter, float dMaxDiameter,
00191     float dDeltaDiameter, size_t iSize, CDistanceMatrix& MatResults ) {
00192     float               dDiameter;
00193     vector<uint16_t>    vecsClusters;
00194     uint16_t            sClusters;
00195     size_t              i, j;
00196 
00197     for( dDiameter = dMinDiameter; dDiameter <= dMaxDiameter; dDiameter += dDeltaDiameter ) {
00198         g_CatSleipnir( ).notice( "CClustQTC::Cluster( %g, %g, %g, %d ) processing diameter %g", dMinDiameter,
00199             dMaxDiameter, dDeltaDiameter, iSize, dDiameter );
00200         sClusters = QualityThresholdAll( MatSimilarities.GetSize( ), dDiameter, iSize, MatSimilarities,
00201             vecsClusters );
00202         for( i = 0; i < vecsClusters.size( ); ++i ) {
00203             if( ( vecsClusters[ i ] + 1 ) == sClusters )
00204                 continue;
00205             for( j = ( i + 1 ); j < vecsClusters.size( ); ++j )
00206                 if( ( vecsClusters[ j ] == vecsClusters[ i ] ) && CMeta::IsNaN( MatResults.Get( i, j ) ) )
00207                     MatResults.Set( i, j, 1 - dDiameter ); } } }
00208 
00209 uint16_t CClustQTCImpl::QualityThresholdAll( size_t iGenes, float dDiameter, size_t iSize,
00210     const CDistanceMatrix& Dist, vector<uint16_t>& vecsClusters ) {
00211     size_t              i, iAssigned;
00212     uint16_t            sCluster;
00213     vector<uint16_t>    vecsCur;
00214     vector<bool>        vecfAssigned;
00215 
00216     vecfAssigned.resize( iGenes );
00217     for( i = 0; i < vecfAssigned.size( ); ++i )
00218         vecfAssigned[ i ] = false;
00219 
00220     vecsClusters.resize( iGenes );
00221     for( iAssigned = sCluster = 0; ; ++sCluster ) {
00222         g_CatSleipnir( ).notice( "CClustQTCImpl::QualityThresholdAll( ) cluster %d, assigned %d/%d genes",
00223             sCluster + 1, iAssigned, iGenes );
00224         QualityThresholdLargest( iGenes, dDiameter, Dist, vecfAssigned, vecsCur );
00225         for( i = 0; i < vecsCur.size( ); ++i )
00226             vecsClusters[ vecsCur[ i ] ] = sCluster;
00227 
00228         if( vecsCur.size( ) < iSize ) {
00229             for( i = 0; i < vecfAssigned.size( ); ++i )
00230                 if( !vecfAssigned[ i ] )
00231                     vecsClusters[ i ] = sCluster;
00232             break; }
00233 
00234         if( ( iAssigned += vecsCur.size( ) ) >= iGenes ) {
00235             sCluster++;
00236             break; }
00237         for( i = 0; i < vecsCur.size( ); ++i )
00238             vecfAssigned[ vecsCur[ i ] ] = true; }
00239 
00240     return ( sCluster + 1 ); }
00241 
00242 void CClustQTCImpl::QualityThresholdLargest( size_t iGenes, float dDiameter, const CDistanceMatrix& Dist,
00243     const vector<bool>& vecfAssigned,
00244     vector<uint16_t>& vecsCluster ) {
00245     vector<bool>        vecfClone;
00246     size_t              iGene;
00247     vector<uint16_t>    vecsCur;
00248     vector<float>       vecdDiameter;
00249 
00250     vecsCluster.clear( );
00251     vecfClone.resize( vecfAssigned.size( ) );
00252     for( iGene = 0; iGene < vecfAssigned.size( ); ++iGene ) {
00253         if( !( iGene % 1000 ) )
00254             g_CatSleipnir( ).notice( "CClustQTCImpl::QualityThresholdLargest( %g ) processing gene %d/%d",
00255                 dDiameter, iGene, vecfAssigned.size( ) );
00256         if( vecfAssigned[ iGene ] )
00257             continue;
00258         copy( vecfAssigned.begin( ), vecfAssigned.end( ), vecfClone.begin( ) );
00259         vecfClone[ iGene ] = true;
00260         QualityThresholdGene( iGene, iGenes, dDiameter, Dist, vecfClone, vecdDiameter, vecsCur );
00261         if( vecsCur.size( ) > vecsCluster.size( ) ) {
00262             vecsCluster.resize( vecsCur.size( ) );
00263             copy( vecsCur.begin( ), vecsCur.end( ), vecsCluster.begin( ) ); } } }
00264 
00265 void CClustQTCImpl::QualityThresholdGene( size_t iGene, size_t iGenes, float dDiameter,
00266     const CDistanceMatrix& Dist, vector<bool>& vecfAssigned, vector<float>& vecdDiameter,
00267     vector<uint16_t>& vecsCluster ) {
00268     size_t  iAdded, iLocal, iBest;
00269     float   dBest;
00270 
00271     vecsCluster.resize( 1 );
00272     vecsCluster[ 0 ] = iAdded = iGene;
00273     vecdDiameter.resize( iGenes );
00274     for( iLocal = 0; iLocal < vecfAssigned.size( ); ++iLocal )
00275         if( !vecfAssigned[ iLocal ] )
00276             vecdDiameter[ iLocal ] = -(float)HUGE_VAL;
00277 
00278     while( true ) {
00279         iBest = -1;
00280         dBest = (float)HUGE_VAL;
00281 
00282         for( iLocal = 0; iLocal < vecfAssigned.size( ); ++iLocal ) {
00283             if( vecfAssigned[ iLocal ] )
00284                 continue;
00285             vecdDiameter[ iLocal ] = max( vecdDiameter[ iLocal ], Dist.Get( iLocal, iAdded ) );
00286             if( vecdDiameter[ iLocal ] > dDiameter ) {
00287                 vecfAssigned[ iLocal ] = true;
00288                 continue; }
00289             if( vecdDiameter[ iLocal ] < dBest ) {
00290                 dBest = vecdDiameter[ iLocal ];
00291                 iBest = iLocal; } }
00292 
00293         if( iBest == -1 )
00294             break;
00295         vecfAssigned[ iAdded = iBest ] = true;
00296         vecsCluster.push_back( iAdded ); } }
00297 
00298 void CClustQTCImpl::InitializeDistances( const CDataMatrix& Data, const IMeasure* pMeasure,
00299     CDistanceMatrix& Dist, const CDataMatrix* pWeights ) {
00300     size_t  i, j;
00301     float*  adA;
00302     float*  adB;
00303     float*  adWA;
00304     float*  adWB;
00305 
00306     Dist.Initialize( Data.GetRows( ) );
00307     adA = new float[ Data.GetColumns( ) - 1 ];
00308     adB = new float[ Data.GetColumns( ) - 1 ];
00309     if( pWeights ) {
00310         adWA = new float[ Data.GetColumns( ) - 1 ];
00311         adWB = new float[ Data.GetColumns( ) - 1 ]; }
00312     else
00313         adWA = adWB = NULL;
00314     for( i = 0; i < Data.GetRows( ); ++i ) {
00315         if( !( i % 10 ) )
00316             g_CatSleipnir( ).notice( "CClustQTCImpl::InitializeDistances( %d ) initializing %d/%d genes",
00317                 i, Data.GetRows( ) );
00318         for( j = ( i + 1 ); j < Data.GetRows( ); ++j )
00319             Dist.Set( i, j, (float)GetJackDistance( Data.Get( i ), Data.Get( j ),
00320                 Data.GetColumns( ), adA, adB, pMeasure, pWeights ? pWeights->Get( i ) : NULL,
00321                 pWeights ? pWeights->Get( j ) : NULL, adWA, adWB ) ); }
00322     if( pWeights ) {
00323         delete[] adWA;
00324         delete[] adWB; }
00325     delete[] adA;
00326     delete[] adB; }
00327 
00328 double CClustQTCImpl::GetJackDistance( const float* adX, const float* adY, size_t iN, float* adA, float* adB,
00329     const IMeasure* pMeasure, const float* adWX, const float* adWY,
00330     float* adWA, float* adWB ) {
00331     size_t  i;
00332     double  dRet, dCur;
00333 
00334     dRet = pMeasure->Measure( adX, iN, adY, iN, IMeasure::EMapCenter, adWX, adWY );
00335     dRet = 1 - dRet;
00336     for( i = 0; i < iN; ++i ) {
00337         memcpy( adA, adX, i * sizeof(*adX) );
00338         memcpy( adA + i, adX + i + 1, ( iN - 1 - i ) * sizeof(*adX) );
00339         memcpy( adB, adY, i * sizeof(*adY) );
00340         memcpy( adB + i, adY + i + 1, ( iN - 1 - i ) * sizeof(*adY) );
00341         if( adWX && adWY && adWA && adWB ) {
00342             memcpy( adWA, adWX, i * sizeof(*adWX) );
00343             memcpy( adWA + i, adWX + i + 1, ( iN - 1 - i ) * sizeof(*adWX) );
00344             memcpy( adWB, adWY, i * sizeof(*adWY) );
00345             memcpy( adWB + i, adWY + i + 1, ( iN - 1 - i ) * sizeof(*adWY) ); }
00346         dCur = pMeasure->Measure( adA, iN - 1, adB, iN - 1, IMeasure::EMapCenter, adWA, adWB );
00347         if( ( dCur = ( 1 - dCur ) ) > dRet )
00348             dRet = dCur; }
00349 
00350     return dRet; }
00351 
00352 }