Sleipnir
src/coalescecluster.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "coalescecluster.h"
00024 #include "coalescei.h"
00025 #include "coalescemotifs.h"
00026 #include "pcl.h"
00027 #include "halfmatrix.h"
00028 #include "statistics.h"
00029 #include "clusthierarchical.h"
00030 #include "pst.h"
00031 
00032 namespace Sleipnir {
00033 
00034 const char  CCoalesceClusterImpl::c_szMotifs[]      = "_motifs.txt";
00035 const char  CCoalesceClusterImpl::c_szGenes[]       = "Genes";
00036 const char  CCoalesceClusterImpl::c_szConditions[]  = "Conditions";
00037 
00089 bool CCoalesceCluster::Initialize( const CPCL& PCL, CCoalesceCluster& Pot,
00090     const std::vector<SCoalesceDataset>& vecsDatasets, std::set<std::pair<size_t, size_t> >& setpriiSeeds,
00091     const vector<float>& vecdSeed, size_t iPairs, float dPValue, float dProbability, size_t iThreads ) {
00092     size_t  i;
00093     float   dFraction;
00094     CPCL    PCLCopy;
00095 
00096     m_setiDatasets.clear( );
00097     m_setiGenes.clear( );
00098     m_setsMotifs.clear( );
00099     m_vecsDatasets.resize( vecsDatasets.size( ) );
00100     Pot.m_vecsDatasets.resize( vecsDatasets.size( ) );
00101     for( i = 0; i < m_vecsDatasets.size( ); ++i ) {
00102         m_vecsDatasets[ i ].m_psDataset = Pot.m_vecsDatasets[ i ].m_psDataset = &vecsDatasets[ i ];
00103         m_setiDatasets.insert( i ); }
00104     m_vecdPriors.resize( PCL.GetGenes( ) );
00105     fill( m_vecdPriors.begin( ), m_vecdPriors.end( ), 1 - dProbability );
00106     dFraction = min( 1.0f, 2.0f * iPairs / PCL.GetGenes( ) / ( PCL.GetGenes( ) - 1 ) );
00107 
00108     PCLCopy.Open( PCL );
00109     PCLCopy.Normalize( CPCL::ENormalizeColumn );
00110     return ( ( vecdSeed.size( ) || AddSeedPair( PCLCopy, Pot, setpriiSeeds, dFraction, dPValue, iThreads ) ) &&
00111         AddCorrelatedGenes( PCLCopy, Pot, vecdSeed, dPValue ) ); }
00112 
00113 void CCoalesceClusterImpl::Add( size_t iGene, CCoalesceCluster& Pot ) {
00114 
00115     m_setiGenes.insert( iGene );
00116     Pot.m_setiGenes.erase( iGene ); }
00117 
00118 void* CCoalesceClusterImpl::ThreadSeedPair( void* pData ) {
00119     SThreadSeedPair*        psData;
00120     double                  dR, dP;
00121     size_t                  i, j, iN, iExperiments, iPair, iPairs, iCur;
00122     pair<size_t, size_t>    priiSeed;
00123 
00124     psData = (SThreadSeedPair*)pData;
00125     iExperiments = psData->m_pPCL->GetExperiments( );
00126     psData->m_dMaxCorr = -( psData->m_dMinP = DBL_MAX );
00127     iPairs = psData->m_pPCL->GetGenes( ) * ( psData->m_pPCL->GetGenes( ) - 1 ) / 2;
00128     for( psData->m_iOne = psData->m_iTwo = 0,iPair = psData->m_iOffset; iPair < iPairs;
00129         iPair += psData->m_iStep ) {
00130         if( ( (float)rand( ) / RAND_MAX ) > psData->m_dFraction )
00131             continue;
00132         for( i = 0,iCur = iPair; iCur >= ( psData->m_pPCL->GetGenes( ) - 1 - i );
00133             iCur -= psData->m_pPCL->GetGenes( ) - 1 - i++ );
00134         j = i + iCur + 1;
00135         if( ( ( dR = CMeasurePearson::Pearson( psData->m_pPCL->Get( i ), iExperiments,
00136             psData->m_pPCL->Get( j ), iExperiments, IMeasure::EMapNone, NULL, NULL, &iN ) ) < 0 ) ||
00137             CMeta::IsNaN( dR ) )
00138             continue;
00139         if( ( ( dP = CStatistics::PValuePearson( dR, iN ) ) < psData->m_dMinP ) ||
00140             ( !dP && ( dR > psData->m_dMaxCorr ) ) ) {
00141             priiSeed.first = i;
00142             priiSeed.second = j;
00143             if( psData->m_psetpriiSeeds->find( priiSeed ) == psData->m_psetpriiSeeds->end( ) ) {
00144                 psData->m_dMaxCorr = dR;
00145                 psData->m_dMinP = dP;
00146                 psData->m_iOne = i;
00147                 psData->m_iTwo = j; } } }
00148 
00149     return NULL; }
00150 
00151 bool CCoalesceClusterImpl::AddSeedPair( const CPCL& PCL, CCoalesceCluster& Pot,
00152     std::set<std::pair<size_t, size_t> >& setpriiSeeds, float dFraction, float dPValue, size_t iThreads ) {
00153     size_t                  i, iOne, iTwo;
00154     double                  dMaxCorr, dMinP;
00155     pair<size_t, size_t>    priiSeed;
00156     vector<pthread_t>       vecpthdThreads;
00157     vector<SThreadSeedPair> vecsThreads;
00158 
00159     if( PCL.GetGenes( ) < 2 ) {
00160         g_CatSleipnir( ).error( "CCoalesceClusterImpl::AddSeedPair( %g ) found no genes", dPValue );
00161         return false; }
00162     vecpthdThreads.resize( iThreads );
00163     vecsThreads.resize( vecpthdThreads.size( ) );
00164     for( i = 0; i < vecsThreads.size( ); ++i ) {
00165         vecsThreads[ i ].m_iOffset = i;
00166         vecsThreads[ i ].m_iStep = vecsThreads.size( );
00167         vecsThreads[ i ].m_pPCL = &PCL;
00168         vecsThreads[ i ].m_dFraction = dFraction;
00169         vecsThreads[ i ].m_psetpriiSeeds = &setpriiSeeds;
00170         if( pthread_create( &vecpthdThreads[ i ], NULL, ThreadSeedPair, &vecsThreads[ i ] ) ) {
00171             g_CatSleipnir( ).error( "CCoalesceClusterImpl::AddSeedPair( %g, %g, %d ) could not seed pair",
00172                 dFraction, dPValue, iThreads );
00173             return false; } }
00174     dMaxCorr = -( dMinP = DBL_MAX );
00175     for( iOne = iTwo = i = 0; i < vecpthdThreads.size( ); ++i ) {
00176         pthread_join( vecpthdThreads[ i ], NULL );
00177         if( ( vecsThreads[ i ].m_dMinP < dMinP ) ||
00178             ( !vecsThreads[ i ].m_dMinP && ( vecsThreads[ i ].m_dMaxCorr > dMaxCorr ) ) ) {
00179             dMinP = vecsThreads[ i ].m_dMinP;
00180             dMaxCorr = vecsThreads[ i ].m_dMaxCorr;
00181             iOne = vecsThreads[ i ].m_iOne;
00182             iTwo = vecsThreads[ i ].m_iTwo; } }
00183     if( ( dMinP * PCL.GetGenes( ) * ( PCL.GetGenes( ) - 1 ) * dFraction * dFraction ) <= ( dPValue * 2 ) ) {
00184         g_CatSleipnir( ).info( "CCoalesceClusterImpl::AddSeedPair( %g, %g ) seeding: %s, %s, %g (p=%g)",
00185             dFraction, dPValue, PCL.GetGene( iOne ).c_str( ), PCL.GetGene( iTwo ).c_str( ), dMaxCorr, dMinP );
00186         priiSeed.first = iOne;
00187         priiSeed.second = iTwo;
00188         setpriiSeeds.insert( priiSeed );
00189         Add( iOne, Pot );
00190         Add( iTwo, Pot );
00191         return true; }
00192 
00193     g_CatSleipnir( ).notice( "CCoalesceClusterImpl::AddSeedPair( %g, %g ) inadequate seed pair: %s, %s, %g (p=%g)",
00194         dFraction, dPValue, PCL.GetGene( iOne ).c_str( ), PCL.GetGene( iTwo ).c_str( ), dMaxCorr, dMinP );
00195     return false; }
00196 
00197 bool CCoalesceClusterImpl::AddCorrelatedGenes( const CPCL& PCL, CCoalesceCluster& Pot, const vector<float>& vecdSeed, float dPValue ) {
00198     size_t  iGene, iN;
00199     double  dR;
00200 
00201     CalculateCentroid( PCL );
00202     if( vecdSeed.size( ) ) {
00203         if( vecdSeed.size( ) == m_vecdCentroid.size( ) )
00204             copy( vecdSeed.begin( ), vecdSeed.end( ), m_vecdCentroid.begin( ) );
00205         else
00206             g_CatSleipnir( ).error( "CCoalesceClusterImpl::AddCorrelatedGenes( %g ) invalid seed provided, ignoring", dPValue ); }
00207     for( iGene = 0; iGene < PCL.GetGenes( ); ++iGene )
00208         if( !IsGene( iGene ) &&
00209             ( ( dR = CMeasurePearson::Pearson( &m_vecdCentroid.front( ), PCL.GetExperiments( ),
00210             PCL.Get( iGene ), PCL.GetExperiments( ), IMeasure::EMapNone, NULL, NULL, &iN ) ) > 0 ) &&
00211             ( ( CStatistics::PValuePearson( dR, iN ) * PCL.GetGenes( ) ) <= dPValue ) )
00212             Add( iGene, Pot );
00213 
00214     return true; }
00215 
00266 void CCoalesceCluster::CalculateHistograms( const CCoalesceGeneScores& GeneScores,
00267     CCoalesceGroupHistograms& HistogramsCluster, CCoalesceGroupHistograms* pHistogramsPot ) const {
00268     set<size_t>::const_iterator iterGene;
00269     size_t                      i;
00270 
00271     if( !GeneScores.GetMotifs( ) )
00272         return;
00273     for( iterGene = GetGenes( ).begin( ); iterGene != GetGenes( ).end( ); ++iterGene )
00274         if( !binary_search( m_veciPrevGenes.begin( ), m_veciPrevGenes.end( ), *iterGene ) ) {
00275             HistogramsCluster.Add( GeneScores, *iterGene, false );
00276             if( pHistogramsPot )
00277                 pHistogramsPot->Add( GeneScores, *iterGene, true ); }
00278     for( i = 0; i < m_veciPrevGenes.size( ); ++i )
00279         if( GetGenes( ).find( m_veciPrevGenes[ i ] ) == GetGenes( ).end( ) ) {
00280             HistogramsCluster.Add( GeneScores, m_veciPrevGenes[ i ], true );
00281             if( pHistogramsPot )
00282                 pHistogramsPot->Add( GeneScores, m_veciPrevGenes[ i ], false ); } }
00283 
00301 void CCoalesceCluster::Subtract( CPCL& PCL, const CCoalesceCluster& Pot ) const {
00302     set<size_t>::const_iterator iterGene, iterDataset;
00303     size_t                      i, iCondition;
00304     float                       d, dAve;
00305 
00306     for( iterGene = GetGenes( ).begin( ); iterGene != GetGenes( ).end( ); ++iterGene )
00307         for( iterDataset = m_setiDatasets.begin( ); iterDataset != m_setiDatasets.end( ); ++iterDataset )
00308             for( i = 0; i < GetConditions( *iterDataset ); ++i ) {
00309                 iCondition = GetCondition( *iterDataset, i );
00310                 if( !CMeta::IsNaN( d = m_vecdCentroid[ iCondition ] ) ) {
00311                     if( CMeta::IsNaN( dAve = Pot.m_vecdCentroid[ iCondition ] ) )
00312                         dAve = 0;
00313                     else
00314                         dAve = ( ( GetGenes( ).size( ) * d ) + ( Pot.GetGenes( ).size( ) * dAve ) ) /
00315                             ( GetGenes( ).size( ) + Pot.GetGenes( ).size( ) );
00316                     PCL.Get( *iterGene, iCondition ) -= d - dAve; } } }
00317 
00330 void CCoalesceCluster::Subtract( CCoalesceGeneScores& GeneScores ) const {
00331     set<size_t>::const_iterator         iterGene;
00332     set<SMotifMatch>::const_iterator    iterMotif;
00333 
00334     for( iterGene = GetGenes( ).begin( ); iterGene != GetGenes( ).end( ); ++iterGene )
00335         for( iterMotif = m_setsMotifs.begin( ); iterMotif != m_setsMotifs.end( ); ++iterMotif )
00336             GeneScores.Subtract( *iterMotif, *iterGene ); }
00337 
00338 void CCoalesceClusterImpl::CalculateCentroid( const CPCL& PCL ) {
00339     set<size_t>::const_iterator iterGene;
00340     size_t                      i, j;
00341     float                       d;
00342 
00343     m_veciCounts.resize( PCL.GetExperiments( ) );
00344     fill( m_veciCounts.begin( ), m_veciCounts.end( ), 0 );
00345     m_vecdCentroid.resize( PCL.GetExperiments( ) );
00346     fill( m_vecdCentroid.begin( ), m_vecdCentroid.end( ), 0.0f );
00347     m_vecdStdevs.resize( PCL.GetExperiments( ) );
00348     fill( m_vecdStdevs.begin( ), m_vecdStdevs.end( ), 0.0f );
00349     for( iterGene = GetGenes( ).begin( ); iterGene != GetGenes( ).end( ); ++iterGene )
00350         for( i = 0; i < m_veciCounts.size( ); ++i )
00351             if( !CMeta::IsNaN( d = PCL.Get( *iterGene, i ) ) ) {
00352                 m_veciCounts[ i ]++;
00353                 m_vecdCentroid[ i ] += d;
00354                 m_vecdStdevs[ i ] += d * d; }
00355     for( i = 0; i < m_veciCounts.size( ); ++i ) {
00356         if( j = m_veciCounts[ i ] ) {
00357             m_vecdCentroid[ i ] /= j;
00358             m_vecdStdevs[ i ] = ( m_vecdStdevs[ i ] / ( ( j == 1 ) ? 1 : ( j - 1 ) ) ) -
00359                 ( m_vecdCentroid[ i ] * m_vecdCentroid[ i ] );
00360             m_vecdStdevs[ i ] = ( m_vecdStdevs[ i ] < 0 ) ? 0 : sqrt( m_vecdStdevs[ i ] ); }
00361         else
00362             m_vecdCentroid[ i ] = CMeta::GetNaN( );
00363         g_CatSleipnir( ).debug( "CCoalesceClusterImpl::CalculateCentroid( ) condition %d: count %d, mean %g, stdev %g",
00364             i, m_veciCounts[ i ], m_vecdCentroid[ i ], m_vecdStdevs[ i ] ); }
00365 
00366     for( i = 0; i < m_vecsDatasets.size( ); ++i ) {
00367         SDataset&   sDataset    = m_vecsDatasets[ i ];
00368 
00369         if( sDataset.GetConditions( ) == 1 )
00370             continue;
00371         sDataset.m_vecdCentroid.resize( sDataset.GetConditions( ) );
00372         for( j = 0; j < sDataset.GetConditions( ); ++j )
00373             sDataset.m_vecdCentroid[ j ] = m_vecdCentroid[ sDataset.GetCondition( j ) ]; } }
00374 
00375 void* CCoalesceClusterImpl::ThreadSelectCondition( void* pData ) {
00376     vector<float>           vecdDatasetCluster, vecdDatasetPot;
00377     vector<size_t>          veciDatasetCluster, veciDatasetPot;
00378     size_t                  iDataset, iCondition, iCluster, iPot, iGene;
00379     double                  dP, dZ;
00380     float                   d;
00381     SThreadSelectCondition* psData;
00382     float*                  adCluster;
00383     float*                  adPot;
00384 
00385     psData = (SThreadSelectCondition*)pData;
00386     const vector<size_t>&   veciPot     = *psData->m_pveciPot;
00387     const vector<size_t>&   veciCluster = *psData->m_pveciCluster;
00388     const CPCL&             PCL         = *psData->m_pPCL;
00389 
00390     adCluster = new float[ veciCluster.size( ) ];
00391     adPot = new float[ veciPot.size( ) ];
00392     for( iDataset = psData->m_iOffset; iDataset < psData->m_pvecsDatasets->size( );
00393         iDataset += psData->m_iStep ) {
00394         SDataset&   sDataset    = (*psData->m_pvecsDatasets)[ iDataset ];
00395 
00396         if( sDataset.GetConditions( ) == 1 ) {
00397             iCondition = sDataset.GetCondition( 0 );
00398             for( iCluster = iPot = iGene = 0; iGene < veciCluster.size( ); ++iGene )
00399                 if( !CMeta::IsNaN( d = PCL.Get( veciCluster[ iGene ], iCondition ) ) )
00400                     adCluster[ iCluster++ ] = d;
00401             for( iGene = 0; iGene < veciPot.size( ); ++iGene )
00402                 if( !CMeta::IsNaN( d = PCL.Get( veciPot[ iGene ], iCondition ) ) )
00403                     adPot[ iPot++ ] = d;
00404             if( !( iCluster && iPot ) )
00405                 continue;
00406             dZ = CStatistics::ZScore( adCluster, adCluster + iCluster, adPot, adPot + iPot );
00407             sDataset.m_dP = CStatistics::ZTest( dZ, min( iCluster, iPot ) );
00408             sDataset.m_dZ = (float)min( dZ, (double)FLT_MAX ); }
00409         else {
00410             vecdDatasetCluster.resize( sDataset.GetConditions( ) );
00411             fill( vecdDatasetCluster.begin( ), vecdDatasetCluster.end( ), 0.0f );
00412             vecdDatasetPot.resize( sDataset.GetConditions( ) );
00413             fill( vecdDatasetPot.begin( ), vecdDatasetPot.end( ), 0.0f );
00414             veciDatasetCluster.resize( sDataset.GetConditions( ) );
00415             fill( veciDatasetCluster.begin( ), veciDatasetCluster.end( ), 0 );
00416             veciDatasetPot.resize( sDataset.GetConditions( ) );
00417             fill( veciDatasetPot.begin( ), veciDatasetPot.end( ), 0 );
00418             for( iGene = 0; iGene < veciCluster.size( ); ++iGene )
00419                 for( iCondition = 0; iCondition < sDataset.GetConditions( ); ++iCondition )
00420                     if( !CMeta::IsNaN( d = PCL.Get( veciCluster[ iGene ],
00421                         sDataset.GetCondition( iCondition ) ) ) ) {
00422                         veciDatasetCluster[ iCondition ]++;
00423                         vecdDatasetCluster[ iCondition ] += d; }
00424             for( iGene = 0; iGene < veciPot.size( ); ++iGene )
00425                 for( iCondition = 0; iCondition < sDataset.GetConditions( ); ++iCondition )
00426                     if( !CMeta::IsNaN( d = PCL.Get( veciPot[ iGene ],
00427                         sDataset.GetCondition( iCondition ) ) ) ) {
00428                         veciDatasetPot[ iCondition ]++;
00429                         vecdDatasetPot[ iCondition ] += d; }
00430             for( iCluster = iPot = iCondition = 0; iCondition < vecdDatasetCluster.size( ); ++iCondition ) {
00431                 if( veciDatasetCluster[ iCondition ] > iCluster )
00432                     iCluster = veciDatasetCluster[ iCondition ];
00433                 if( veciDatasetPot[ iCondition ] > iPot )
00434                     iPot = veciDatasetPot[ iCondition ];
00435                 if( veciDatasetCluster[ iCondition ] )
00436                     vecdDatasetCluster[ iCondition ] /= veciDatasetCluster[ iCondition ];
00437                 if( veciDatasetPot[ iCondition ] )
00438                     vecdDatasetPot[ iCondition ] /= veciDatasetPot[ iCondition ]; }
00439             dP = CStatistics::MultivariateNormalCDF( vecdDatasetCluster, vecdDatasetPot,
00440                 sDataset.m_psDataset->m_MatSigmaChol, min( iCluster, iPot ) );
00441             if( dP > 0.5 )
00442                 dP = 1 - dP;
00443 
00444             sDataset.m_dP = 2 * dP;
00445             dZ = 0;
00446             for( iCondition = 0; iCondition < sDataset.GetConditions( ); ++iCondition )
00447                 if( sDataset.m_psDataset->m_vecdStdevs[ iCondition ] ) {
00448                     d = ( vecdDatasetCluster[ iCondition ] - vecdDatasetPot[ iCondition ] ) /
00449                         sDataset.m_psDataset->m_vecdStdevs[ iCondition ];
00450                     dZ += d * d; }
00451             dZ = sqrt( dZ );
00452             sDataset.m_dZ = (float)min( dZ, (double)FLT_MAX ); } }
00453     delete[] adCluster;
00454     delete[] adPot;
00455 
00456     return NULL; }
00457 
00489 bool CCoalesceCluster::SelectConditions( const CPCL& PCL, const CCoalesceCluster& Pot, size_t iThreads,
00490     float dPValue, float dZScore ) {
00491     vector<pthread_t>               vecpthdThreads;
00492     vector<SThreadSelectCondition>  vecsThreads;
00493     size_t                          i;
00494     vector<size_t>                  veciCluster, veciPot;
00495 
00496     m_setiDatasets.clear( );
00497     veciCluster.resize( GetGenes( ).size( ) );
00498     copy( GetGenes( ).begin( ), GetGenes( ).end( ), veciCluster.begin( ) );
00499     veciPot.resize( Pot.GetGenes( ).size( ) );
00500     copy( Pot.GetGenes( ).begin( ), Pot.GetGenes( ).end( ), veciPot.begin( ) );
00501     vecpthdThreads.resize( iThreads );
00502     vecsThreads.resize( vecpthdThreads.size( ) );
00503     for( i = 0; i < vecsThreads.size( ); ++i ) {
00504         vecsThreads[ i ].m_iOffset = i;
00505         vecsThreads[ i ].m_iStep = vecsThreads.size( );
00506         vecsThreads[ i ].m_pveciCluster = &veciCluster;
00507         vecsThreads[ i ].m_pveciPot = &veciPot;
00508         vecsThreads[ i ].m_pvecsDatasets = &m_vecsDatasets;
00509         vecsThreads[ i ].m_pPCL = &PCL;
00510         if( pthread_create( &vecpthdThreads[ i ], NULL, ThreadSelectCondition, &vecsThreads[ i ] ) ) {
00511             g_CatSleipnir( ).error( "CCoalesceCluster::SelectConditions( %d, %g ) could not select conditions",
00512                 iThreads, dZScore );
00513             return false; } }
00514     for( i = 0; i < vecpthdThreads.size( ); ++i )
00515         pthread_join( vecpthdThreads[ i ], NULL );
00516 
00517     dPValue /= m_vecsDatasets.size( );
00518     for( i = 0; i < m_vecsDatasets.size( ); ++i ) {
00519         const SDataset& sDataset    = m_vecsDatasets[ i ];
00520 
00521         if( ( sDataset.m_dP < dPValue ) && ( fabs( sDataset.m_dZ ) > dZScore ) ) {
00522             g_CatSleipnir( ).debug( "CCoalesceCluster::SelectConditions( %d, %g ) selected dataset %d at %g, z=%g",
00523                 iThreads, dZScore, i, sDataset.m_dP, sDataset.m_dZ );
00524             m_setiDatasets.insert( i ); }
00525         else
00526             g_CatSleipnir( ).debug( "CCoalesceCluster::SelectConditions( %d, %g ) rejected dataset %d at %g, z=%g",
00527                 iThreads, dZScore, i, sDataset.m_dP, sDataset.m_dZ ); }
00528 
00529     return true; }
00530 
00531 void* CCoalesceClusterImpl::ThreadSelectMotif( void* pData ) {
00532     SThreadSelectMotif* psData;
00533     uint32_t            iMotif;
00534 
00535     psData = (SThreadSelectMotif*)pData;
00536     for( iMotif = psData->m_iOffset; iMotif < ( psData->m_pveciMotifs ? psData->m_pveciMotifs->size( ) :
00537         psData->m_pMotifs->GetMotifs( ) ); iMotif += psData->m_iStep )
00538         if( !AddSignificant( *psData->m_pMotifs,  psData->m_pveciMotifs ? (*psData->m_pveciMotifs)[ iMotif ] :
00539             iMotif, *psData->m_pHistsCluster, *psData->m_pHistsPot, psData->m_dPValue, psData->m_dZScore,
00540             psData->m_vecsMotifs ) )
00541             break;
00542 
00543     return NULL; }
00544 
00588 bool CCoalesceCluster::SelectMotifs( const CCoalesceGroupHistograms& HistsCluster,
00589     const CCoalesceGroupHistograms& HistsPot, float dPValue, float dZScore, size_t iMaxMotifs,
00590     size_t iThreads, const CCoalesceMotifLibrary* pMotifs ) {
00591     uint32_t                    i, j;
00592     vector<pthread_t>           vecpthdThreads;
00593     vector<SThreadSelectMotif>  vecsThreads;
00594     vector<uint32_t>            veciMotifs;
00595 
00596     if( !pMotifs )
00597         return true;
00598     if( m_setsMotifs.size( ) > iMaxMotifs ) {
00599         set<uint32_t>                       setiMotifs;
00600         set<SMotifMatch>::const_iterator    iterMotif;
00601 
00602         for( iterMotif = m_setsMotifs.begin( ); iterMotif != m_setsMotifs.end( ); ++iterMotif )
00603             setiMotifs.insert( iterMotif->m_iMotif );
00604         veciMotifs.resize( setiMotifs.size( ) );
00605         copy( setiMotifs.begin( ), setiMotifs.end( ), veciMotifs.begin( ) ); }
00606     m_setsMotifs.clear( );
00607     vecpthdThreads.resize( iThreads );
00608     vecsThreads.resize( vecpthdThreads.size( ) );
00609     for( i = 0; i < vecsThreads.size( ); ++i ) {
00610         vecsThreads[ i ].m_iOffset = i;
00611         vecsThreads[ i ].m_iStep = vecsThreads.size( );
00612         vecsThreads[ i ].m_pMotifs = pMotifs;
00613         vecsThreads[ i ].m_pHistsCluster = &HistsCluster;
00614         vecsThreads[ i ].m_pHistsPot = &HistsPot;
00615         vecsThreads[ i ].m_dPValue = dPValue;
00616         vecsThreads[ i ].m_dZScore = dZScore;
00617         vecsThreads[ i ].m_pveciMotifs = veciMotifs.empty( ) ? NULL : &veciMotifs;
00618         if( pthread_create( &vecpthdThreads[ i ], NULL, ThreadSelectMotif, &vecsThreads[ i ] ) ) {
00619             g_CatSleipnir( ).error( "CCoalesceCluster::SelectMotifs( %g, %d ) could not select motifs",
00620                 dZScore, iThreads );
00621             return false; } }
00622     for( i = 0; i < vecpthdThreads.size( ); ++i ) {
00623         pthread_join( vecpthdThreads[ i ], NULL );
00624         for( j = 0; j < vecsThreads[ i ].m_vecsMotifs.size( ); ++j )
00625             m_setsMotifs.insert( vecsThreads[ i ].m_vecsMotifs[ j ] ); }
00626 
00627     return true; }
00628 
00629 bool CCoalesceClusterImpl::AddSignificant( const CCoalesceMotifLibrary& Motifs, uint32_t iMotif,
00630     const CCoalesceGroupHistograms& HistsCluster, const CCoalesceGroupHistograms& HistsPot, float dPValue,
00631     float dZScore, vector<SMotifMatch>& vecsMotifs ) {
00632     size_t                                  iTypeCluster, iTypePot;
00633     double                                  dP, dAveOne, dAverage, dZ;
00634     CCoalesceSequencerBase::ESubsequence    eSubsequence;
00635 
00636     dPValue /= Motifs.GetMotifs( );
00637     for( iTypeCluster = 0; iTypeCluster < HistsCluster.GetTypes( ); ++iTypeCluster ) {
00638         const string&   strTypeCluster  = HistsCluster.GetType( iTypeCluster );
00639 
00640         if( ( iTypePot = HistsPot.GetType( strTypeCluster ) ) == -1 )
00641             continue;
00642         for( eSubsequence = CCoalesceSequencerBase::ESubsequenceBegin;
00643             (size_t)eSubsequence < HistsCluster.GetSubsequences( iTypeCluster );
00644             eSubsequence = (CCoalesceSequencerBase::ESubsequence)( (size_t)eSubsequence + 1 ) ) {
00645             const CCoalesceHistogramSet<>&  HistSetCluster  = HistsCluster.Get( iTypeCluster, eSubsequence );
00646             const CCoalesceHistogramSet<>&  HistSetPot      = HistsPot.Get( iTypePot, eSubsequence );
00647 
00648             if( ( HistSetCluster.GetMembers( ) <= iMotif ) ||
00649                 ( HistSetPot.GetMembers( ) <= iMotif ) ||
00650                 !( HistSetCluster.GetTotal( ) && HistSetPot.GetTotal( ) ) )
00651                 continue;
00652             dP = HistSetCluster.CohensD( iMotif, HistSetPot, dAveOne, dAverage, dZ );
00653             if( ( dP < dPValue ) && ( fabs( dZ ) > dZScore ) ) {
00654                 SMotifMatch sMotif( iMotif, strTypeCluster, eSubsequence, (float)dZ, (float)( dAveOne -
00655                     dAverage ) );
00656 
00657                 if( g_CatSleipnir( ).isInfoEnabled( ) ) {
00658                     ostringstream   ossm;
00659 
00660                     ossm << "CCoalesceClusterImpl::AddSignificant( " << iMotif << ", " << dZScore <<
00661                         " ) adding at " << dP << ":" << endl << sMotif.Save( &Motifs ) << endl <<
00662                         "Cluster    " << HistSetCluster.Save( iMotif ) << endl <<
00663                         "Pot    " << HistSetPot.Save( iMotif );
00664                     g_CatSleipnir( ).info( ossm.str( ).c_str( ) ); }
00665                 vecsMotifs.push_back( sMotif ); }
00666             else if( g_CatSleipnir( ).isDebugEnabled( ) ) {
00667                 ostringstream   ossm;
00668 
00669                 ossm << "CCoalesceClusterImpl::AddSignificant( " << iMotif << ", " << dZScore <<
00670                     " ) failed at " << dP << ":" << endl << SMotifMatch( iMotif, strTypeCluster, eSubsequence,
00671                     (float)dZ, (float)( dAveOne - dAverage ) ).Save( &Motifs ) << endl <<
00672                     "Cluster    " << HistSetCluster.Save( iMotif ) << endl <<
00673                     "Pot    " << HistSetPot.Save( iMotif );
00674                 g_CatSleipnir( ).debug( ossm.str( ).c_str( ) ); } } }
00675 
00676     return true; }
00677 
00678 void* CCoalesceClusterImpl::ThreadCentroid( void* pData ) {
00679     SThreadCentroid*    psData;
00680 
00681     psData = (SThreadCentroid*)pData;
00682     psData->m_pCluster->CalculateCentroid( psData->m_PCL );
00683 
00684     return NULL; }
00685 
00686 void* CCoalesceClusterImpl::ThreadSignificantGene( void* pData ) {
00687     SThreadSignificantGene* psData;
00688     size_t                  i;
00689 
00690     psData = (SThreadSignificantGene*)pData;
00691     for( i = psData->m_iOffset; i < psData->m_pvecfSignificant->size( ); i += psData->m_iStep )
00692         (*psData->m_pvecfSignificant)[ i ] = psData->m_pCluster->IsSignificant( i, *psData->m_pPCL,
00693             *psData->m_pvecdStdevs, psData->m_pMotifs, *psData->m_pGeneScores, *psData->m_pHistsCluster,
00694             *psData->m_pHistsPot, *psData->m_pPot, *psData->m_pveciDatasets, psData->m_dBeta,
00695             psData->m_iMinimum, psData->m_dProbability );
00696 
00697     return NULL; }
00698 
00755 bool CCoalesceCluster::SelectGenes( const CPCL& PCL, const CCoalesceGeneScores& GeneScores,
00756     const CCoalesceGroupHistograms& HistsCluster, const CCoalesceGroupHistograms& HistsPot, size_t iMinimum,
00757     size_t iThreads, CCoalesceCluster& Pot, float dProbability, const CCoalesceMotifLibrary* pMotifs ) {
00758     size_t                              i, iCluster, iPot;
00759     vector<pthread_t>                   vecpthdThreads;
00760     vector<bool>                        vecfSignificant;
00761     vector<SThreadSignificantGene>      vecsThreads;
00762     vector<size_t>                      veciDatasets;
00763     float                               dSSCluster, dSSPot, dAve;
00764     set<size_t>                         setiMotifs;
00765     set<SMotifMatch>::const_iterator    iterMotif;
00766     vector<float>                       vecdStdevs;
00767 
00768     vecpthdThreads.resize( iThreads );
00769     {
00770         SThreadCentroid sUs( this, PCL ), sThem( &Pot, PCL );
00771 
00772         if( iThreads > 1 ) {
00773             if( pthread_create( &vecpthdThreads[ 0 ], NULL, ThreadCentroid, &sUs ) ||
00774                 pthread_create( &vecpthdThreads[ 1 ], NULL, ThreadCentroid, &sThem ) ) {
00775                 g_CatSleipnir( ).error( "CCoalesceCluster::SelectGenes( %d, %g ) could not calculate centroids",
00776                     iThreads, dProbability );
00777                 return false; }
00778             for( i = 0; i < 2; ++i )
00779                 pthread_join( vecpthdThreads[ i ], NULL ); }
00780         else if( ThreadCentroid( &sUs ) || ThreadCentroid( &sThem ) ) {
00781             g_CatSleipnir( ).error( "CCoalesceCluster::SelectGenes( %d, %g ) could not calculate centroids",
00782                 iThreads, dProbability );
00783             return false; }
00784     }
00785     iCluster = GetGenes( ).size( );
00786     iPot = Pot.GetGenes( ).size( );
00787     vecdStdevs.resize( m_vecdStdevs.size( ) );
00788     for( i = 0; i < vecdStdevs.size( ); ++i ) {
00789         dSSCluster = m_vecdStdevs[ i ];
00790         dSSCluster = iCluster * ( ( dSSCluster * dSSCluster ) + ( m_vecdCentroid[ i ] *
00791             m_vecdCentroid[ i ] ) );
00792         dSSPot = Pot.m_vecdStdevs[ i ];
00793         dSSPot = iPot * ( ( dSSPot * dSSPot ) + ( Pot.m_vecdCentroid[ i ] * Pot.m_vecdCentroid[ i ] ) );
00794         dAve = ( ( m_vecdCentroid[ i ] * iCluster ) + ( Pot.m_vecdCentroid[ i ] * iPot ) ) / PCL.GetGenes( );
00795         vecdStdevs[ i ] = sqrt( ( ( dSSCluster + dSSPot ) / ( iCluster + iPot ) ) - ( dAve * dAve ) ); }
00796 
00797     veciDatasets.resize( m_setiDatasets.size( ) );
00798     copy( m_setiDatasets.begin( ), m_setiDatasets.end( ), veciDatasets.begin( ) );
00799     vecfSignificant.resize( PCL.GetGenes( ) );
00800     vecsThreads.resize( vecpthdThreads.size( ) );
00801     for( i = 0; i < vecpthdThreads.size( ); ++i ) {
00802         vecsThreads[ i ].m_iOffset = i;
00803         vecsThreads[ i ].m_iStep = vecpthdThreads.size( );
00804         vecsThreads[ i ].m_pvecfSignificant = &vecfSignificant;
00805         vecsThreads[ i ].m_pPCL = &PCL;
00806         vecsThreads[ i ].m_pMotifs = pMotifs;
00807         vecsThreads[ i ].m_pGeneScores = &GeneScores;
00808         vecsThreads[ i ].m_pHistsCluster = &HistsCluster;
00809         vecsThreads[ i ].m_pHistsPot = &HistsPot;
00810         vecsThreads[ i ].m_pCluster = this;
00811         vecsThreads[ i ].m_pPot = &Pot;
00812         vecsThreads[ i ].m_pveciDatasets = &veciDatasets;
00813         vecsThreads[ i ].m_pvecdStdevs = &vecdStdevs;
00814         vecsThreads[ i ].m_dBeta = m_setsMotifs.size( ) ? ( (float)m_setsMotifs.size( ) / ( m_setiDatasets.size( ) + m_setsMotifs.size( ) ) ) : 0.5f;
00815         vecsThreads[ i ].m_iMinimum = iMinimum;
00816         vecsThreads[ i ].m_dProbability = dProbability;
00817         if( pthread_create( &vecpthdThreads[ i ], NULL, ThreadSignificantGene, &vecsThreads[ i ] ) ) {
00818             g_CatSleipnir( ).error( "CCoalesceCluster::SelectGenes( %d, %g ) could not calculate significance",
00819                 iThreads, dProbability );
00820             return false; } }
00821     for( i = 0; i < vecpthdThreads.size( ); ++i )
00822         pthread_join( vecpthdThreads[ i ], NULL );
00823     m_setiGenes.clear( );
00824     Pot.m_setiGenes.clear( );
00825     for( i = 0; i < vecfSignificant.size( ); ++i )
00826         if( vecfSignificant[ i ] ) {
00827             m_setiGenes.insert( i );
00828             m_vecdPriors[ i ] = dProbability; }
00829         else {
00830             Pot.m_setiGenes.insert( i );
00831             m_vecdPriors[ i ] = 1 - dProbability; }
00832 
00833     return true; }
00834 
00835 bool CCoalesceClusterImpl::IsSignificant( size_t iGene, const CPCL& PCL, const vector<float>& vecdStdevs,
00836     const CCoalesceMotifLibrary* pMotifs, const CCoalesceGeneScores& GeneScores,
00837     const CCoalesceGroupHistograms& HistsCluster, const CCoalesceGroupHistograms& HistsPot,
00838     const CCoalesceCluster& Pot, const vector<size_t>& veciDatasets, float dBeta, size_t iMinimum,
00839     float dProbability ) const {
00840     float   d, dP, dLogPMotifsGivenIn, dLogPMotifsGivenOut, dLogPExpressionGivenIn, dLogPExpressionGivenOut;
00841     bool    fClustered;
00842 
00843     fClustered = IsGene( iGene );
00844 // We want P(g in C|S,E) =
00845 // P(S,E|g in C)P(g in C)/P(S,E) =
00846 // P(S|g in C)P(E|g in C)P(g in C)/(P(S,E|g in C)P(g in C) + P(S,E|g notin C)P(g notin C)) =
00847 // P(S|g in C)P(E|g in C)P(g in C)/(P(S|g in C)P(E|g in C)P(g in C) + P(S|g notin C)P(E|g notin C)P(g notin C))
00848     if( !( CalculateProbabilityExpression( iGene, PCL, vecdStdevs, Pot, veciDatasets, fClustered,
00849         dLogPExpressionGivenIn, dLogPExpressionGivenOut ) && CalculateProbabilityMotifs( GeneScores, iGene,
00850         HistsCluster, HistsPot, fClustered, iMinimum, dLogPMotifsGivenIn, dLogPMotifsGivenOut ) ) )
00851         return false;
00852     if( ( d = m_vecdPriors[ iGene ] ) >= 1 )
00853         dP = 1;
00854     else if( d <= 0 )
00855         dP = 0;
00856     else
00857         dP = 1 / ( 1 + exp( ( dBeta * ( dLogPExpressionGivenOut - dLogPExpressionGivenIn ) + ( 1 - dBeta ) *
00858             ( dLogPMotifsGivenOut - dLogPMotifsGivenIn ) ) / ( 0.5f + 2 * pow( 0.5f - dBeta, 2 ) ) +
00859             log( ( 1 - d ) / d ) ) );
00860 
00861     if( g_CatSleipnir( ).isDebugEnabled( ) ) {
00862         set<SMotifMatch>::const_iterator    iterMotif;
00863         size_t                              iType;
00864 
00865         g_CatSleipnir( ).debug( "CCoalesceClusterImpl::IsSignificant( %s ) is %g, prior %g beta %g, exp. p=%g vs. %g, seq. p=%g vs %g",
00866             PCL.GetGene( iGene ).c_str( ), dP, d, dBeta, dLogPExpressionGivenIn, dLogPExpressionGivenOut,
00867             dLogPMotifsGivenIn, dLogPMotifsGivenOut );
00868         for( iterMotif = m_setsMotifs.begin( ); iterMotif != m_setsMotifs.end( ); ++iterMotif )
00869             if( ( iType = GeneScores.GetType( iterMotif->m_strType ) ) != -1 )
00870                 g_CatSleipnir( ).debug( "%g %s", GeneScores.Get( iType, iterMotif->m_eSubsequence, iGene,
00871                     iterMotif->m_iMotif ), iterMotif->Save( pMotifs ).c_str( ) ); }
00872 
00873     return ( dP >= dProbability ); }
00874 
00875 bool CCoalesceClusterImpl::CalculateProbabilityExpression( size_t iGene, const CPCL& PCL,
00876     const vector<float>& vecdStdevs, const CCoalesceCluster& Pot, const vector<size_t>& veciDatasets,
00877     bool fClustered, float& dLogPIn, float& dLogPOut ) const {
00878     static const double c_dEpsilonZero  = 1e-10;
00879     float               dGene, dCluster, dPot;
00880     double              dPCluster, dPPot, dStdev;
00881     size_t              iDataset, iPot, iCondition;
00882     long double         dPIn, dPOut;
00883 
00884     iPot = PCL.GetGenes( ) - GetGenes( ).size( );
00885     dLogPIn = dLogPOut = 0;
00886     dPIn = dPOut = 1;
00887     for( iDataset = 0; iDataset < veciDatasets.size( ); ++iDataset ) {
00888         const SDataset& sDataset    = m_vecsDatasets[ veciDatasets[ iDataset ] ];
00889 
00890         if( sDataset.GetConditions( ) == 1 ) {
00891             iCondition = sDataset.GetCondition( 0 );
00892             if( CMeta::IsNaN( dGene = PCL.Get( iGene, iCondition ) ) ||
00893                 CMeta::IsNaN( dCluster = m_vecdCentroid[ iCondition ] ) ||
00894                 CMeta::IsNaN( dPot = Pot.m_vecdCentroid[ iCondition ] ) )
00895                 continue;
00896 
00897             if( fClustered )
00898                 dCluster = ( ( dCluster * GetGenes( ).size( ) ) - dGene ) /
00899                     ( GetGenes( ).size( ) - 1 );
00900             else
00901                 dPot = ( ( dPot * iPot ) - dGene ) / ( iPot - 1 );
00902             dStdev = max( c_dEpsilonZero, (double)vecdStdevs[ iCondition ] );
00903             dPCluster = max( c_dEpsilonZero, CStatistics::NormalPDF( dGene, dCluster, dStdev ) );
00904             dPPot = max( c_dEpsilonZero, CStatistics::NormalPDF( dGene, dPot, dStdev ) ); }
00905         else {
00906             vector<float>   vecdGene;
00907 
00908             vecdGene.resize( sDataset.GetConditions( ) );
00909             for( iCondition = 0; iCondition < sDataset.GetConditions( ); ++iCondition )
00910                 vecdGene[ iCondition ] = PCL.Get( iGene, sDataset.GetCondition( iCondition ) );
00911             dPCluster = min( 1.0, max( c_dEpsilonZero, CStatistics::MultivariateNormalPDF( vecdGene,
00912                 sDataset.m_vecdCentroid, sDataset.m_psDataset->m_dSigmaDetSqrt,
00913                 sDataset.m_psDataset->m_MatSigmaInv ) ) );
00914             dPPot = min( 1.0, max( c_dEpsilonZero, CStatistics::MultivariateNormalPDF( vecdGene,
00915                 Pot.m_vecsDatasets[ veciDatasets[ iDataset ] ].m_vecdCentroid,
00916                 sDataset.m_psDataset->m_dSigmaDetSqrt, sDataset.m_psDataset->m_MatSigmaInv ) ) ); }
00917         dPIn *= dPCluster;
00918         dPOut *= dPPot;
00919         if( ( dPIn < DBL_MIN ) || ( dPOut < DBL_MIN ) ) {
00920             dLogPIn += (float)log( dPIn );
00921             dLogPOut += (float)log( dPOut );
00922             dPIn = dPOut = 1; } }
00923     dLogPIn += (float)log( dPIn );
00924     dLogPOut += (float)log( dPOut );
00925 
00926     return true; }
00927 
00928 bool CCoalesceClusterImpl::CalculateProbabilityMotifs( const CCoalesceGeneScores& GeneScores, size_t iGene,
00929     const CCoalesceGroupHistograms& HistsCluster, const CCoalesceGroupHistograms& HistsPot,
00930     bool fClustered, size_t iMinimum, float& dLogPIn, float& dLogPOut ) const {
00931     set<SMotifMatch>::const_iterator    iterMotif;
00932     size_t                              iType, iCluster, iPot;
00933     unsigned short                      sCluster, sPot;
00934     double                              dPCluster, dPPot;
00935     float                               dGene;
00936     const float*                        adValues;
00937     long double                         dPIn, dPOut;
00938 
00939     dLogPIn = dLogPOut = 0;
00940     if( m_setsMotifs.empty( ) )
00941         return true;
00942 
00943 // TODO: this is the slowest part of the entire algorithm
00944     dPIn = dPOut = 1;
00945     for( iterMotif = m_setsMotifs.begin( ); iterMotif != m_setsMotifs.end( ); ++iterMotif ) {
00946         if( ( ( iType = GeneScores.GetType( iterMotif->m_strType ) ) == -1 ) ||
00947             !( adValues = GeneScores.Get( iType, iterMotif->m_eSubsequence, iGene ) ) ||
00948             ( HistsCluster.GetType( iterMotif->m_strType ) == -1 ) ||
00949             ( HistsPot.GetType( iterMotif->m_strType ) == -1 ) )
00950             continue;
00951         dGene = adValues[ iterMotif->m_iMotif ];
00952         {
00953             const CCoalesceHistogramSet<>&  HistSetCluster  = HistsCluster.Get( iterMotif->m_strType,
00954                                                                 iterMotif->m_eSubsequence );
00955             const CCoalesceHistogramSet<>&  HistSetPot      = HistsPot.Get( iterMotif->m_strType,
00956                                                                 iterMotif->m_eSubsequence );
00957 
00958             sCluster = HistSetCluster.Get( iterMotif->m_iMotif, dGene );
00959             iCluster = HistSetCluster.GetTotal( );
00960             sPot = HistSetPot.Get( iterMotif->m_iMotif, dGene );
00961             iPot = HistSetPot.GetTotal( );
00962             if( ( iCluster < iMinimum ) || ( iPot < iMinimum ) )
00963                 continue;
00964 
00965             if( fClustered ) {
00966                 if( sCluster ) {
00967                     sCluster--;
00968                     iCluster--; }
00969                 else
00970                     g_CatSleipnir( ).warn( "CCoalesceClusterImpl::CalculateProbabilityMotifs( %d, %d, %g, %g ) no motifs of %d in cluster: %g, %s\n%s",
00971                         iGene, fClustered, dLogPIn, dLogPOut, iCluster, dGene,
00972                         iterMotif->Save( NULL ).c_str( ), HistSetCluster.Save( iterMotif->m_iMotif ).c_str( ) ); }
00973             else if( sPot ) {
00974                 sPot--;
00975                 iPot--; }
00976             else
00977                 g_CatSleipnir( ).warn( "CCoalesceClusterImpl::CalculateProbabilityMotifs( %d, %d, %g, %g ) no motifs of %d in pot: %g, %s\n%s",
00978                     iGene, fClustered, dLogPIn, dLogPOut, iPot, dGene, iterMotif->Save( NULL ).c_str( ),
00979                     HistSetPot.Save( iterMotif->m_iMotif ).c_str( ) );
00980             dPCluster = (double)( sCluster + 1 ) / ( iCluster + HistSetCluster.GetEdges( ) );
00981             dPPot = (double)( sPot + 1 ) / ( iPot + HistSetPot.GetEdges( ) );
00982         }
00983         dPIn *= dPCluster;
00984         dPOut *= dPPot;
00985         if( ( dPIn < DBL_MIN ) || ( dPOut < DBL_MIN ) ) {
00986             dLogPIn += (float)log( dPIn );
00987             dLogPOut += (float)log( dPOut );
00988             dPIn = dPOut = 1; } }
00989     dLogPIn += (float)log( dPIn );
00990     dLogPOut += (float)log( dPOut );
00991 
00992     return true; }
00993 
01033 bool CCoalesceCluster::Save( const std::string& strDirectory, size_t iID, const CPCL& PCL,
01034     const CCoalesceMotifLibrary* pMotifs ) const {
01035     ofstream                            ofsm;
01036     char*                               szTmp;
01037     CPCL                                PCLCopy;
01038     size_t                              iGeneFrom, iGeneTo, iExpFrom, iExpTo, iLength;
01039     string                              strBase;
01040     set<SMotifMatch>::const_iterator    iterMotif;
01041     set<size_t>                         setiConditions;
01042 
01043     GetConditions( setiConditions );
01044     szTmp = new char[ iLength = ( strDirectory.length( ) + 16 ) ];
01045     sprintf_s( szTmp, iLength - 1, "%s/c%04d_XXXXXX", strDirectory.empty( ) ? "." : strDirectory.c_str( ),
01046         iID );
01047     _mktemp_s( szTmp, iLength - 1 );
01048     szTmp[ iLength - 1 ] = 0;
01049     strBase = szTmp;
01050     delete[] szTmp;
01051 
01052     ofsm.open( ( strBase + c_szMotifs ).c_str( ) );
01053     if( !ofsm.is_open( ) )
01054         return false;
01055     for( iterMotif = m_setsMotifs.begin( ); iterMotif != m_setsMotifs.end( ); ++iterMotif )
01056         ofsm << iterMotif->Save( pMotifs ) << endl;
01057     ofsm.close( );
01058 
01059     PCLCopy.Open( PCL );
01060 // Reorder header
01061     for( iExpTo = iExpFrom = 0; iExpFrom < PCL.GetExperiments( ); ++iExpFrom )
01062         if( setiConditions.find( iExpFrom ) != setiConditions.end( ) )
01063             PCLCopy.SetExperiment( iExpTo++, c_cStar + PCL.GetExperiment( iExpFrom ) );
01064     for( iExpFrom = 0; iExpFrom < PCL.GetExperiments( ); ++iExpFrom )
01065         if( setiConditions.find( iExpFrom ) == setiConditions.end( ) )
01066             PCLCopy.SetExperiment( iExpTo++, PCL.GetExperiment( iExpFrom ) );
01067 // Reorder genes
01068     for( iGeneTo = iGeneFrom = 0; iGeneFrom < PCL.GetGenes( ); ++iGeneFrom )
01069         if( IsGene( iGeneFrom ) && !SaveCopy( PCL, setiConditions, iGeneFrom, PCLCopy, iGeneTo++, false ) )
01070             return false;
01071     for( iGeneFrom = 0; iGeneFrom < PCL.GetGenes( ); ++iGeneFrom )
01072         if( !IsGene( iGeneFrom ) )
01073             PCLCopy.MaskGene( iGeneTo++ );
01074 
01075     ofsm.clear( );
01076     ofsm.open( ( strBase + CPCL::GetExtension( ) ).c_str( ) );
01077     if( !ofsm.is_open( ) )
01078         return false;
01079     PCLCopy.Save( ofsm );
01080     ofsm.close( );
01081 
01082     return true; }
01083 
01084 bool CCoalesceClusterImpl::SaveCopy( const CPCL& PCLFrom, const std::set<size_t>& setiConditions,
01085     size_t iGeneFrom, CPCL& PCLTo, size_t iGeneTo, bool fClustered ) const {
01086     size_t  i, iExpTo, iExpFrom;
01087 
01088     PCLTo.SetGene( iGeneTo, PCLFrom.GetGene( iGeneFrom ) );
01089     for( i = 1; i < PCLFrom.GetFeatures( ); ++i )
01090         PCLTo.SetFeature( iGeneTo, i, (string)( ( fClustered && ( i == 1 ) ) ? "*" : "" ) +
01091             PCLFrom.GetFeature( iGeneFrom, i ) );
01092     for( iExpTo = iExpFrom = 0; iExpFrom < PCLFrom.GetExperiments( ); ++iExpFrom )
01093         if( setiConditions.find( iExpFrom ) != setiConditions.end( ) )
01094             PCLTo.Set( iGeneTo, iExpTo++, PCLFrom.Get( iGeneFrom, iExpFrom ) );
01095     for( iExpFrom = 0; iExpFrom < PCLFrom.GetExperiments( ); ++iExpFrom )
01096         if( setiConditions.find( iExpFrom ) == setiConditions.end( ) )
01097             PCLTo.Set( iGeneTo, iExpTo++, PCLFrom.Get( iGeneFrom, iExpFrom ) );
01098 
01099     return true; }
01100 
01151 void CCoalesceCluster::Save( std::ostream& ostm, size_t iID, const CPCL& PCL,
01152     const CCoalesceMotifLibrary* pMotifs, float dCutoffPWMs, float dPenaltyGap, float dPenaltyMismatch,
01153     bool fNoRCs ) const {
01154     set<size_t>::const_iterator         iterID;
01155     set<SMotifMatch>::const_iterator    iterMotif;
01156     size_t                              i;
01157     string                              strMotif;
01158 
01159     ostm << "Cluster\t" << iID << endl;
01160     ostm << c_szGenes;
01161     for( iterID = GetGenes( ).begin( ); iterID != GetGenes( ).end( ); ++iterID )
01162         ostm << '\t' << PCL.GetGene( *iterID );
01163     ostm << endl << c_szConditions;
01164     for( iterID = m_setiDatasets.begin( ); iterID != m_setiDatasets.end( ); ++iterID )
01165         for( i = 0; i < GetConditions( *iterID ); ++i )
01166             ostm << '\t' << PCL.GetExperiment( GetCondition( *iterID, i ) );
01167     ostm << endl << "Motifs" << endl;
01168     for( iterMotif = GetMotifs( ).begin( ); iterMotif != GetMotifs( ).end( ); ++iterMotif )
01169         if( !( strMotif = iterMotif->Save( pMotifs, true, dCutoffPWMs, dPenaltyGap, dPenaltyMismatch,
01170             fNoRCs ) ).empty( ) )
01171             ostm << strMotif << endl;
01172     ostm.flush( ); }
01173 
01201 size_t CCoalesceCluster::Open( const std::string& strPCL, size_t iSkip, const CPCL& PCL,
01202     CCoalesceMotifLibrary* pMotifs ) {
01203     CPCL        PCLCluster;
01204     size_t      i, j, iGene;
01205     string      strMotifs;
01206     ifstream    ifsm;
01207 
01208     Clear( );
01209     if( !PCLCluster.Open( strPCL.c_str( ), iSkip ) )
01210         return -1;
01211 
01212     for( i = 0; i < PCLCluster.GetExperiments( ); ++i ) {
01213         if( PCLCluster.GetExperiment( i )[ 0 ] != c_cStar )
01214             break;
01215         for( j = 0; j < PCL.GetExperiments( ); ++j )
01216             if( PCL.GetExperiment( j ) == ( PCLCluster.GetExperiment( i ).c_str( ) + 1 ) ) {
01217                 m_setiDatasets.insert( j );
01218                 break; } }
01219     for( i = 0; i < PCLCluster.GetGenes( ); ++i ) {
01220         if( ( iGene = PCL.GetGene( PCLCluster.GetGene( i ) ) ) == -1 ) {
01221             g_CatSleipnir( ).error( "CCoalesceCluster::Open( %s, %i ) unrecognized gene: %s", strPCL.c_str( ),
01222                 iSkip, PCLCluster.GetGene( i ).c_str( ) );
01223             return -1; }
01224         m_setiGenes.insert( iGene ); }
01225 
01226     if( !pMotifs || ( ( i = strPCL.rfind( CPCL::GetExtension( ) ) ) !=
01227         ( strPCL.length( ) - strlen( CPCL::GetExtension( ) ) ) ) )
01228         return PCLCluster.GetExperiments( );
01229     strMotifs = strPCL.substr( 0, i ) + c_szMotifs;
01230     ifsm.open( strMotifs.c_str( ) );
01231     if( !ifsm.is_open( ) )
01232         return PCLCluster.GetExperiments( );
01233     while( !ifsm.eof( ) && ( ifsm.peek( ) != -1 ) ) {
01234         SMotifMatch sMotif;
01235 
01236         if( !sMotif.Open( ifsm, *pMotifs ) ) {
01237             g_CatSleipnir( ).warn( "CCoalesceCluster::Open( %s, %d ) could not open: %s", strPCL.c_str( ),
01238                 iSkip, strMotifs.c_str( ) );
01239             return -1; }
01240         m_setsMotifs.insert( sMotif ); }
01241 
01242     return PCLCluster.GetExperiments( ); }
01243 
01268 size_t CCoalesceCluster::Open( std::istream& istm, const CPCL& PCL, CCoalesceMotifLibrary* pMotifs ) {
01269     string              strBuffer;
01270     vector<string>      vecstrLine;
01271     size_t              i, j;
01272     vector<SMotifMatch> vecsMotifs;
01273 
01274     Clear( );
01275     strBuffer.resize( CFile::GetBufferSize( ) );
01276     istm.getline( &strBuffer[ 0 ], strBuffer.size( ) );
01277 
01278     istm.getline( &strBuffer[ 0 ], strBuffer.size( ) );
01279     CMeta::Tokenize( strBuffer.c_str( ), vecstrLine );
01280     if( vecstrLine.empty( ) || ( vecstrLine[ 0 ] != c_szGenes ) ) {
01281         g_CatSleipnir( ).error( "CCoalesceCluster::Open( ) invalid line: %s", strBuffer.c_str( ) );
01282         return -1; }
01283     for( i = 1; i < vecstrLine.size( ); ++i ) {
01284         if( ( j = PCL.GetGene( vecstrLine[ i ] ) ) == -1 ) {
01285             g_CatSleipnir( ).warn( "CCoalesceCluster::Open( ) unrecognized gene: %s",
01286                 vecstrLine[ i ].c_str( ) );
01287             continue; }
01288         m_setiGenes.insert( j ); }
01289 
01290     istm.getline( &strBuffer[ 0 ], strBuffer.size( ) );
01291     vecstrLine.clear( );
01292     CMeta::Tokenize( strBuffer.c_str( ), vecstrLine );
01293     if( vecstrLine.empty( ) || ( vecstrLine[ 0 ] != c_szConditions ) ) {
01294         g_CatSleipnir( ).error( "CCoalesceCluster::Open( ) invalid line: %s", strBuffer.c_str( ) );
01295         return -1; }
01296     for( i = 1; i < vecstrLine.size( ); ++i ) {
01297         if( ( j = PCL.GetExperiment( vecstrLine[ i ] ) ) == -1 ) {
01298             g_CatSleipnir( ).error( "CCoalesceCluster::Open( ) unrecognized condition: %s",
01299                 vecstrLine[ i ].c_str( ) );
01300             return -1; }
01301         m_setiDatasets.insert( j ); }
01302 
01303     istm.getline( &strBuffer[ 0 ], CFile::GetBufferSize( ) );
01304     if( !CCoalesceMotifLibrary::Open( istm, vecsMotifs, pMotifs ) )
01305         return -1;
01306     for( i = 0; i < vecsMotifs.size( ); ++i )
01307         m_setsMotifs.insert( vecsMotifs[ i ] );
01308 
01309     return m_setiDatasets.size( ); }
01310 
01347 bool CCoalesceCluster::Open( const CHierarchy& Hierarchy, const std::vector<CCoalesceCluster>& vecClusters,
01348     const std::vector<std::string>& vecstrClusters, float dFraction, float dCutoff, size_t iCutoff,
01349     CCoalesceMotifLibrary* pMotifs ) {
01350     map<size_t, size_t>                     mapiiGenes, mapiiDatasets;
01351     size_t                                  i, iClusters;
01352     map<size_t, size_t>::const_iterator     iterItem;
01353     vector<map<string, set<SMotifMatch> > > vecmapstrsetsMotifs;
01354 
01355     vecmapstrsetsMotifs.resize( CCoalesceSequencerBase::ESubsequenceEnd );
01356     g_CatSleipnir( ).notice( "CCoalesceCluster::Open( %g ) merging clusters:", dFraction );
01357     if( !( iClusters = CCoalesceClusterImpl::Open( Hierarchy, vecClusters, vecstrClusters, mapiiGenes,
01358         mapiiDatasets, vecmapstrsetsMotifs ) ) ) {
01359         g_CatSleipnir( ).error( "CCoalesceCluster::Open( %g ) no clusters found", dFraction );
01360         return false; }
01361 
01362     Clear( );
01363     for( iterItem = mapiiGenes.begin( ); iterItem != mapiiGenes.end( ); ++iterItem )
01364         if( ( (float)iterItem->second / iClusters ) >= dFraction )
01365             m_setiGenes.insert( iterItem->first );
01366     for( iterItem = mapiiDatasets.begin( ); iterItem != mapiiDatasets.end( ); ++iterItem )
01367 //      if( ( (float)iterItem->second / iClusters ) >= dFraction )
01368             m_setiDatasets.insert( iterItem->first );
01369     if( !pMotifs )
01370         return true;
01371 
01372     for( i = 0; i < vecmapstrsetsMotifs.size( ); ++i ) {
01373         const map<string, set<SMotifMatch> >&           mapstrsetsMotifs    = vecmapstrsetsMotifs[ i ];
01374         map<string, set<SMotifMatch> >::const_iterator  iterMotifs;
01375 
01376         for( iterMotifs = mapstrsetsMotifs.begin( ); iterMotifs != mapstrsetsMotifs.end( ); ++iterMotifs ) {
01377             const set<SMotifMatch>& setsMotifs  = iterMotifs->second;
01378 
01379             if( !( ( setsMotifs.size( ) < iCutoff ) ?
01380                 CCoalesceClusterImpl::OpenMotifs( setsMotifs, *pMotifs, dCutoff ) :
01381                 CCoalesceClusterImpl::OpenMotifsHeuristic( setsMotifs, *pMotifs, dCutoff, iCutoff ) ) )
01382                 return false; } }
01383 
01384     return true; }
01385 
01386 bool CCoalesceClusterImpl::OpenMotifsHeuristic( const std::set<SMotifMatch>& setsMotifs,
01387     CCoalesceMotifLibrary& Motifs, float dCutoff, size_t iCutoff ) {
01388     vector<SMotifMatch> vecsMotifs;
01389     bool                fDone;
01390     size_t              i, iMotifs;
01391     set<SMotifMatch>    setsMerged;
01392 
01393     iMotifs = setsMotifs.size( );
01394     g_CatSleipnir( ).notice( "CCoalesceClusterImpl::OpenMotifsHeuristic( %g ) resolving %d motifs", dCutoff,
01395         iMotifs );
01396 
01397     vecsMotifs.resize( iMotifs );
01398     copy( setsMotifs.begin( ), setsMotifs.end( ), vecsMotifs.begin( ) );
01399     do {
01400         fDone = true;
01401         sort( vecsMotifs.begin( ), vecsMotifs.end( ) );
01402         for( i = 0; ( i + 1 ) < vecsMotifs.size( ); ++i ) {
01403             SMotifMatch&    sOne    = vecsMotifs[ i ];
01404             SMotifMatch&    sTwo    = vecsMotifs[ i + 1 ];
01405 
01406             if( ( sOne.m_iMotif == -1 ) || ( sTwo.m_iMotif == -1 ) )
01407                 break;
01408             if( Motifs.Align( sOne.m_iMotif, sTwo.m_iMotif, dCutoff ) > dCutoff )
01409                 continue;
01410             if( sTwo.Open( sOne, sTwo, Motifs ) == -1 )
01411                 return false;
01412             if( Motifs.GetPST( sTwo.m_iMotif )->Integrate( ) > iCutoff )
01413                 Motifs.Simplify( sTwo.m_iMotif );
01414             fDone = false;
01415             iMotifs--;
01416             sOne.m_iMotif = -1; } }
01417     while( !fDone );
01418 
01419     for( i = 0; i < vecsMotifs.size( ); ++i )
01420         if( vecsMotifs[ i ].m_iMotif != -1 )
01421             setsMerged.insert( vecsMotifs[ i ] );
01422 
01423     return OpenMotifs( setsMerged, Motifs, dCutoff ); }
01424 
01425 bool CCoalesceClusterImpl::OpenMotifs( const std::set<SMotifMatch>& setsMotifs, CCoalesceMotifLibrary& Motifs,
01426     float dCutoff ) {
01427     vector<SMotifMatch> vecsMotifs;
01428     CDistanceMatrix     MatSimilarity;
01429     CHierarchy*         pHierMotifs;
01430     size_t              i, j;
01431     bool                fRet;
01432 
01433     g_CatSleipnir( ).notice( "CCoalesceClusterImpl::OpenMotifs( %g ) resolving %d motifs", dCutoff,
01434         setsMotifs.size( ) );
01435 
01436     vecsMotifs.resize( setsMotifs.size( ) );
01437     copy( setsMotifs.begin( ), setsMotifs.end( ), vecsMotifs.begin( ) );
01438     MatSimilarity.Initialize( vecsMotifs.size( ) );
01439     for( i = 0; i < vecsMotifs.size( ); ++i )
01440         for( j = ( i + 1 ); j < vecsMotifs.size( ); ++j )
01441             MatSimilarity.Set( i, j, -Motifs.Align( vecsMotifs[ i ].m_iMotif,
01442                 vecsMotifs[ j ].m_iMotif, dCutoff ) );
01443     if( !( pHierMotifs = CClustHierarchical::Cluster( MatSimilarity ) ) )
01444         return false;
01445     fRet = CCoalesceClusterImpl::OpenMotifs( Motifs, *pHierMotifs, vecsMotifs, dCutoff, m_setsMotifs );
01446     pHierMotifs->Destroy( );
01447     return fRet; }
01448 
01449 size_t CCoalesceClusterImpl::Open( const CHierarchy& Hier, const std::vector<CCoalesceCluster>& vecClusters,
01450     const std::vector<std::string>& vecstrClusters, std::map<size_t, size_t>& mapiiGenes,
01451     std::map<size_t, size_t>& mapiiDatasets, TVecMapStrSetSMotifs& vecmapstrsetsMotifs ) {
01452     set<size_t>::const_iterator         iterFrom;
01453     map<size_t, size_t>::iterator       iterTo;
01454     set<SMotifMatch>::const_iterator    iterMotif;
01455 
01456     if( !Hier.IsGene( ) )
01457         return ( Open( Hier.Get( false ), vecClusters, vecstrClusters, mapiiGenes, mapiiDatasets,
01458             vecmapstrsetsMotifs ) + Open( Hier.Get( true ), vecClusters, vecstrClusters, mapiiGenes,
01459             mapiiDatasets, vecmapstrsetsMotifs ) );
01460 
01461     const CCoalesceCluster& Cluster = vecClusters[ Hier.GetID( ) ];
01462 
01463     g_CatSleipnir( ).notice( "CCoalesceClusterImpl::Open( ) cluster %s",
01464         vecstrClusters[ Hier.GetID( ) ].c_str( ) );
01465     for( iterFrom = Cluster.GetGenes( ).begin( ); iterFrom != Cluster.GetGenes( ).end( ); ++iterFrom )
01466         if( ( iterTo = mapiiGenes.find( *iterFrom ) ) == mapiiGenes.end( ) )
01467             mapiiGenes[ *iterFrom ] = 1;
01468         else
01469             iterTo->second++;
01470     for( iterFrom = Cluster.m_setiDatasets.begin( ); iterFrom != Cluster.m_setiDatasets.end( ); ++iterFrom )
01471         if( ( iterTo = mapiiDatasets.find( *iterFrom ) ) == mapiiDatasets.end( ) )
01472             mapiiDatasets[ *iterFrom ] = 1;
01473         else
01474             iterTo->second++;
01475     for( iterMotif = Cluster.m_setsMotifs.begin( ); iterMotif != Cluster.m_setsMotifs.end( ); ++iterMotif )
01476         vecmapstrsetsMotifs[ iterMotif->m_eSubsequence ][ iterMotif->m_strType ].insert( *iterMotif );
01477 
01478     return 1; }
01479 
01480 bool CCoalesceClusterImpl::OpenMotifs( CCoalesceMotifLibrary& Motifs, const CHierarchy& Hier,
01481     const std::vector<SMotifMatch>& vecsMotifs, float dCutoff, std::set<SMotifMatch>& setsMotifs ) {
01482 
01483     if( Hier.IsGene( ) || ( -Hier.GetSimilarity( ) < dCutoff ) ) {
01484         SMotifMatch sMotif;
01485         size_t      iCount;
01486 
01487         sMotif.m_dZ = 0;
01488         if( sMotif.Open( Hier, vecsMotifs, Motifs, iCount = 0 ) == -1 )
01489             return false;
01490         sMotif.m_dZ /= iCount;
01491         setsMotifs.insert( sMotif );
01492         return true; }
01493 
01494     return ( OpenMotifs( Motifs, Hier.Get( false ), vecsMotifs, dCutoff, setsMotifs ) &&
01495         OpenMotifs( Motifs, Hier.Get( true ), vecsMotifs, dCutoff, setsMotifs ) ); }
01496 
01517 float CCoalesceCluster::GetSimilarity( const CCoalesceCluster& Cluster, size_t iGenes,
01518     size_t iDatasets ) const {
01519     size_t                      iOverlapGenes;
01520     set<size_t>::const_iterator iterItem;
01521     float                       dRet;
01522 
01523     if( GetGenes( ).empty( ) || Cluster.GetGenes( ).empty( ) )
01524         return 0;
01525     for( iOverlapGenes = 0,iterItem = GetGenes( ).begin( ); iterItem != GetGenes( ).end( ); ++iterItem )
01526         if( Cluster.IsGene( *iterItem ) )
01527             iOverlapGenes++;
01528 // simple gene overlap works best with an ~0.75 cutoff
01529     dRet = (float)iOverlapGenes / min( GetGenes( ).size( ), Cluster.GetGenes( ).size( ) );
01530 // jaccard index works best with an 0.25-0.33 cutoff
01531 //  dRet = (float)iOverlapGenes / ( GetGenes( ).size( ) + Cluster.GetGenes( ).size( ) - iOverlapGenes );
01532     g_CatSleipnir( ).debug( "CCoalesceCluster::GetSimilarity( %d, %d ): %d, %d, %d = %g", iGenes,
01533         iDatasets, iOverlapGenes, GetGenes( ).size( ), Cluster.GetGenes( ).size( ), dRet );
01534     return dRet; }
01535 
01553 void CCoalesceCluster::Snapshot( const CCoalesceGeneScores& GeneScores,
01554     CCoalesceGroupHistograms& Histograms ) {
01555 
01556     Histograms.SetTotal( GeneScores, GetGenes( ) );
01557     m_setiHistory.insert( GetHash( ) );
01558     CCoalesceClusterImpl::Snapshot( m_setiDatasets, m_veciPrevDatasets );
01559     CCoalesceClusterImpl::Snapshot( m_setsMotifs, m_vecsPrevMotifs );
01560     CCoalesceClusterImpl::Snapshot( GetGenes( ), m_veciPrevGenes ); }
01561 
01596 bool CCoalesceCluster::LabelMotifs( const CCoalesceMotifLibrary& Motifs, SMotifMatch::EType eMatchType,
01597     float dPenaltyGap, float dPenaltyMismatch, float dPValue ) {
01598     set<SMotifMatch>::iterator  iterMotif;
01599 
01600     if( !Motifs.GetKnowns( ) )
01601         return true;
01602     for( iterMotif = m_setsMotifs.begin( ); iterMotif != m_setsMotifs.end( ); ++iterMotif ) {
01603         SMotifMatch&    sMotif  = (SMotifMatch&)*iterMotif;
01604 // For some obscure reason, Linux won't let me use this as a non-const iterator...
01605         if( !sMotif.Label( Motifs, eMatchType, dPenaltyGap, dPenaltyMismatch, dPValue ) )
01606             return false; }
01607 
01608     return true; }
01609 
01610 }