Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "coalescecluster.h" 00024 #include "coalescei.h" 00025 #include "coalescemotifs.h" 00026 #include "pcl.h" 00027 #include "halfmatrix.h" 00028 #include "statistics.h" 00029 #include "clusthierarchical.h" 00030 #include "pst.h" 00031 00032 namespace Sleipnir { 00033 00034 const char CCoalesceClusterImpl::c_szMotifs[] = "_motifs.txt"; 00035 const char CCoalesceClusterImpl::c_szGenes[] = "Genes"; 00036 const char CCoalesceClusterImpl::c_szConditions[] = "Conditions"; 00037 00089 bool CCoalesceCluster::Initialize( const CPCL& PCL, CCoalesceCluster& Pot, 00090 const std::vector<SCoalesceDataset>& vecsDatasets, std::set<std::pair<size_t, size_t> >& setpriiSeeds, 00091 const vector<float>& vecdSeed, size_t iPairs, float dPValue, float dProbability, size_t iThreads ) { 00092 size_t i; 00093 float dFraction; 00094 CPCL PCLCopy; 00095 00096 m_setiDatasets.clear( ); 00097 m_setiGenes.clear( ); 00098 m_setsMotifs.clear( ); 00099 m_vecsDatasets.resize( vecsDatasets.size( ) ); 00100 Pot.m_vecsDatasets.resize( vecsDatasets.size( ) ); 00101 for( i = 0; i < m_vecsDatasets.size( ); ++i ) { 00102 m_vecsDatasets[ i ].m_psDataset = Pot.m_vecsDatasets[ i ].m_psDataset = &vecsDatasets[ i ]; 00103 m_setiDatasets.insert( i ); } 00104 m_vecdPriors.resize( PCL.GetGenes( ) ); 00105 fill( m_vecdPriors.begin( ), m_vecdPriors.end( ), 1 - dProbability ); 00106 dFraction = min( 1.0f, 2.0f * iPairs / PCL.GetGenes( ) / ( PCL.GetGenes( ) - 1 ) ); 00107 00108 PCLCopy.Open( PCL ); 00109 PCLCopy.Normalize( CPCL::ENormalizeColumn ); 00110 return ( ( vecdSeed.size( ) || AddSeedPair( PCLCopy, Pot, setpriiSeeds, dFraction, dPValue, iThreads ) ) && 00111 AddCorrelatedGenes( PCLCopy, Pot, vecdSeed, dPValue ) ); } 00112 00113 void CCoalesceClusterImpl::Add( size_t iGene, CCoalesceCluster& Pot ) { 00114 00115 m_setiGenes.insert( iGene ); 00116 Pot.m_setiGenes.erase( iGene ); } 00117 00118 void* CCoalesceClusterImpl::ThreadSeedPair( void* pData ) { 00119 SThreadSeedPair* psData; 00120 double dR, dP; 00121 size_t i, j, iN, iExperiments, iPair, iPairs, iCur; 00122 pair<size_t, size_t> priiSeed; 00123 00124 psData = (SThreadSeedPair*)pData; 00125 iExperiments = psData->m_pPCL->GetExperiments( ); 00126 psData->m_dMaxCorr = -( psData->m_dMinP = DBL_MAX ); 00127 iPairs = psData->m_pPCL->GetGenes( ) * ( psData->m_pPCL->GetGenes( ) - 1 ) / 2; 00128 for( psData->m_iOne = psData->m_iTwo = 0,iPair = psData->m_iOffset; iPair < iPairs; 00129 iPair += psData->m_iStep ) { 00130 if( ( (float)rand( ) / RAND_MAX ) > psData->m_dFraction ) 00131 continue; 00132 for( i = 0,iCur = iPair; iCur >= ( psData->m_pPCL->GetGenes( ) - 1 - i ); 00133 iCur -= psData->m_pPCL->GetGenes( ) - 1 - i++ ); 00134 j = i + iCur + 1; 00135 if( ( ( dR = CMeasurePearson::Pearson( psData->m_pPCL->Get( i ), iExperiments, 00136 psData->m_pPCL->Get( j ), iExperiments, IMeasure::EMapNone, NULL, NULL, &iN ) ) < 0 ) || 00137 CMeta::IsNaN( dR ) ) 00138 continue; 00139 if( ( ( dP = CStatistics::PValuePearson( dR, iN ) ) < psData->m_dMinP ) || 00140 ( !dP && ( dR > psData->m_dMaxCorr ) ) ) { 00141 priiSeed.first = i; 00142 priiSeed.second = j; 00143 if( psData->m_psetpriiSeeds->find( priiSeed ) == psData->m_psetpriiSeeds->end( ) ) { 00144 psData->m_dMaxCorr = dR; 00145 psData->m_dMinP = dP; 00146 psData->m_iOne = i; 00147 psData->m_iTwo = j; } } } 00148 00149 return NULL; } 00150 00151 bool CCoalesceClusterImpl::AddSeedPair( const CPCL& PCL, CCoalesceCluster& Pot, 00152 std::set<std::pair<size_t, size_t> >& setpriiSeeds, float dFraction, float dPValue, size_t iThreads ) { 00153 size_t i, iOne, iTwo; 00154 double dMaxCorr, dMinP; 00155 pair<size_t, size_t> priiSeed; 00156 vector<pthread_t> vecpthdThreads; 00157 vector<SThreadSeedPair> vecsThreads; 00158 00159 if( PCL.GetGenes( ) < 2 ) { 00160 g_CatSleipnir( ).error( "CCoalesceClusterImpl::AddSeedPair( %g ) found no genes", dPValue ); 00161 return false; } 00162 vecpthdThreads.resize( iThreads ); 00163 vecsThreads.resize( vecpthdThreads.size( ) ); 00164 for( i = 0; i < vecsThreads.size( ); ++i ) { 00165 vecsThreads[ i ].m_iOffset = i; 00166 vecsThreads[ i ].m_iStep = vecsThreads.size( ); 00167 vecsThreads[ i ].m_pPCL = &PCL; 00168 vecsThreads[ i ].m_dFraction = dFraction; 00169 vecsThreads[ i ].m_psetpriiSeeds = &setpriiSeeds; 00170 if( pthread_create( &vecpthdThreads[ i ], NULL, ThreadSeedPair, &vecsThreads[ i ] ) ) { 00171 g_CatSleipnir( ).error( "CCoalesceClusterImpl::AddSeedPair( %g, %g, %d ) could not seed pair", 00172 dFraction, dPValue, iThreads ); 00173 return false; } } 00174 dMaxCorr = -( dMinP = DBL_MAX ); 00175 for( iOne = iTwo = i = 0; i < vecpthdThreads.size( ); ++i ) { 00176 pthread_join( vecpthdThreads[ i ], NULL ); 00177 if( ( vecsThreads[ i ].m_dMinP < dMinP ) || 00178 ( !vecsThreads[ i ].m_dMinP && ( vecsThreads[ i ].m_dMaxCorr > dMaxCorr ) ) ) { 00179 dMinP = vecsThreads[ i ].m_dMinP; 00180 dMaxCorr = vecsThreads[ i ].m_dMaxCorr; 00181 iOne = vecsThreads[ i ].m_iOne; 00182 iTwo = vecsThreads[ i ].m_iTwo; } } 00183 if( ( dMinP * PCL.GetGenes( ) * ( PCL.GetGenes( ) - 1 ) * dFraction * dFraction ) <= ( dPValue * 2 ) ) { 00184 g_CatSleipnir( ).info( "CCoalesceClusterImpl::AddSeedPair( %g, %g ) seeding: %s, %s, %g (p=%g)", 00185 dFraction, dPValue, PCL.GetGene( iOne ).c_str( ), PCL.GetGene( iTwo ).c_str( ), dMaxCorr, dMinP ); 00186 priiSeed.first = iOne; 00187 priiSeed.second = iTwo; 00188 setpriiSeeds.insert( priiSeed ); 00189 Add( iOne, Pot ); 00190 Add( iTwo, Pot ); 00191 return true; } 00192 00193 g_CatSleipnir( ).notice( "CCoalesceClusterImpl::AddSeedPair( %g, %g ) inadequate seed pair: %s, %s, %g (p=%g)", 00194 dFraction, dPValue, PCL.GetGene( iOne ).c_str( ), PCL.GetGene( iTwo ).c_str( ), dMaxCorr, dMinP ); 00195 return false; } 00196 00197 bool CCoalesceClusterImpl::AddCorrelatedGenes( const CPCL& PCL, CCoalesceCluster& Pot, const vector<float>& vecdSeed, float dPValue ) { 00198 size_t iGene, iN; 00199 double dR; 00200 00201 CalculateCentroid( PCL ); 00202 if( vecdSeed.size( ) ) { 00203 if( vecdSeed.size( ) == m_vecdCentroid.size( ) ) 00204 copy( vecdSeed.begin( ), vecdSeed.end( ), m_vecdCentroid.begin( ) ); 00205 else 00206 g_CatSleipnir( ).error( "CCoalesceClusterImpl::AddCorrelatedGenes( %g ) invalid seed provided, ignoring", dPValue ); } 00207 for( iGene = 0; iGene < PCL.GetGenes( ); ++iGene ) 00208 if( !IsGene( iGene ) && 00209 ( ( dR = CMeasurePearson::Pearson( &m_vecdCentroid.front( ), PCL.GetExperiments( ), 00210 PCL.Get( iGene ), PCL.GetExperiments( ), IMeasure::EMapNone, NULL, NULL, &iN ) ) > 0 ) && 00211 ( ( CStatistics::PValuePearson( dR, iN ) * PCL.GetGenes( ) ) <= dPValue ) ) 00212 Add( iGene, Pot ); 00213 00214 return true; } 00215 00266 void CCoalesceCluster::CalculateHistograms( const CCoalesceGeneScores& GeneScores, 00267 CCoalesceGroupHistograms& HistogramsCluster, CCoalesceGroupHistograms* pHistogramsPot ) const { 00268 set<size_t>::const_iterator iterGene; 00269 size_t i; 00270 00271 if( !GeneScores.GetMotifs( ) ) 00272 return; 00273 for( iterGene = GetGenes( ).begin( ); iterGene != GetGenes( ).end( ); ++iterGene ) 00274 if( !binary_search( m_veciPrevGenes.begin( ), m_veciPrevGenes.end( ), *iterGene ) ) { 00275 HistogramsCluster.Add( GeneScores, *iterGene, false ); 00276 if( pHistogramsPot ) 00277 pHistogramsPot->Add( GeneScores, *iterGene, true ); } 00278 for( i = 0; i < m_veciPrevGenes.size( ); ++i ) 00279 if( GetGenes( ).find( m_veciPrevGenes[ i ] ) == GetGenes( ).end( ) ) { 00280 HistogramsCluster.Add( GeneScores, m_veciPrevGenes[ i ], true ); 00281 if( pHistogramsPot ) 00282 pHistogramsPot->Add( GeneScores, m_veciPrevGenes[ i ], false ); } } 00283 00301 void CCoalesceCluster::Subtract( CPCL& PCL, const CCoalesceCluster& Pot ) const { 00302 set<size_t>::const_iterator iterGene, iterDataset; 00303 size_t i, iCondition; 00304 float d, dAve; 00305 00306 for( iterGene = GetGenes( ).begin( ); iterGene != GetGenes( ).end( ); ++iterGene ) 00307 for( iterDataset = m_setiDatasets.begin( ); iterDataset != m_setiDatasets.end( ); ++iterDataset ) 00308 for( i = 0; i < GetConditions( *iterDataset ); ++i ) { 00309 iCondition = GetCondition( *iterDataset, i ); 00310 if( !CMeta::IsNaN( d = m_vecdCentroid[ iCondition ] ) ) { 00311 if( CMeta::IsNaN( dAve = Pot.m_vecdCentroid[ iCondition ] ) ) 00312 dAve = 0; 00313 else 00314 dAve = ( ( GetGenes( ).size( ) * d ) + ( Pot.GetGenes( ).size( ) * dAve ) ) / 00315 ( GetGenes( ).size( ) + Pot.GetGenes( ).size( ) ); 00316 PCL.Get( *iterGene, iCondition ) -= d - dAve; } } } 00317 00330 void CCoalesceCluster::Subtract( CCoalesceGeneScores& GeneScores ) const { 00331 set<size_t>::const_iterator iterGene; 00332 set<SMotifMatch>::const_iterator iterMotif; 00333 00334 for( iterGene = GetGenes( ).begin( ); iterGene != GetGenes( ).end( ); ++iterGene ) 00335 for( iterMotif = m_setsMotifs.begin( ); iterMotif != m_setsMotifs.end( ); ++iterMotif ) 00336 GeneScores.Subtract( *iterMotif, *iterGene ); } 00337 00338 void CCoalesceClusterImpl::CalculateCentroid( const CPCL& PCL ) { 00339 set<size_t>::const_iterator iterGene; 00340 size_t i, j; 00341 float d; 00342 00343 m_veciCounts.resize( PCL.GetExperiments( ) ); 00344 fill( m_veciCounts.begin( ), m_veciCounts.end( ), 0 ); 00345 m_vecdCentroid.resize( PCL.GetExperiments( ) ); 00346 fill( m_vecdCentroid.begin( ), m_vecdCentroid.end( ), 0.0f ); 00347 m_vecdStdevs.resize( PCL.GetExperiments( ) ); 00348 fill( m_vecdStdevs.begin( ), m_vecdStdevs.end( ), 0.0f ); 00349 for( iterGene = GetGenes( ).begin( ); iterGene != GetGenes( ).end( ); ++iterGene ) 00350 for( i = 0; i < m_veciCounts.size( ); ++i ) 00351 if( !CMeta::IsNaN( d = PCL.Get( *iterGene, i ) ) ) { 00352 m_veciCounts[ i ]++; 00353 m_vecdCentroid[ i ] += d; 00354 m_vecdStdevs[ i ] += d * d; } 00355 for( i = 0; i < m_veciCounts.size( ); ++i ) { 00356 if( j = m_veciCounts[ i ] ) { 00357 m_vecdCentroid[ i ] /= j; 00358 m_vecdStdevs[ i ] = ( m_vecdStdevs[ i ] / ( ( j == 1 ) ? 1 : ( j - 1 ) ) ) - 00359 ( m_vecdCentroid[ i ] * m_vecdCentroid[ i ] ); 00360 m_vecdStdevs[ i ] = ( m_vecdStdevs[ i ] < 0 ) ? 0 : sqrt( m_vecdStdevs[ i ] ); } 00361 else 00362 m_vecdCentroid[ i ] = CMeta::GetNaN( ); 00363 g_CatSleipnir( ).debug( "CCoalesceClusterImpl::CalculateCentroid( ) condition %d: count %d, mean %g, stdev %g", 00364 i, m_veciCounts[ i ], m_vecdCentroid[ i ], m_vecdStdevs[ i ] ); } 00365 00366 for( i = 0; i < m_vecsDatasets.size( ); ++i ) { 00367 SDataset& sDataset = m_vecsDatasets[ i ]; 00368 00369 if( sDataset.GetConditions( ) == 1 ) 00370 continue; 00371 sDataset.m_vecdCentroid.resize( sDataset.GetConditions( ) ); 00372 for( j = 0; j < sDataset.GetConditions( ); ++j ) 00373 sDataset.m_vecdCentroid[ j ] = m_vecdCentroid[ sDataset.GetCondition( j ) ]; } } 00374 00375 void* CCoalesceClusterImpl::ThreadSelectCondition( void* pData ) { 00376 vector<float> vecdDatasetCluster, vecdDatasetPot; 00377 vector<size_t> veciDatasetCluster, veciDatasetPot; 00378 size_t iDataset, iCondition, iCluster, iPot, iGene; 00379 double dP, dZ; 00380 float d; 00381 SThreadSelectCondition* psData; 00382 float* adCluster; 00383 float* adPot; 00384 00385 psData = (SThreadSelectCondition*)pData; 00386 const vector<size_t>& veciPot = *psData->m_pveciPot; 00387 const vector<size_t>& veciCluster = *psData->m_pveciCluster; 00388 const CPCL& PCL = *psData->m_pPCL; 00389 00390 adCluster = new float[ veciCluster.size( ) ]; 00391 adPot = new float[ veciPot.size( ) ]; 00392 for( iDataset = psData->m_iOffset; iDataset < psData->m_pvecsDatasets->size( ); 00393 iDataset += psData->m_iStep ) { 00394 SDataset& sDataset = (*psData->m_pvecsDatasets)[ iDataset ]; 00395 00396 if( sDataset.GetConditions( ) == 1 ) { 00397 iCondition = sDataset.GetCondition( 0 ); 00398 for( iCluster = iPot = iGene = 0; iGene < veciCluster.size( ); ++iGene ) 00399 if( !CMeta::IsNaN( d = PCL.Get( veciCluster[ iGene ], iCondition ) ) ) 00400 adCluster[ iCluster++ ] = d; 00401 for( iGene = 0; iGene < veciPot.size( ); ++iGene ) 00402 if( !CMeta::IsNaN( d = PCL.Get( veciPot[ iGene ], iCondition ) ) ) 00403 adPot[ iPot++ ] = d; 00404 if( !( iCluster && iPot ) ) 00405 continue; 00406 dZ = CStatistics::ZScore( adCluster, adCluster + iCluster, adPot, adPot + iPot ); 00407 sDataset.m_dP = CStatistics::ZTest( dZ, min( iCluster, iPot ) ); 00408 sDataset.m_dZ = (float)min( dZ, (double)FLT_MAX ); } 00409 else { 00410 vecdDatasetCluster.resize( sDataset.GetConditions( ) ); 00411 fill( vecdDatasetCluster.begin( ), vecdDatasetCluster.end( ), 0.0f ); 00412 vecdDatasetPot.resize( sDataset.GetConditions( ) ); 00413 fill( vecdDatasetPot.begin( ), vecdDatasetPot.end( ), 0.0f ); 00414 veciDatasetCluster.resize( sDataset.GetConditions( ) ); 00415 fill( veciDatasetCluster.begin( ), veciDatasetCluster.end( ), 0 ); 00416 veciDatasetPot.resize( sDataset.GetConditions( ) ); 00417 fill( veciDatasetPot.begin( ), veciDatasetPot.end( ), 0 ); 00418 for( iGene = 0; iGene < veciCluster.size( ); ++iGene ) 00419 for( iCondition = 0; iCondition < sDataset.GetConditions( ); ++iCondition ) 00420 if( !CMeta::IsNaN( d = PCL.Get( veciCluster[ iGene ], 00421 sDataset.GetCondition( iCondition ) ) ) ) { 00422 veciDatasetCluster[ iCondition ]++; 00423 vecdDatasetCluster[ iCondition ] += d; } 00424 for( iGene = 0; iGene < veciPot.size( ); ++iGene ) 00425 for( iCondition = 0; iCondition < sDataset.GetConditions( ); ++iCondition ) 00426 if( !CMeta::IsNaN( d = PCL.Get( veciPot[ iGene ], 00427 sDataset.GetCondition( iCondition ) ) ) ) { 00428 veciDatasetPot[ iCondition ]++; 00429 vecdDatasetPot[ iCondition ] += d; } 00430 for( iCluster = iPot = iCondition = 0; iCondition < vecdDatasetCluster.size( ); ++iCondition ) { 00431 if( veciDatasetCluster[ iCondition ] > iCluster ) 00432 iCluster = veciDatasetCluster[ iCondition ]; 00433 if( veciDatasetPot[ iCondition ] > iPot ) 00434 iPot = veciDatasetPot[ iCondition ]; 00435 if( veciDatasetCluster[ iCondition ] ) 00436 vecdDatasetCluster[ iCondition ] /= veciDatasetCluster[ iCondition ]; 00437 if( veciDatasetPot[ iCondition ] ) 00438 vecdDatasetPot[ iCondition ] /= veciDatasetPot[ iCondition ]; } 00439 dP = CStatistics::MultivariateNormalCDF( vecdDatasetCluster, vecdDatasetPot, 00440 sDataset.m_psDataset->m_MatSigmaChol, min( iCluster, iPot ) ); 00441 if( dP > 0.5 ) 00442 dP = 1 - dP; 00443 00444 sDataset.m_dP = 2 * dP; 00445 dZ = 0; 00446 for( iCondition = 0; iCondition < sDataset.GetConditions( ); ++iCondition ) 00447 if( sDataset.m_psDataset->m_vecdStdevs[ iCondition ] ) { 00448 d = ( vecdDatasetCluster[ iCondition ] - vecdDatasetPot[ iCondition ] ) / 00449 sDataset.m_psDataset->m_vecdStdevs[ iCondition ]; 00450 dZ += d * d; } 00451 dZ = sqrt( dZ ); 00452 sDataset.m_dZ = (float)min( dZ, (double)FLT_MAX ); } } 00453 delete[] adCluster; 00454 delete[] adPot; 00455 00456 return NULL; } 00457 00489 bool CCoalesceCluster::SelectConditions( const CPCL& PCL, const CCoalesceCluster& Pot, size_t iThreads, 00490 float dPValue, float dZScore ) { 00491 vector<pthread_t> vecpthdThreads; 00492 vector<SThreadSelectCondition> vecsThreads; 00493 size_t i; 00494 vector<size_t> veciCluster, veciPot; 00495 00496 m_setiDatasets.clear( ); 00497 veciCluster.resize( GetGenes( ).size( ) ); 00498 copy( GetGenes( ).begin( ), GetGenes( ).end( ), veciCluster.begin( ) ); 00499 veciPot.resize( Pot.GetGenes( ).size( ) ); 00500 copy( Pot.GetGenes( ).begin( ), Pot.GetGenes( ).end( ), veciPot.begin( ) ); 00501 vecpthdThreads.resize( iThreads ); 00502 vecsThreads.resize( vecpthdThreads.size( ) ); 00503 for( i = 0; i < vecsThreads.size( ); ++i ) { 00504 vecsThreads[ i ].m_iOffset = i; 00505 vecsThreads[ i ].m_iStep = vecsThreads.size( ); 00506 vecsThreads[ i ].m_pveciCluster = &veciCluster; 00507 vecsThreads[ i ].m_pveciPot = &veciPot; 00508 vecsThreads[ i ].m_pvecsDatasets = &m_vecsDatasets; 00509 vecsThreads[ i ].m_pPCL = &PCL; 00510 if( pthread_create( &vecpthdThreads[ i ], NULL, ThreadSelectCondition, &vecsThreads[ i ] ) ) { 00511 g_CatSleipnir( ).error( "CCoalesceCluster::SelectConditions( %d, %g ) could not select conditions", 00512 iThreads, dZScore ); 00513 return false; } } 00514 for( i = 0; i < vecpthdThreads.size( ); ++i ) 00515 pthread_join( vecpthdThreads[ i ], NULL ); 00516 00517 dPValue /= m_vecsDatasets.size( ); 00518 for( i = 0; i < m_vecsDatasets.size( ); ++i ) { 00519 const SDataset& sDataset = m_vecsDatasets[ i ]; 00520 00521 if( ( sDataset.m_dP < dPValue ) && ( fabs( sDataset.m_dZ ) > dZScore ) ) { 00522 g_CatSleipnir( ).debug( "CCoalesceCluster::SelectConditions( %d, %g ) selected dataset %d at %g, z=%g", 00523 iThreads, dZScore, i, sDataset.m_dP, sDataset.m_dZ ); 00524 m_setiDatasets.insert( i ); } 00525 else 00526 g_CatSleipnir( ).debug( "CCoalesceCluster::SelectConditions( %d, %g ) rejected dataset %d at %g, z=%g", 00527 iThreads, dZScore, i, sDataset.m_dP, sDataset.m_dZ ); } 00528 00529 return true; } 00530 00531 void* CCoalesceClusterImpl::ThreadSelectMotif( void* pData ) { 00532 SThreadSelectMotif* psData; 00533 uint32_t iMotif; 00534 00535 psData = (SThreadSelectMotif*)pData; 00536 for( iMotif = psData->m_iOffset; iMotif < ( psData->m_pveciMotifs ? psData->m_pveciMotifs->size( ) : 00537 psData->m_pMotifs->GetMotifs( ) ); iMotif += psData->m_iStep ) 00538 if( !AddSignificant( *psData->m_pMotifs, psData->m_pveciMotifs ? (*psData->m_pveciMotifs)[ iMotif ] : 00539 iMotif, *psData->m_pHistsCluster, *psData->m_pHistsPot, psData->m_dPValue, psData->m_dZScore, 00540 psData->m_vecsMotifs ) ) 00541 break; 00542 00543 return NULL; } 00544 00588 bool CCoalesceCluster::SelectMotifs( const CCoalesceGroupHistograms& HistsCluster, 00589 const CCoalesceGroupHistograms& HistsPot, float dPValue, float dZScore, size_t iMaxMotifs, 00590 size_t iThreads, const CCoalesceMotifLibrary* pMotifs ) { 00591 uint32_t i, j; 00592 vector<pthread_t> vecpthdThreads; 00593 vector<SThreadSelectMotif> vecsThreads; 00594 vector<uint32_t> veciMotifs; 00595 00596 if( !pMotifs ) 00597 return true; 00598 if( m_setsMotifs.size( ) > iMaxMotifs ) { 00599 set<uint32_t> setiMotifs; 00600 set<SMotifMatch>::const_iterator iterMotif; 00601 00602 for( iterMotif = m_setsMotifs.begin( ); iterMotif != m_setsMotifs.end( ); ++iterMotif ) 00603 setiMotifs.insert( iterMotif->m_iMotif ); 00604 veciMotifs.resize( setiMotifs.size( ) ); 00605 copy( setiMotifs.begin( ), setiMotifs.end( ), veciMotifs.begin( ) ); } 00606 m_setsMotifs.clear( ); 00607 vecpthdThreads.resize( iThreads ); 00608 vecsThreads.resize( vecpthdThreads.size( ) ); 00609 for( i = 0; i < vecsThreads.size( ); ++i ) { 00610 vecsThreads[ i ].m_iOffset = i; 00611 vecsThreads[ i ].m_iStep = vecsThreads.size( ); 00612 vecsThreads[ i ].m_pMotifs = pMotifs; 00613 vecsThreads[ i ].m_pHistsCluster = &HistsCluster; 00614 vecsThreads[ i ].m_pHistsPot = &HistsPot; 00615 vecsThreads[ i ].m_dPValue = dPValue; 00616 vecsThreads[ i ].m_dZScore = dZScore; 00617 vecsThreads[ i ].m_pveciMotifs = veciMotifs.empty( ) ? NULL : &veciMotifs; 00618 if( pthread_create( &vecpthdThreads[ i ], NULL, ThreadSelectMotif, &vecsThreads[ i ] ) ) { 00619 g_CatSleipnir( ).error( "CCoalesceCluster::SelectMotifs( %g, %d ) could not select motifs", 00620 dZScore, iThreads ); 00621 return false; } } 00622 for( i = 0; i < vecpthdThreads.size( ); ++i ) { 00623 pthread_join( vecpthdThreads[ i ], NULL ); 00624 for( j = 0; j < vecsThreads[ i ].m_vecsMotifs.size( ); ++j ) 00625 m_setsMotifs.insert( vecsThreads[ i ].m_vecsMotifs[ j ] ); } 00626 00627 return true; } 00628 00629 bool CCoalesceClusterImpl::AddSignificant( const CCoalesceMotifLibrary& Motifs, uint32_t iMotif, 00630 const CCoalesceGroupHistograms& HistsCluster, const CCoalesceGroupHistograms& HistsPot, float dPValue, 00631 float dZScore, vector<SMotifMatch>& vecsMotifs ) { 00632 size_t iTypeCluster, iTypePot; 00633 double dP, dAveOne, dAverage, dZ; 00634 CCoalesceSequencerBase::ESubsequence eSubsequence; 00635 00636 dPValue /= Motifs.GetMotifs( ); 00637 for( iTypeCluster = 0; iTypeCluster < HistsCluster.GetTypes( ); ++iTypeCluster ) { 00638 const string& strTypeCluster = HistsCluster.GetType( iTypeCluster ); 00639 00640 if( ( iTypePot = HistsPot.GetType( strTypeCluster ) ) == -1 ) 00641 continue; 00642 for( eSubsequence = CCoalesceSequencerBase::ESubsequenceBegin; 00643 (size_t)eSubsequence < HistsCluster.GetSubsequences( iTypeCluster ); 00644 eSubsequence = (CCoalesceSequencerBase::ESubsequence)( (size_t)eSubsequence + 1 ) ) { 00645 const CCoalesceHistogramSet<>& HistSetCluster = HistsCluster.Get( iTypeCluster, eSubsequence ); 00646 const CCoalesceHistogramSet<>& HistSetPot = HistsPot.Get( iTypePot, eSubsequence ); 00647 00648 if( ( HistSetCluster.GetMembers( ) <= iMotif ) || 00649 ( HistSetPot.GetMembers( ) <= iMotif ) || 00650 !( HistSetCluster.GetTotal( ) && HistSetPot.GetTotal( ) ) ) 00651 continue; 00652 dP = HistSetCluster.CohensD( iMotif, HistSetPot, dAveOne, dAverage, dZ ); 00653 if( ( dP < dPValue ) && ( fabs( dZ ) > dZScore ) ) { 00654 SMotifMatch sMotif( iMotif, strTypeCluster, eSubsequence, (float)dZ, (float)( dAveOne - 00655 dAverage ) ); 00656 00657 if( g_CatSleipnir( ).isInfoEnabled( ) ) { 00658 ostringstream ossm; 00659 00660 ossm << "CCoalesceClusterImpl::AddSignificant( " << iMotif << ", " << dZScore << 00661 " ) adding at " << dP << ":" << endl << sMotif.Save( &Motifs ) << endl << 00662 "Cluster " << HistSetCluster.Save( iMotif ) << endl << 00663 "Pot " << HistSetPot.Save( iMotif ); 00664 g_CatSleipnir( ).info( ossm.str( ).c_str( ) ); } 00665 vecsMotifs.push_back( sMotif ); } 00666 else if( g_CatSleipnir( ).isDebugEnabled( ) ) { 00667 ostringstream ossm; 00668 00669 ossm << "CCoalesceClusterImpl::AddSignificant( " << iMotif << ", " << dZScore << 00670 " ) failed at " << dP << ":" << endl << SMotifMatch( iMotif, strTypeCluster, eSubsequence, 00671 (float)dZ, (float)( dAveOne - dAverage ) ).Save( &Motifs ) << endl << 00672 "Cluster " << HistSetCluster.Save( iMotif ) << endl << 00673 "Pot " << HistSetPot.Save( iMotif ); 00674 g_CatSleipnir( ).debug( ossm.str( ).c_str( ) ); } } } 00675 00676 return true; } 00677 00678 void* CCoalesceClusterImpl::ThreadCentroid( void* pData ) { 00679 SThreadCentroid* psData; 00680 00681 psData = (SThreadCentroid*)pData; 00682 psData->m_pCluster->CalculateCentroid( psData->m_PCL ); 00683 00684 return NULL; } 00685 00686 void* CCoalesceClusterImpl::ThreadSignificantGene( void* pData ) { 00687 SThreadSignificantGene* psData; 00688 size_t i; 00689 00690 psData = (SThreadSignificantGene*)pData; 00691 for( i = psData->m_iOffset; i < psData->m_pvecfSignificant->size( ); i += psData->m_iStep ) 00692 (*psData->m_pvecfSignificant)[ i ] = psData->m_pCluster->IsSignificant( i, *psData->m_pPCL, 00693 *psData->m_pvecdStdevs, psData->m_pMotifs, *psData->m_pGeneScores, *psData->m_pHistsCluster, 00694 *psData->m_pHistsPot, *psData->m_pPot, *psData->m_pveciDatasets, psData->m_dBeta, 00695 psData->m_iMinimum, psData->m_dProbability ); 00696 00697 return NULL; } 00698 00755 bool CCoalesceCluster::SelectGenes( const CPCL& PCL, const CCoalesceGeneScores& GeneScores, 00756 const CCoalesceGroupHistograms& HistsCluster, const CCoalesceGroupHistograms& HistsPot, size_t iMinimum, 00757 size_t iThreads, CCoalesceCluster& Pot, float dProbability, const CCoalesceMotifLibrary* pMotifs ) { 00758 size_t i, iCluster, iPot; 00759 vector<pthread_t> vecpthdThreads; 00760 vector<bool> vecfSignificant; 00761 vector<SThreadSignificantGene> vecsThreads; 00762 vector<size_t> veciDatasets; 00763 float dSSCluster, dSSPot, dAve; 00764 set<size_t> setiMotifs; 00765 set<SMotifMatch>::const_iterator iterMotif; 00766 vector<float> vecdStdevs; 00767 00768 vecpthdThreads.resize( iThreads ); 00769 { 00770 SThreadCentroid sUs( this, PCL ), sThem( &Pot, PCL ); 00771 00772 if( iThreads > 1 ) { 00773 if( pthread_create( &vecpthdThreads[ 0 ], NULL, ThreadCentroid, &sUs ) || 00774 pthread_create( &vecpthdThreads[ 1 ], NULL, ThreadCentroid, &sThem ) ) { 00775 g_CatSleipnir( ).error( "CCoalesceCluster::SelectGenes( %d, %g ) could not calculate centroids", 00776 iThreads, dProbability ); 00777 return false; } 00778 for( i = 0; i < 2; ++i ) 00779 pthread_join( vecpthdThreads[ i ], NULL ); } 00780 else if( ThreadCentroid( &sUs ) || ThreadCentroid( &sThem ) ) { 00781 g_CatSleipnir( ).error( "CCoalesceCluster::SelectGenes( %d, %g ) could not calculate centroids", 00782 iThreads, dProbability ); 00783 return false; } 00784 } 00785 iCluster = GetGenes( ).size( ); 00786 iPot = Pot.GetGenes( ).size( ); 00787 vecdStdevs.resize( m_vecdStdevs.size( ) ); 00788 for( i = 0; i < vecdStdevs.size( ); ++i ) { 00789 dSSCluster = m_vecdStdevs[ i ]; 00790 dSSCluster = iCluster * ( ( dSSCluster * dSSCluster ) + ( m_vecdCentroid[ i ] * 00791 m_vecdCentroid[ i ] ) ); 00792 dSSPot = Pot.m_vecdStdevs[ i ]; 00793 dSSPot = iPot * ( ( dSSPot * dSSPot ) + ( Pot.m_vecdCentroid[ i ] * Pot.m_vecdCentroid[ i ] ) ); 00794 dAve = ( ( m_vecdCentroid[ i ] * iCluster ) + ( Pot.m_vecdCentroid[ i ] * iPot ) ) / PCL.GetGenes( ); 00795 vecdStdevs[ i ] = sqrt( ( ( dSSCluster + dSSPot ) / ( iCluster + iPot ) ) - ( dAve * dAve ) ); } 00796 00797 veciDatasets.resize( m_setiDatasets.size( ) ); 00798 copy( m_setiDatasets.begin( ), m_setiDatasets.end( ), veciDatasets.begin( ) ); 00799 vecfSignificant.resize( PCL.GetGenes( ) ); 00800 vecsThreads.resize( vecpthdThreads.size( ) ); 00801 for( i = 0; i < vecpthdThreads.size( ); ++i ) { 00802 vecsThreads[ i ].m_iOffset = i; 00803 vecsThreads[ i ].m_iStep = vecpthdThreads.size( ); 00804 vecsThreads[ i ].m_pvecfSignificant = &vecfSignificant; 00805 vecsThreads[ i ].m_pPCL = &PCL; 00806 vecsThreads[ i ].m_pMotifs = pMotifs; 00807 vecsThreads[ i ].m_pGeneScores = &GeneScores; 00808 vecsThreads[ i ].m_pHistsCluster = &HistsCluster; 00809 vecsThreads[ i ].m_pHistsPot = &HistsPot; 00810 vecsThreads[ i ].m_pCluster = this; 00811 vecsThreads[ i ].m_pPot = &Pot; 00812 vecsThreads[ i ].m_pveciDatasets = &veciDatasets; 00813 vecsThreads[ i ].m_pvecdStdevs = &vecdStdevs; 00814 vecsThreads[ i ].m_dBeta = m_setsMotifs.size( ) ? ( (float)m_setsMotifs.size( ) / ( m_setiDatasets.size( ) + m_setsMotifs.size( ) ) ) : 0.5f; 00815 vecsThreads[ i ].m_iMinimum = iMinimum; 00816 vecsThreads[ i ].m_dProbability = dProbability; 00817 if( pthread_create( &vecpthdThreads[ i ], NULL, ThreadSignificantGene, &vecsThreads[ i ] ) ) { 00818 g_CatSleipnir( ).error( "CCoalesceCluster::SelectGenes( %d, %g ) could not calculate significance", 00819 iThreads, dProbability ); 00820 return false; } } 00821 for( i = 0; i < vecpthdThreads.size( ); ++i ) 00822 pthread_join( vecpthdThreads[ i ], NULL ); 00823 m_setiGenes.clear( ); 00824 Pot.m_setiGenes.clear( ); 00825 for( i = 0; i < vecfSignificant.size( ); ++i ) 00826 if( vecfSignificant[ i ] ) { 00827 m_setiGenes.insert( i ); 00828 m_vecdPriors[ i ] = dProbability; } 00829 else { 00830 Pot.m_setiGenes.insert( i ); 00831 m_vecdPriors[ i ] = 1 - dProbability; } 00832 00833 return true; } 00834 00835 bool CCoalesceClusterImpl::IsSignificant( size_t iGene, const CPCL& PCL, const vector<float>& vecdStdevs, 00836 const CCoalesceMotifLibrary* pMotifs, const CCoalesceGeneScores& GeneScores, 00837 const CCoalesceGroupHistograms& HistsCluster, const CCoalesceGroupHistograms& HistsPot, 00838 const CCoalesceCluster& Pot, const vector<size_t>& veciDatasets, float dBeta, size_t iMinimum, 00839 float dProbability ) const { 00840 float d, dP, dLogPMotifsGivenIn, dLogPMotifsGivenOut, dLogPExpressionGivenIn, dLogPExpressionGivenOut; 00841 bool fClustered; 00842 00843 fClustered = IsGene( iGene ); 00844 // We want P(g in C|S,E) = 00845 // P(S,E|g in C)P(g in C)/P(S,E) = 00846 // P(S|g in C)P(E|g in C)P(g in C)/(P(S,E|g in C)P(g in C) + P(S,E|g notin C)P(g notin C)) = 00847 // P(S|g in C)P(E|g in C)P(g in C)/(P(S|g in C)P(E|g in C)P(g in C) + P(S|g notin C)P(E|g notin C)P(g notin C)) 00848 if( !( CalculateProbabilityExpression( iGene, PCL, vecdStdevs, Pot, veciDatasets, fClustered, 00849 dLogPExpressionGivenIn, dLogPExpressionGivenOut ) && CalculateProbabilityMotifs( GeneScores, iGene, 00850 HistsCluster, HistsPot, fClustered, iMinimum, dLogPMotifsGivenIn, dLogPMotifsGivenOut ) ) ) 00851 return false; 00852 if( ( d = m_vecdPriors[ iGene ] ) >= 1 ) 00853 dP = 1; 00854 else if( d <= 0 ) 00855 dP = 0; 00856 else 00857 dP = 1 / ( 1 + exp( ( dBeta * ( dLogPExpressionGivenOut - dLogPExpressionGivenIn ) + ( 1 - dBeta ) * 00858 ( dLogPMotifsGivenOut - dLogPMotifsGivenIn ) ) / ( 0.5f + 2 * pow( 0.5f - dBeta, 2 ) ) + 00859 log( ( 1 - d ) / d ) ) ); 00860 00861 if( g_CatSleipnir( ).isDebugEnabled( ) ) { 00862 set<SMotifMatch>::const_iterator iterMotif; 00863 size_t iType; 00864 00865 g_CatSleipnir( ).debug( "CCoalesceClusterImpl::IsSignificant( %s ) is %g, prior %g beta %g, exp. p=%g vs. %g, seq. p=%g vs %g", 00866 PCL.GetGene( iGene ).c_str( ), dP, d, dBeta, dLogPExpressionGivenIn, dLogPExpressionGivenOut, 00867 dLogPMotifsGivenIn, dLogPMotifsGivenOut ); 00868 for( iterMotif = m_setsMotifs.begin( ); iterMotif != m_setsMotifs.end( ); ++iterMotif ) 00869 if( ( iType = GeneScores.GetType( iterMotif->m_strType ) ) != -1 ) 00870 g_CatSleipnir( ).debug( "%g %s", GeneScores.Get( iType, iterMotif->m_eSubsequence, iGene, 00871 iterMotif->m_iMotif ), iterMotif->Save( pMotifs ).c_str( ) ); } 00872 00873 return ( dP >= dProbability ); } 00874 00875 bool CCoalesceClusterImpl::CalculateProbabilityExpression( size_t iGene, const CPCL& PCL, 00876 const vector<float>& vecdStdevs, const CCoalesceCluster& Pot, const vector<size_t>& veciDatasets, 00877 bool fClustered, float& dLogPIn, float& dLogPOut ) const { 00878 static const double c_dEpsilonZero = 1e-10; 00879 float dGene, dCluster, dPot; 00880 double dPCluster, dPPot, dStdev; 00881 size_t iDataset, iPot, iCondition; 00882 long double dPIn, dPOut; 00883 00884 iPot = PCL.GetGenes( ) - GetGenes( ).size( ); 00885 dLogPIn = dLogPOut = 0; 00886 dPIn = dPOut = 1; 00887 for( iDataset = 0; iDataset < veciDatasets.size( ); ++iDataset ) { 00888 const SDataset& sDataset = m_vecsDatasets[ veciDatasets[ iDataset ] ]; 00889 00890 if( sDataset.GetConditions( ) == 1 ) { 00891 iCondition = sDataset.GetCondition( 0 ); 00892 if( CMeta::IsNaN( dGene = PCL.Get( iGene, iCondition ) ) || 00893 CMeta::IsNaN( dCluster = m_vecdCentroid[ iCondition ] ) || 00894 CMeta::IsNaN( dPot = Pot.m_vecdCentroid[ iCondition ] ) ) 00895 continue; 00896 00897 if( fClustered ) 00898 dCluster = ( ( dCluster * GetGenes( ).size( ) ) - dGene ) / 00899 ( GetGenes( ).size( ) - 1 ); 00900 else 00901 dPot = ( ( dPot * iPot ) - dGene ) / ( iPot - 1 ); 00902 dStdev = max( c_dEpsilonZero, (double)vecdStdevs[ iCondition ] ); 00903 dPCluster = max( c_dEpsilonZero, CStatistics::NormalPDF( dGene, dCluster, dStdev ) ); 00904 dPPot = max( c_dEpsilonZero, CStatistics::NormalPDF( dGene, dPot, dStdev ) ); } 00905 else { 00906 vector<float> vecdGene; 00907 00908 vecdGene.resize( sDataset.GetConditions( ) ); 00909 for( iCondition = 0; iCondition < sDataset.GetConditions( ); ++iCondition ) 00910 vecdGene[ iCondition ] = PCL.Get( iGene, sDataset.GetCondition( iCondition ) ); 00911 dPCluster = min( 1.0, max( c_dEpsilonZero, CStatistics::MultivariateNormalPDF( vecdGene, 00912 sDataset.m_vecdCentroid, sDataset.m_psDataset->m_dSigmaDetSqrt, 00913 sDataset.m_psDataset->m_MatSigmaInv ) ) ); 00914 dPPot = min( 1.0, max( c_dEpsilonZero, CStatistics::MultivariateNormalPDF( vecdGene, 00915 Pot.m_vecsDatasets[ veciDatasets[ iDataset ] ].m_vecdCentroid, 00916 sDataset.m_psDataset->m_dSigmaDetSqrt, sDataset.m_psDataset->m_MatSigmaInv ) ) ); } 00917 dPIn *= dPCluster; 00918 dPOut *= dPPot; 00919 if( ( dPIn < DBL_MIN ) || ( dPOut < DBL_MIN ) ) { 00920 dLogPIn += (float)log( dPIn ); 00921 dLogPOut += (float)log( dPOut ); 00922 dPIn = dPOut = 1; } } 00923 dLogPIn += (float)log( dPIn ); 00924 dLogPOut += (float)log( dPOut ); 00925 00926 return true; } 00927 00928 bool CCoalesceClusterImpl::CalculateProbabilityMotifs( const CCoalesceGeneScores& GeneScores, size_t iGene, 00929 const CCoalesceGroupHistograms& HistsCluster, const CCoalesceGroupHistograms& HistsPot, 00930 bool fClustered, size_t iMinimum, float& dLogPIn, float& dLogPOut ) const { 00931 set<SMotifMatch>::const_iterator iterMotif; 00932 size_t iType, iCluster, iPot; 00933 unsigned short sCluster, sPot; 00934 double dPCluster, dPPot; 00935 float dGene; 00936 const float* adValues; 00937 long double dPIn, dPOut; 00938 00939 dLogPIn = dLogPOut = 0; 00940 if( m_setsMotifs.empty( ) ) 00941 return true; 00942 00943 // TODO: this is the slowest part of the entire algorithm 00944 dPIn = dPOut = 1; 00945 for( iterMotif = m_setsMotifs.begin( ); iterMotif != m_setsMotifs.end( ); ++iterMotif ) { 00946 if( ( ( iType = GeneScores.GetType( iterMotif->m_strType ) ) == -1 ) || 00947 !( adValues = GeneScores.Get( iType, iterMotif->m_eSubsequence, iGene ) ) || 00948 ( HistsCluster.GetType( iterMotif->m_strType ) == -1 ) || 00949 ( HistsPot.GetType( iterMotif->m_strType ) == -1 ) ) 00950 continue; 00951 dGene = adValues[ iterMotif->m_iMotif ]; 00952 { 00953 const CCoalesceHistogramSet<>& HistSetCluster = HistsCluster.Get( iterMotif->m_strType, 00954 iterMotif->m_eSubsequence ); 00955 const CCoalesceHistogramSet<>& HistSetPot = HistsPot.Get( iterMotif->m_strType, 00956 iterMotif->m_eSubsequence ); 00957 00958 sCluster = HistSetCluster.Get( iterMotif->m_iMotif, dGene ); 00959 iCluster = HistSetCluster.GetTotal( ); 00960 sPot = HistSetPot.Get( iterMotif->m_iMotif, dGene ); 00961 iPot = HistSetPot.GetTotal( ); 00962 if( ( iCluster < iMinimum ) || ( iPot < iMinimum ) ) 00963 continue; 00964 00965 if( fClustered ) { 00966 if( sCluster ) { 00967 sCluster--; 00968 iCluster--; } 00969 else 00970 g_CatSleipnir( ).warn( "CCoalesceClusterImpl::CalculateProbabilityMotifs( %d, %d, %g, %g ) no motifs of %d in cluster: %g, %s\n%s", 00971 iGene, fClustered, dLogPIn, dLogPOut, iCluster, dGene, 00972 iterMotif->Save( NULL ).c_str( ), HistSetCluster.Save( iterMotif->m_iMotif ).c_str( ) ); } 00973 else if( sPot ) { 00974 sPot--; 00975 iPot--; } 00976 else 00977 g_CatSleipnir( ).warn( "CCoalesceClusterImpl::CalculateProbabilityMotifs( %d, %d, %g, %g ) no motifs of %d in pot: %g, %s\n%s", 00978 iGene, fClustered, dLogPIn, dLogPOut, iPot, dGene, iterMotif->Save( NULL ).c_str( ), 00979 HistSetPot.Save( iterMotif->m_iMotif ).c_str( ) ); 00980 dPCluster = (double)( sCluster + 1 ) / ( iCluster + HistSetCluster.GetEdges( ) ); 00981 dPPot = (double)( sPot + 1 ) / ( iPot + HistSetPot.GetEdges( ) ); 00982 } 00983 dPIn *= dPCluster; 00984 dPOut *= dPPot; 00985 if( ( dPIn < DBL_MIN ) || ( dPOut < DBL_MIN ) ) { 00986 dLogPIn += (float)log( dPIn ); 00987 dLogPOut += (float)log( dPOut ); 00988 dPIn = dPOut = 1; } } 00989 dLogPIn += (float)log( dPIn ); 00990 dLogPOut += (float)log( dPOut ); 00991 00992 return true; } 00993 01033 bool CCoalesceCluster::Save( const std::string& strDirectory, size_t iID, const CPCL& PCL, 01034 const CCoalesceMotifLibrary* pMotifs ) const { 01035 ofstream ofsm; 01036 char* szTmp; 01037 CPCL PCLCopy; 01038 size_t iGeneFrom, iGeneTo, iExpFrom, iExpTo, iLength; 01039 string strBase; 01040 set<SMotifMatch>::const_iterator iterMotif; 01041 set<size_t> setiConditions; 01042 01043 GetConditions( setiConditions ); 01044 szTmp = new char[ iLength = ( strDirectory.length( ) + 16 ) ]; 01045 sprintf_s( szTmp, iLength - 1, "%s/c%04d_XXXXXX", strDirectory.empty( ) ? "." : strDirectory.c_str( ), 01046 iID ); 01047 _mktemp_s( szTmp, iLength - 1 ); 01048 szTmp[ iLength - 1 ] = 0; 01049 strBase = szTmp; 01050 delete[] szTmp; 01051 01052 ofsm.open( ( strBase + c_szMotifs ).c_str( ) ); 01053 if( !ofsm.is_open( ) ) 01054 return false; 01055 for( iterMotif = m_setsMotifs.begin( ); iterMotif != m_setsMotifs.end( ); ++iterMotif ) 01056 ofsm << iterMotif->Save( pMotifs ) << endl; 01057 ofsm.close( ); 01058 01059 PCLCopy.Open( PCL ); 01060 // Reorder header 01061 for( iExpTo = iExpFrom = 0; iExpFrom < PCL.GetExperiments( ); ++iExpFrom ) 01062 if( setiConditions.find( iExpFrom ) != setiConditions.end( ) ) 01063 PCLCopy.SetExperiment( iExpTo++, c_cStar + PCL.GetExperiment( iExpFrom ) ); 01064 for( iExpFrom = 0; iExpFrom < PCL.GetExperiments( ); ++iExpFrom ) 01065 if( setiConditions.find( iExpFrom ) == setiConditions.end( ) ) 01066 PCLCopy.SetExperiment( iExpTo++, PCL.GetExperiment( iExpFrom ) ); 01067 // Reorder genes 01068 for( iGeneTo = iGeneFrom = 0; iGeneFrom < PCL.GetGenes( ); ++iGeneFrom ) 01069 if( IsGene( iGeneFrom ) && !SaveCopy( PCL, setiConditions, iGeneFrom, PCLCopy, iGeneTo++, false ) ) 01070 return false; 01071 for( iGeneFrom = 0; iGeneFrom < PCL.GetGenes( ); ++iGeneFrom ) 01072 if( !IsGene( iGeneFrom ) ) 01073 PCLCopy.MaskGene( iGeneTo++ ); 01074 01075 ofsm.clear( ); 01076 ofsm.open( ( strBase + CPCL::GetExtension( ) ).c_str( ) ); 01077 if( !ofsm.is_open( ) ) 01078 return false; 01079 PCLCopy.Save( ofsm ); 01080 ofsm.close( ); 01081 01082 return true; } 01083 01084 bool CCoalesceClusterImpl::SaveCopy( const CPCL& PCLFrom, const std::set<size_t>& setiConditions, 01085 size_t iGeneFrom, CPCL& PCLTo, size_t iGeneTo, bool fClustered ) const { 01086 size_t i, iExpTo, iExpFrom; 01087 01088 PCLTo.SetGene( iGeneTo, PCLFrom.GetGene( iGeneFrom ) ); 01089 for( i = 1; i < PCLFrom.GetFeatures( ); ++i ) 01090 PCLTo.SetFeature( iGeneTo, i, (string)( ( fClustered && ( i == 1 ) ) ? "*" : "" ) + 01091 PCLFrom.GetFeature( iGeneFrom, i ) ); 01092 for( iExpTo = iExpFrom = 0; iExpFrom < PCLFrom.GetExperiments( ); ++iExpFrom ) 01093 if( setiConditions.find( iExpFrom ) != setiConditions.end( ) ) 01094 PCLTo.Set( iGeneTo, iExpTo++, PCLFrom.Get( iGeneFrom, iExpFrom ) ); 01095 for( iExpFrom = 0; iExpFrom < PCLFrom.GetExperiments( ); ++iExpFrom ) 01096 if( setiConditions.find( iExpFrom ) == setiConditions.end( ) ) 01097 PCLTo.Set( iGeneTo, iExpTo++, PCLFrom.Get( iGeneFrom, iExpFrom ) ); 01098 01099 return true; } 01100 01151 void CCoalesceCluster::Save( std::ostream& ostm, size_t iID, const CPCL& PCL, 01152 const CCoalesceMotifLibrary* pMotifs, float dCutoffPWMs, float dPenaltyGap, float dPenaltyMismatch, 01153 bool fNoRCs ) const { 01154 set<size_t>::const_iterator iterID; 01155 set<SMotifMatch>::const_iterator iterMotif; 01156 size_t i; 01157 string strMotif; 01158 01159 ostm << "Cluster\t" << iID << endl; 01160 ostm << c_szGenes; 01161 for( iterID = GetGenes( ).begin( ); iterID != GetGenes( ).end( ); ++iterID ) 01162 ostm << '\t' << PCL.GetGene( *iterID ); 01163 ostm << endl << c_szConditions; 01164 for( iterID = m_setiDatasets.begin( ); iterID != m_setiDatasets.end( ); ++iterID ) 01165 for( i = 0; i < GetConditions( *iterID ); ++i ) 01166 ostm << '\t' << PCL.GetExperiment( GetCondition( *iterID, i ) ); 01167 ostm << endl << "Motifs" << endl; 01168 for( iterMotif = GetMotifs( ).begin( ); iterMotif != GetMotifs( ).end( ); ++iterMotif ) 01169 if( !( strMotif = iterMotif->Save( pMotifs, true, dCutoffPWMs, dPenaltyGap, dPenaltyMismatch, 01170 fNoRCs ) ).empty( ) ) 01171 ostm << strMotif << endl; 01172 ostm.flush( ); } 01173 01201 size_t CCoalesceCluster::Open( const std::string& strPCL, size_t iSkip, const CPCL& PCL, 01202 CCoalesceMotifLibrary* pMotifs ) { 01203 CPCL PCLCluster; 01204 size_t i, j, iGene; 01205 string strMotifs; 01206 ifstream ifsm; 01207 01208 Clear( ); 01209 if( !PCLCluster.Open( strPCL.c_str( ), iSkip ) ) 01210 return -1; 01211 01212 for( i = 0; i < PCLCluster.GetExperiments( ); ++i ) { 01213 if( PCLCluster.GetExperiment( i )[ 0 ] != c_cStar ) 01214 break; 01215 for( j = 0; j < PCL.GetExperiments( ); ++j ) 01216 if( PCL.GetExperiment( j ) == ( PCLCluster.GetExperiment( i ).c_str( ) + 1 ) ) { 01217 m_setiDatasets.insert( j ); 01218 break; } } 01219 for( i = 0; i < PCLCluster.GetGenes( ); ++i ) { 01220 if( ( iGene = PCL.GetGene( PCLCluster.GetGene( i ) ) ) == -1 ) { 01221 g_CatSleipnir( ).error( "CCoalesceCluster::Open( %s, %i ) unrecognized gene: %s", strPCL.c_str( ), 01222 iSkip, PCLCluster.GetGene( i ).c_str( ) ); 01223 return -1; } 01224 m_setiGenes.insert( iGene ); } 01225 01226 if( !pMotifs || ( ( i = strPCL.rfind( CPCL::GetExtension( ) ) ) != 01227 ( strPCL.length( ) - strlen( CPCL::GetExtension( ) ) ) ) ) 01228 return PCLCluster.GetExperiments( ); 01229 strMotifs = strPCL.substr( 0, i ) + c_szMotifs; 01230 ifsm.open( strMotifs.c_str( ) ); 01231 if( !ifsm.is_open( ) ) 01232 return PCLCluster.GetExperiments( ); 01233 while( !ifsm.eof( ) && ( ifsm.peek( ) != -1 ) ) { 01234 SMotifMatch sMotif; 01235 01236 if( !sMotif.Open( ifsm, *pMotifs ) ) { 01237 g_CatSleipnir( ).warn( "CCoalesceCluster::Open( %s, %d ) could not open: %s", strPCL.c_str( ), 01238 iSkip, strMotifs.c_str( ) ); 01239 return -1; } 01240 m_setsMotifs.insert( sMotif ); } 01241 01242 return PCLCluster.GetExperiments( ); } 01243 01268 size_t CCoalesceCluster::Open( std::istream& istm, const CPCL& PCL, CCoalesceMotifLibrary* pMotifs ) { 01269 string strBuffer; 01270 vector<string> vecstrLine; 01271 size_t i, j; 01272 vector<SMotifMatch> vecsMotifs; 01273 01274 Clear( ); 01275 strBuffer.resize( CFile::GetBufferSize( ) ); 01276 istm.getline( &strBuffer[ 0 ], strBuffer.size( ) ); 01277 01278 istm.getline( &strBuffer[ 0 ], strBuffer.size( ) ); 01279 CMeta::Tokenize( strBuffer.c_str( ), vecstrLine ); 01280 if( vecstrLine.empty( ) || ( vecstrLine[ 0 ] != c_szGenes ) ) { 01281 g_CatSleipnir( ).error( "CCoalesceCluster::Open( ) invalid line: %s", strBuffer.c_str( ) ); 01282 return -1; } 01283 for( i = 1; i < vecstrLine.size( ); ++i ) { 01284 if( ( j = PCL.GetGene( vecstrLine[ i ] ) ) == -1 ) { 01285 g_CatSleipnir( ).warn( "CCoalesceCluster::Open( ) unrecognized gene: %s", 01286 vecstrLine[ i ].c_str( ) ); 01287 continue; } 01288 m_setiGenes.insert( j ); } 01289 01290 istm.getline( &strBuffer[ 0 ], strBuffer.size( ) ); 01291 vecstrLine.clear( ); 01292 CMeta::Tokenize( strBuffer.c_str( ), vecstrLine ); 01293 if( vecstrLine.empty( ) || ( vecstrLine[ 0 ] != c_szConditions ) ) { 01294 g_CatSleipnir( ).error( "CCoalesceCluster::Open( ) invalid line: %s", strBuffer.c_str( ) ); 01295 return -1; } 01296 for( i = 1; i < vecstrLine.size( ); ++i ) { 01297 if( ( j = PCL.GetExperiment( vecstrLine[ i ] ) ) == -1 ) { 01298 g_CatSleipnir( ).error( "CCoalesceCluster::Open( ) unrecognized condition: %s", 01299 vecstrLine[ i ].c_str( ) ); 01300 return -1; } 01301 m_setiDatasets.insert( j ); } 01302 01303 istm.getline( &strBuffer[ 0 ], CFile::GetBufferSize( ) ); 01304 if( !CCoalesceMotifLibrary::Open( istm, vecsMotifs, pMotifs ) ) 01305 return -1; 01306 for( i = 0; i < vecsMotifs.size( ); ++i ) 01307 m_setsMotifs.insert( vecsMotifs[ i ] ); 01308 01309 return m_setiDatasets.size( ); } 01310 01347 bool CCoalesceCluster::Open( const CHierarchy& Hierarchy, const std::vector<CCoalesceCluster>& vecClusters, 01348 const std::vector<std::string>& vecstrClusters, float dFraction, float dCutoff, size_t iCutoff, 01349 CCoalesceMotifLibrary* pMotifs ) { 01350 map<size_t, size_t> mapiiGenes, mapiiDatasets; 01351 size_t i, iClusters; 01352 map<size_t, size_t>::const_iterator iterItem; 01353 vector<map<string, set<SMotifMatch> > > vecmapstrsetsMotifs; 01354 01355 vecmapstrsetsMotifs.resize( CCoalesceSequencerBase::ESubsequenceEnd ); 01356 g_CatSleipnir( ).notice( "CCoalesceCluster::Open( %g ) merging clusters:", dFraction ); 01357 if( !( iClusters = CCoalesceClusterImpl::Open( Hierarchy, vecClusters, vecstrClusters, mapiiGenes, 01358 mapiiDatasets, vecmapstrsetsMotifs ) ) ) { 01359 g_CatSleipnir( ).error( "CCoalesceCluster::Open( %g ) no clusters found", dFraction ); 01360 return false; } 01361 01362 Clear( ); 01363 for( iterItem = mapiiGenes.begin( ); iterItem != mapiiGenes.end( ); ++iterItem ) 01364 if( ( (float)iterItem->second / iClusters ) >= dFraction ) 01365 m_setiGenes.insert( iterItem->first ); 01366 for( iterItem = mapiiDatasets.begin( ); iterItem != mapiiDatasets.end( ); ++iterItem ) 01367 // if( ( (float)iterItem->second / iClusters ) >= dFraction ) 01368 m_setiDatasets.insert( iterItem->first ); 01369 if( !pMotifs ) 01370 return true; 01371 01372 for( i = 0; i < vecmapstrsetsMotifs.size( ); ++i ) { 01373 const map<string, set<SMotifMatch> >& mapstrsetsMotifs = vecmapstrsetsMotifs[ i ]; 01374 map<string, set<SMotifMatch> >::const_iterator iterMotifs; 01375 01376 for( iterMotifs = mapstrsetsMotifs.begin( ); iterMotifs != mapstrsetsMotifs.end( ); ++iterMotifs ) { 01377 const set<SMotifMatch>& setsMotifs = iterMotifs->second; 01378 01379 if( !( ( setsMotifs.size( ) < iCutoff ) ? 01380 CCoalesceClusterImpl::OpenMotifs( setsMotifs, *pMotifs, dCutoff ) : 01381 CCoalesceClusterImpl::OpenMotifsHeuristic( setsMotifs, *pMotifs, dCutoff, iCutoff ) ) ) 01382 return false; } } 01383 01384 return true; } 01385 01386 bool CCoalesceClusterImpl::OpenMotifsHeuristic( const std::set<SMotifMatch>& setsMotifs, 01387 CCoalesceMotifLibrary& Motifs, float dCutoff, size_t iCutoff ) { 01388 vector<SMotifMatch> vecsMotifs; 01389 bool fDone; 01390 size_t i, iMotifs; 01391 set<SMotifMatch> setsMerged; 01392 01393 iMotifs = setsMotifs.size( ); 01394 g_CatSleipnir( ).notice( "CCoalesceClusterImpl::OpenMotifsHeuristic( %g ) resolving %d motifs", dCutoff, 01395 iMotifs ); 01396 01397 vecsMotifs.resize( iMotifs ); 01398 copy( setsMotifs.begin( ), setsMotifs.end( ), vecsMotifs.begin( ) ); 01399 do { 01400 fDone = true; 01401 sort( vecsMotifs.begin( ), vecsMotifs.end( ) ); 01402 for( i = 0; ( i + 1 ) < vecsMotifs.size( ); ++i ) { 01403 SMotifMatch& sOne = vecsMotifs[ i ]; 01404 SMotifMatch& sTwo = vecsMotifs[ i + 1 ]; 01405 01406 if( ( sOne.m_iMotif == -1 ) || ( sTwo.m_iMotif == -1 ) ) 01407 break; 01408 if( Motifs.Align( sOne.m_iMotif, sTwo.m_iMotif, dCutoff ) > dCutoff ) 01409 continue; 01410 if( sTwo.Open( sOne, sTwo, Motifs ) == -1 ) 01411 return false; 01412 if( Motifs.GetPST( sTwo.m_iMotif )->Integrate( ) > iCutoff ) 01413 Motifs.Simplify( sTwo.m_iMotif ); 01414 fDone = false; 01415 iMotifs--; 01416 sOne.m_iMotif = -1; } } 01417 while( !fDone ); 01418 01419 for( i = 0; i < vecsMotifs.size( ); ++i ) 01420 if( vecsMotifs[ i ].m_iMotif != -1 ) 01421 setsMerged.insert( vecsMotifs[ i ] ); 01422 01423 return OpenMotifs( setsMerged, Motifs, dCutoff ); } 01424 01425 bool CCoalesceClusterImpl::OpenMotifs( const std::set<SMotifMatch>& setsMotifs, CCoalesceMotifLibrary& Motifs, 01426 float dCutoff ) { 01427 vector<SMotifMatch> vecsMotifs; 01428 CDistanceMatrix MatSimilarity; 01429 CHierarchy* pHierMotifs; 01430 size_t i, j; 01431 bool fRet; 01432 01433 g_CatSleipnir( ).notice( "CCoalesceClusterImpl::OpenMotifs( %g ) resolving %d motifs", dCutoff, 01434 setsMotifs.size( ) ); 01435 01436 vecsMotifs.resize( setsMotifs.size( ) ); 01437 copy( setsMotifs.begin( ), setsMotifs.end( ), vecsMotifs.begin( ) ); 01438 MatSimilarity.Initialize( vecsMotifs.size( ) ); 01439 for( i = 0; i < vecsMotifs.size( ); ++i ) 01440 for( j = ( i + 1 ); j < vecsMotifs.size( ); ++j ) 01441 MatSimilarity.Set( i, j, -Motifs.Align( vecsMotifs[ i ].m_iMotif, 01442 vecsMotifs[ j ].m_iMotif, dCutoff ) ); 01443 if( !( pHierMotifs = CClustHierarchical::Cluster( MatSimilarity ) ) ) 01444 return false; 01445 fRet = CCoalesceClusterImpl::OpenMotifs( Motifs, *pHierMotifs, vecsMotifs, dCutoff, m_setsMotifs ); 01446 pHierMotifs->Destroy( ); 01447 return fRet; } 01448 01449 size_t CCoalesceClusterImpl::Open( const CHierarchy& Hier, const std::vector<CCoalesceCluster>& vecClusters, 01450 const std::vector<std::string>& vecstrClusters, std::map<size_t, size_t>& mapiiGenes, 01451 std::map<size_t, size_t>& mapiiDatasets, TVecMapStrSetSMotifs& vecmapstrsetsMotifs ) { 01452 set<size_t>::const_iterator iterFrom; 01453 map<size_t, size_t>::iterator iterTo; 01454 set<SMotifMatch>::const_iterator iterMotif; 01455 01456 if( !Hier.IsGene( ) ) 01457 return ( Open( Hier.Get( false ), vecClusters, vecstrClusters, mapiiGenes, mapiiDatasets, 01458 vecmapstrsetsMotifs ) + Open( Hier.Get( true ), vecClusters, vecstrClusters, mapiiGenes, 01459 mapiiDatasets, vecmapstrsetsMotifs ) ); 01460 01461 const CCoalesceCluster& Cluster = vecClusters[ Hier.GetID( ) ]; 01462 01463 g_CatSleipnir( ).notice( "CCoalesceClusterImpl::Open( ) cluster %s", 01464 vecstrClusters[ Hier.GetID( ) ].c_str( ) ); 01465 for( iterFrom = Cluster.GetGenes( ).begin( ); iterFrom != Cluster.GetGenes( ).end( ); ++iterFrom ) 01466 if( ( iterTo = mapiiGenes.find( *iterFrom ) ) == mapiiGenes.end( ) ) 01467 mapiiGenes[ *iterFrom ] = 1; 01468 else 01469 iterTo->second++; 01470 for( iterFrom = Cluster.m_setiDatasets.begin( ); iterFrom != Cluster.m_setiDatasets.end( ); ++iterFrom ) 01471 if( ( iterTo = mapiiDatasets.find( *iterFrom ) ) == mapiiDatasets.end( ) ) 01472 mapiiDatasets[ *iterFrom ] = 1; 01473 else 01474 iterTo->second++; 01475 for( iterMotif = Cluster.m_setsMotifs.begin( ); iterMotif != Cluster.m_setsMotifs.end( ); ++iterMotif ) 01476 vecmapstrsetsMotifs[ iterMotif->m_eSubsequence ][ iterMotif->m_strType ].insert( *iterMotif ); 01477 01478 return 1; } 01479 01480 bool CCoalesceClusterImpl::OpenMotifs( CCoalesceMotifLibrary& Motifs, const CHierarchy& Hier, 01481 const std::vector<SMotifMatch>& vecsMotifs, float dCutoff, std::set<SMotifMatch>& setsMotifs ) { 01482 01483 if( Hier.IsGene( ) || ( -Hier.GetSimilarity( ) < dCutoff ) ) { 01484 SMotifMatch sMotif; 01485 size_t iCount; 01486 01487 sMotif.m_dZ = 0; 01488 if( sMotif.Open( Hier, vecsMotifs, Motifs, iCount = 0 ) == -1 ) 01489 return false; 01490 sMotif.m_dZ /= iCount; 01491 setsMotifs.insert( sMotif ); 01492 return true; } 01493 01494 return ( OpenMotifs( Motifs, Hier.Get( false ), vecsMotifs, dCutoff, setsMotifs ) && 01495 OpenMotifs( Motifs, Hier.Get( true ), vecsMotifs, dCutoff, setsMotifs ) ); } 01496 01517 float CCoalesceCluster::GetSimilarity( const CCoalesceCluster& Cluster, size_t iGenes, 01518 size_t iDatasets ) const { 01519 size_t iOverlapGenes; 01520 set<size_t>::const_iterator iterItem; 01521 float dRet; 01522 01523 if( GetGenes( ).empty( ) || Cluster.GetGenes( ).empty( ) ) 01524 return 0; 01525 for( iOverlapGenes = 0,iterItem = GetGenes( ).begin( ); iterItem != GetGenes( ).end( ); ++iterItem ) 01526 if( Cluster.IsGene( *iterItem ) ) 01527 iOverlapGenes++; 01528 // simple gene overlap works best with an ~0.75 cutoff 01529 dRet = (float)iOverlapGenes / min( GetGenes( ).size( ), Cluster.GetGenes( ).size( ) ); 01530 // jaccard index works best with an 0.25-0.33 cutoff 01531 // dRet = (float)iOverlapGenes / ( GetGenes( ).size( ) + Cluster.GetGenes( ).size( ) - iOverlapGenes ); 01532 g_CatSleipnir( ).debug( "CCoalesceCluster::GetSimilarity( %d, %d ): %d, %d, %d = %g", iGenes, 01533 iDatasets, iOverlapGenes, GetGenes( ).size( ), Cluster.GetGenes( ).size( ), dRet ); 01534 return dRet; } 01535 01553 void CCoalesceCluster::Snapshot( const CCoalesceGeneScores& GeneScores, 01554 CCoalesceGroupHistograms& Histograms ) { 01555 01556 Histograms.SetTotal( GeneScores, GetGenes( ) ); 01557 m_setiHistory.insert( GetHash( ) ); 01558 CCoalesceClusterImpl::Snapshot( m_setiDatasets, m_veciPrevDatasets ); 01559 CCoalesceClusterImpl::Snapshot( m_setsMotifs, m_vecsPrevMotifs ); 01560 CCoalesceClusterImpl::Snapshot( GetGenes( ), m_veciPrevGenes ); } 01561 01596 bool CCoalesceCluster::LabelMotifs( const CCoalesceMotifLibrary& Motifs, SMotifMatch::EType eMatchType, 01597 float dPenaltyGap, float dPenaltyMismatch, float dPValue ) { 01598 set<SMotifMatch>::iterator iterMotif; 01599 01600 if( !Motifs.GetKnowns( ) ) 01601 return true; 01602 for( iterMotif = m_setsMotifs.begin( ); iterMotif != m_setsMotifs.end( ); ++iterMotif ) { 01603 SMotifMatch& sMotif = (SMotifMatch&)*iterMotif; 01604 // For some obscure reason, Linux won't let me use this as a non-const iterator... 01605 if( !sMotif.Label( Motifs, eMatchType, dPenaltyGap, dPenaltyMismatch, dPValue ) ) 01606 return false; } 01607 01608 return true; } 01609 01610 }