Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "coalesce.h" 00024 #include "fasta.h" 00025 #include "pcl.h" 00026 #include "halfmatrix.h" 00027 00028 namespace Sleipnir { 00029 00030 const char* CCoalesceSequencerBase::c_aszSubsequences[] = {"total", "introns", "exons", NULL}; 00031 00032 // CCoalesceGeneScores 00033 00034 bool CCoalesceGeneScores::Add( size_t iGene, const CCoalesceMotifLibrary& Motifs, 00035 const SFASTASequence& sSequence, SCoalesceModifierCache& sModifiers, vector<vector<float> >& vecvecdCounts, 00036 vector<size_t>& veciLengths ) { 00037 size_t i, iType, iSubsequence, iLength, iOffset; 00038 uint32_t iMotif; 00039 float d; 00040 00041 sModifiers.SetType( sSequence.m_strType ); 00042 vecvecdCounts.resize( ESubsequenceEnd ); 00043 for( i = 0; i < vecvecdCounts.size( ); ++i ) 00044 fill( vecvecdCounts[ i ].begin( ), vecvecdCounts[ i ].end( ), 0.0f ); 00045 veciLengths.resize( ESubsequenceEnd ); 00046 fill( veciLengths.begin( ), veciLengths.end( ), 0 ); 00047 00048 iType = AddType( sSequence.m_strType ); 00049 for( iOffset = i = 0; i < sSequence.m_vecstrSequences.size( ); 00050 iOffset += sSequence.m_vecstrSequences[ i++ ].length( ) ) 00051 if( !Add( Motifs, sSequence.m_vecstrSequences[ i ], iType, sSequence.m_fIntronFirst == !( i % 2 ), 00052 vecvecdCounts, veciLengths, iOffset, sModifiers ) ) 00053 return false; 00054 for( iSubsequence = ESubsequenceBegin; iSubsequence < vecvecdCounts.size( ); ++iSubsequence ) { 00055 const vector<float>& vecdCounts = vecvecdCounts[ iSubsequence ]; 00056 00057 if( iLength = veciLengths[ iSubsequence ] ) { 00058 Set( iType, (ESubsequence)iSubsequence, iGene, 0, 0, Motifs.GetMotifs( ) ); 00059 for( iMotif = 0; iMotif < vecdCounts.size( ); ++iMotif ) 00060 if( d = vecdCounts[ iMotif ] ) 00061 Set( iType, (ESubsequence)iSubsequence, iGene, iMotif, d / iLength, 00062 Motifs.GetMotifs( ) ); } } 00063 00064 return true; } 00065 00066 bool CCoalesceGeneScores::Add( const CCoalesceMotifLibrary& Motifs, const std::string& strSequence, 00067 size_t iType, bool fIntron, std::vector<std::vector<float> >& vecvecdCounts, 00068 std::vector<size_t>& veciLengths, size_t iOffset, SCoalesceModifierCache& sModifiers ) { 00069 size_t i, j; 00070 ESubsequence eSubsequence; 00071 00072 eSubsequence = fIntron ? ESubsequenceIntrons : ESubsequenceExons; 00073 veciLengths[ ESubsequenceTotal ] += strSequence.length( ); 00074 veciLengths[ eSubsequence ] += strSequence.length( ); 00075 00076 sModifiers.InitializeWeight( Motifs.GetK( ), iOffset ); 00077 // BUGBUG: make me span intron/exon boundaries 00078 for( i = 0; ( i + Motifs.GetK( ) ) <= strSequence.size( ); ++i ) { 00079 string strKMer = strSequence.substr( i, Motifs.GetK( ) ); 00080 vector<uint32_t> veciMotifs; 00081 00082 if( !Motifs.GetMatches( strKMer, veciMotifs ) ) { 00083 g_CatSleipnir( ).error( "CCoalesceHistograms::Add( %s, %d, %d ) unrecognized kmer: %s", 00084 strSequence.c_str( ), iType, fIntron, strKMer.c_str( ) ); 00085 return false; } 00086 for( j = 0; j < veciMotifs.size( ); ++j ) 00087 Add( eSubsequence, veciMotifs[ j ], Motifs.GetMotifs( ), vecvecdCounts, 00088 sModifiers.GetWeight( Motifs.GetK( ) ) / Motifs.GetK( ) ); 00089 sModifiers.AddWeight( Motifs.GetK( ), iOffset, i ); } 00090 00091 return true; } 00092 00093 bool CCoalesceGeneScores::Add( size_t iGene, const CCoalesceMotifLibrary& Motifs, 00094 const SFASTASequence& sSequence, SCoalesceModifierCache& sModifiers, uint32_t iMotif, 00095 vector<float>& vecdScores, vector<size_t>& veciLengths ) { 00096 size_t i, iType, iSubsequence, iLength, iOffset; 00097 float dScore; 00098 00099 sModifiers.SetType( sSequence.m_strType ); 00100 vecdScores.resize( ESubsequenceEnd ); 00101 fill( vecdScores.begin( ), vecdScores.end( ), 0.0f ); 00102 veciLengths.resize( ESubsequenceEnd ); 00103 fill( veciLengths.begin( ), veciLengths.end( ), 0 ); 00104 00105 iType = AddType( sSequence.m_strType ); 00106 for( iOffset = i = 0; i < sSequence.m_vecstrSequences.size( ); 00107 iOffset += sSequence.m_vecstrSequences[ i++ ].length( ) ) 00108 if( !Add( Motifs, sSequence.m_vecstrSequences[ i ], iType, sSequence.m_fIntronFirst == !( i % 2 ), 00109 iMotif, vecdScores, veciLengths, iOffset, sModifiers ) ) 00110 return false; 00111 for( iSubsequence = ESubsequenceBegin; iSubsequence < vecdScores.size( ); ++iSubsequence ) 00112 if( ( iLength = veciLengths[ iSubsequence ] ) && ( dScore = vecdScores[ iSubsequence ] ) ) { 00113 // dScore *= m_vecvecdWeights[ iType ][ iGene ]; 00114 Set( iType, (ESubsequence)iSubsequence, iGene, iMotif, dScore / iLength ); } 00115 00116 return true; } 00117 00118 bool CCoalesceGeneScores::Add( const CCoalesceMotifLibrary& Motifs, const std::string& strSequence, 00119 size_t iType, bool fIntron, uint32_t iMotif, std::vector<float>& vecdScores, 00120 std::vector<size_t>& veciLengths, size_t iOffset, SCoalesceModifierCache& sModifiers ) { 00121 ESubsequence eSubsequence; 00122 float dScore; 00123 00124 eSubsequence = fIntron ? ESubsequenceIntrons : ESubsequenceExons; 00125 veciLengths[ ESubsequenceTotal ] += strSequence.length( ); 00126 veciLengths[ eSubsequence ] += strSequence.length( ); 00127 00128 // BUGBUG: make me span intron/exon boundaries 00129 if( CMeta::IsNaN( dScore = Motifs.GetMatch( strSequence, iMotif, iOffset, sModifiers ) ) ) 00130 return false; 00131 vecdScores[ ESubsequenceTotal ] += dScore; 00132 vecdScores[ eSubsequence ] += dScore; 00133 00134 return true; } 00135 00136 void CCoalesceGeneScores::Subtract( const SMotifMatch& sMotif, size_t iGene ) { 00137 size_t iType; 00138 float* adScores; 00139 00140 if( ( ( iType = GetType( sMotif.m_strType ) ) != -1 ) && 00141 ( adScores = Get( iType, sMotif.m_eSubsequence, iGene ) ) ) 00142 adScores[ sMotif.m_iMotif ] -= sMotif.m_dAverage; } 00143 00144 bool CCoalesceGeneScores::CalculateWeights( ) { 00145 static const float c_dCutoff = 0.95f; 00146 static const size_t c_iSubsample = 100; 00147 size_t i, j, k, iBin, iType; 00148 CMeasureQuickPearson MeasurePearson; 00149 uint32_t iMotif; 00150 vector<float> vecdSampleOne, vecdSampleTwo; 00151 vector<size_t> veciBins, veciCounts; 00152 float dOne, dTwo, dMaxOne, dMaxTwo; 00153 00154 g_CatSleipnir( ).notice( "CCoalesceGeneScores::CalculateWeights( ) calculating %d types for %d genes", 00155 GetTypes( ), m_iGenes ); 00156 veciBins.resize( m_iGenes ); 00157 veciCounts.resize( m_iGenes ); 00158 vecdSampleOne.resize( c_iSubsample ); 00159 vecdSampleTwo.resize( c_iSubsample ); 00160 m_vecvecdWeights.resize( GetTypes( ) ); 00161 for( iType = 0; iType < m_vecvecdWeights.size( ); ++iType ) { 00162 vector<float>& vecdWeights = m_vecvecdWeights[ iType ]; 00163 00164 vecdWeights.resize( m_iGenes ); 00165 for( i = 0; i < veciBins.size( ); ++i ) 00166 veciBins[ i ] = i; 00167 for( i = 0; i < m_iGenes; ++i ) { 00168 const float* adOne = Get( iType, ESubsequenceTotal, i ); 00169 00170 if( !( i % 1000 ) ) 00171 g_CatSleipnir( ).info( "CCoalesceGeneScores::CalculateWeights( ) calculating type %d/%d, gene %d/%d", 00172 iType, m_vecvecdWeights.size( ), i, m_iGenes ); 00173 if( !adOne ) 00174 continue; 00175 for( j = ( i + 1 ); j < m_iGenes; ++j ) { 00176 const float* adTwo = Get( iType, ESubsequenceTotal, j ); 00177 00178 if( !adTwo ) 00179 continue; 00180 dMaxOne = dMaxTwo = 0; 00181 for( iMotif = k = 0; ( k < vecdSampleOne.size( ) ) && ( iMotif < m_iMotifs ); ++iMotif ) { 00182 dOne = adOne[ iMotif ]; 00183 dTwo = adTwo[ iMotif ]; 00184 if( dOne || dTwo ) { 00185 if( dOne > dMaxOne ) 00186 dMaxOne = dOne; 00187 if( dTwo > dMaxTwo ) 00188 dMaxTwo = dTwo; 00189 vecdSampleOne[ k ] = dOne; 00190 vecdSampleTwo[ k++ ] = dTwo; } } 00191 if( !k || !dMaxOne || !dMaxTwo || ( MeasurePearson.Measure( &vecdSampleOne[ 0 ], k, 00192 &vecdSampleTwo[ 0 ], k, IMeasure::EMapNone, NULL, NULL ) < c_dCutoff ) ) 00193 continue; 00194 00195 iBin = veciBins[ j ]; 00196 for( k = 0; k < veciBins.size( ); ++k ) 00197 if( veciBins[ k ] == iBin ) 00198 veciBins[ k ] = veciBins[ i ]; } } 00199 fill( veciCounts.begin( ), veciCounts.end( ), 0 ); 00200 for( i = 0; i < veciBins.size( ); ++i ) 00201 veciCounts[ veciBins[ i ] ]++; 00202 for( i = 0; i < veciBins.size( ); ++i ) 00203 vecdWeights[ i ] = 1.0f / veciCounts[ veciBins[ i ] ]; 00204 00205 for( i = 0; i < m_iGenes; ++i ) { 00206 if( vecdWeights[ i ] == 1 ) 00207 continue; 00208 g_CatSleipnir( ).info( "CCoalesceGeneScores::CalculateWeights( ) weighting gene %d, type %d to %g", 00209 i, iType, vecdWeights[ i ] ); 00210 for( j = ESubsequenceBegin; j < ESubsequenceEnd; ++j ) { 00211 float* adOne = Get( iType, (ESubsequence)j, i ); 00212 00213 if( adOne ) 00214 for( iMotif = 0; iMotif < GetMotifs( ); ++iMotif ) 00215 adOne[ iMotif ] *= vecdWeights[ i ]; } } } 00216 00217 return true; } 00218 00219 // CCoalesceGroupHistograms 00220 00221 bool CCoalesceGroupHistograms::Add( const CCoalesceGeneScores& GeneScores, size_t iGene, bool fSubtract, 00222 uint32_t iMotifThem ) { 00223 size_t iTypeThem, iTypeUs, iSubsequence; 00224 uint32_t iMotifUs; 00225 unsigned short sDelta; 00226 float dValue; 00227 00228 sDelta = fSubtract ? -1 : 1; 00229 for( iTypeThem = 0; iTypeThem < GeneScores.GetTypes( ); ++iTypeThem ) { 00230 // lock 00231 iTypeUs = AddType( GeneScores.GetType( iTypeThem ) ); 00232 // unlock 00233 for( iSubsequence = ESubsequenceBegin; iSubsequence < ESubsequenceEnd; ++iSubsequence ) { 00234 CCoalesceHistogramSet<>& Histograms = Get( iTypeUs, (ESubsequence)iSubsequence ); 00235 const float* adScores = GeneScores.Get( iTypeThem, (ESubsequence)iSubsequence, 00236 iGene ); 00237 00238 if( !adScores ) 00239 continue; 00240 // lock 00241 if( !Histograms.GetMembers( ) ) { 00242 vector<float> vecdEdges; 00243 00244 vecdEdges.resize( m_iBins ); 00245 for( int i = 0; (size_t)i < vecdEdges.size( ); ++i ) 00246 vecdEdges[ i ] = ( i * m_dStep ) - ( m_dStep / 2 ); 00247 Histograms.Initialize( GeneScores.GetMotifs( ), vecdEdges ); } 00248 // unlock 00249 if( iMotifThem == -1 ) { 00250 for( iMotifUs = 0; iMotifUs < GeneScores.GetMotifs( ); ++iMotifUs ) 00251 if( dValue = adScores[ iMotifUs ] ) 00252 Histograms.Add( iMotifUs, dValue, sDelta ); } 00253 else if( dValue = adScores[ iMotifThem ] ) 00254 Histograms.Add( iMotifThem, dValue, sDelta ); } } 00255 00256 return true; } 00257 00258 void CCoalesceGroupHistograms::Save( std::ostream& ostm, const CCoalesceMotifLibrary* pMotifs ) const { 00259 size_t iType, iSubsequence; 00260 uint32_t iMotif; 00261 00262 for( iType = 0; iType < GetTypes( ); ++iType ) 00263 for( iSubsequence = 0; iSubsequence < GetSubsequences( iType ); ++iSubsequence ) { 00264 const CCoalesceHistogramSet<>& Histograms = Get( iType, (ESubsequence)iSubsequence ); 00265 00266 if( !Histograms.GetTotal( ) ) 00267 continue; 00268 ostm << GetType( iType ) << '\t' << GetSubsequence( (ESubsequence)iSubsequence ) << endl; 00269 for( iMotif = 0; iMotif < ( pMotifs ? pMotifs->GetMotifs( ) : Histograms.GetMembers( ) ); ++iMotif ) { 00270 if( Histograms.Get( iMotif, 0.0f ) == Histograms.GetTotal( ) ) 00271 continue; 00272 if( pMotifs ) 00273 ostm << pMotifs->GetMotif( iMotif ); 00274 else 00275 ostm << iMotif; 00276 ostm << endl << Histograms.Save( iMotif ) << endl; } } } 00277 00278 bool CCoalesceGroupHistograms::IsSimilar( const CCoalesceMotifLibrary* pMotifs, const SMotifMatch& sOne, 00279 const SMotifMatch& sTwo, float dPValue ) const { 00280 size_t iTypeOne, iTypeTwo; 00281 double dAveOne, dAverage, dZ, dP; 00282 00283 if( ( ( iTypeOne = GetType( sOne.m_strType ) ) == -1 ) || 00284 ( ( iTypeTwo = GetType( sTwo.m_strType ) ) == -1 ) ) 00285 return false; 00286 { 00287 const CCoalesceHistogramSet<>& HistOne = Get( iTypeOne, sOne.m_eSubsequence ); 00288 const CCoalesceHistogramSet<>& HistTwo = Get( iTypeTwo, sTwo.m_eSubsequence ); 00289 00290 dP = 1 - HistOne.CohensD( sOne.m_iMotif, HistTwo, sTwo.m_iMotif, false, dAveOne, dAverage, dZ ); 00291 if( g_CatSleipnir( ).isDebugEnabled( ) ) 00292 g_CatSleipnir( ).debug( "CCoalesceGroupHistograms::IsSimilar( %s, %s, %g ) got p-value %g", 00293 sOne.Save( pMotifs ).c_str( ), sTwo.Save( pMotifs ).c_str( ), dPValue, dP ); 00294 return ( dP < dPValue ); 00295 } } 00296 00297 // CCoalesce 00298 00313 void CCoalesce::SetSeed( const CPCL& PCL ) { 00314 size_t i; 00315 00316 m_vecdSeed.resize( PCL.GetExperiments( ) ); 00317 for( i = 0; i < m_vecdSeed.size( ); ++i ) 00318 m_vecdSeed[i] = PCL.Get( 0, i ); } 00319 00320 void CCoalesceImpl::Normalize( CPCL& PCL ) { 00321 static const double c_dLog2 = log( 2.0 ); 00322 vector<float> vecdMedian; 00323 vector<size_t> veciSingle; 00324 size_t i, j, k; 00325 float d, dMedian; 00326 00327 for( i = 0; i < PCL.GetExperiments( ); ++i ) { 00328 for( j = 0; j < PCL.GetGenes( ); ++j ) 00329 if( !CMeta::IsNaN( d = PCL.Get( j, i ) ) && ( d < 0 ) ) 00330 break; 00331 if( j >= PCL.GetGenes( ) ) 00332 veciSingle.push_back( i ); } 00333 if( veciSingle.empty( ) ) 00334 return; 00335 00336 vecdMedian.resize( veciSingle.size( ) ); 00337 for( i = 0; i < PCL.GetGenes( ); ++i ) { 00338 for( j = k = 0; j < veciSingle.size( ); ++j ) 00339 if( !CMeta::IsNaN( d = PCL.Get( i, veciSingle[ j ] ) ) ) 00340 vecdMedian[ k++ ] = d; 00341 if( !k ) 00342 continue; 00343 dMedian = (float)CStatistics::Percentile( vecdMedian.begin( ), vecdMedian.begin( ) + k, 0.5 ); 00344 for( j = 0; j < veciSingle.size( ); ++j ) 00345 if( !CMeta::IsNaN( d = PCL.Get( i, k = veciSingle[ j ] ) ) ) 00346 PCL.Set( i, k, (float)( log( d / dMedian ) / c_dLog2 ) ); } } 00347 00348 CCoalesceImpl::~CCoalesceImpl( ) { 00349 00350 Clear( ); } 00351 00352 void CCoalesceImpl::Clear( ) { 00353 00354 if( m_fMotifs && m_pMotifs ) 00355 delete m_pMotifs; 00356 m_pMotifs = NULL; } 00357 00358 size_t CCoalesceImpl::GetMotifCount( ) const { 00359 00360 return ( m_pMotifs ? m_pMotifs->GetMotifs( ) : 0 ); } 00361 00362 void* CCoalesceImpl::ThreadCombineMotif( void* pData ) { 00363 SThreadCombineMotif* psData; 00364 size_t i, j; 00365 vector<float> vecdScores; 00366 vector<size_t> veciLengths; 00367 00368 psData = (SThreadCombineMotif*)pData; 00369 SCoalesceModifierCache sModifiers( *psData->m_psModifiers ); 00370 00371 for( i = psData->m_iOffset; i < psData->m_pveciPCL2FASTA->size( ); i += psData->m_iStep ) 00372 if( (*psData->m_pveciPCL2FASTA)[ i ] != -1 ) { 00373 vector<SFASTASequence> vecsSequences; 00374 00375 if( psData->m_pFASTA->Get( (*psData->m_pveciPCL2FASTA)[ i ], vecsSequences ) ) { 00376 sModifiers.Get( i ); 00377 for( j = 0; j < vecsSequences.size( ); ++j ) 00378 if( !psData->m_pGeneScores->Add( i, *psData->m_pMotifs, vecsSequences[ j ], 00379 sModifiers, psData->m_iMotif, vecdScores, veciLengths ) ) 00380 break; } } 00381 00382 return NULL; } 00383 00384 bool CCoalesceImpl::CombineMotifs( const CFASTA& FASTA, const vector<size_t>& veciPCL2FASTA, 00385 SCoalesceModifiers& sModifiers, const CCoalesceCluster& Cluster, size_t iThreads, 00386 CCoalesceGeneScores& GeneScores, CCoalesceGroupHistograms& HistsCluster, 00387 CCoalesceGroupHistograms& HistsPot ) const { 00388 set<SMotifMatch>::const_iterator iterMotifOne, iterMotifTwo; 00389 uint32_t iMotif; 00390 size_t i; 00391 vector<pthread_t> vecpthdThreads; 00392 vector<SThreadCombineMotif> vecsThreads; 00393 00394 if( !GeneScores.GetMotifs( ) || !m_pMotifs || ( Cluster.GetMotifs( ).size( ) > m_iSizeMerge ) ) 00395 return true; 00396 00397 for( iterMotifOne = Cluster.GetMotifs( ).begin( ); iterMotifOne != Cluster.GetMotifs( ).end( ); 00398 ++iterMotifOne ) 00399 for( iterMotifTwo = Cluster.GetMotifs( ).begin( ); iterMotifTwo != Cluster.GetMotifs( ).end( ); 00400 ++iterMotifTwo ) { 00401 if( ( iterMotifOne->m_iMotif == iterMotifTwo->m_iMotif ) || 00402 !HistsCluster.IsSimilar( m_pMotifs, *iterMotifOne, *iterMotifTwo, m_dPValueMerge ) || 00403 ( ( iMotif = m_pMotifs->Merge( iterMotifOne->m_iMotif, iterMotifTwo->m_iMotif, 00404 m_dCutoffMerge, false ) ) == -1 ) ) 00405 continue; 00406 00407 vecpthdThreads.resize( iThreads ); 00408 vecsThreads.resize( vecpthdThreads.size( ) ); 00409 for( i = 0; i < vecsThreads.size( ); ++i ) { 00410 vecsThreads[ i ].m_iOffset = i; 00411 vecsThreads[ i ].m_iStep = vecsThreads.size( ); 00412 vecsThreads[ i ].m_pveciPCL2FASTA = &veciPCL2FASTA; 00413 vecsThreads[ i ].m_pGeneScores = &GeneScores; 00414 vecsThreads[ i ].m_pMotifs = m_pMotifs; 00415 vecsThreads[ i ].m_iMotif = iMotif; 00416 vecsThreads[ i ].m_pFASTA = &FASTA; 00417 vecsThreads[ i ].m_psModifiers = &sModifiers; 00418 if( pthread_create( &vecpthdThreads[ i ], NULL, ThreadCombineMotif, &vecsThreads[ i ] ) ) { 00419 g_CatSleipnir( ).error( "CCoalesceImpl::CombineMotifs( %d ) could not combine motif: %s", 00420 iThreads, m_pMotifs->GetMotif( iMotif ).c_str( ) ); 00421 return false; } } 00422 for( i = 0; i < vecpthdThreads.size( ); ++i ) 00423 pthread_join( vecpthdThreads[ i ], NULL ); 00424 for( i = 0; i < veciPCL2FASTA.size( ); ++i ) 00425 if( veciPCL2FASTA[ i ] != -1 ) { 00426 if( Cluster.IsGene( i ) ) 00427 HistsCluster.Add( GeneScores, i, false, iMotif ); 00428 else 00429 HistsPot.Add( GeneScores, i, false, iMotif ); } } 00430 00431 return true; } 00432 00433 bool CCoalesceImpl::InitializeDatasets( const CPCL& PCL ) { 00434 size_t i, j; 00435 vector<bool> vecfDataset; 00436 00437 vecfDataset.resize( PCL.GetExperiments( ) ); 00438 for( i = 0; i < m_vecsDatasets.size( ); ++i ) { 00439 SCoalesceDataset& sDataset = m_vecsDatasets[ i ]; 00440 00441 for( j = 0; j < sDataset.GetConditions( ); ++j ) 00442 vecfDataset[ sDataset.GetCondition( j ) ] = true; } 00443 for( i = 0; i < vecfDataset.size( ); ++i ) 00444 if( !vecfDataset[ i ] ) 00445 m_vecsDatasets.push_back( SCoalesceDataset( i ) ); 00446 // This has to be done separately because of a weird optimization bug in GCC 00447 // If called in the same scope as the constructor, it's optimized to the wrong order and a double free 00448 for( i = 0; i < m_vecsDatasets.size( ); ++i ) 00449 m_vecsDatasets[ i ].CalculateCovariance( PCL ); 00450 00451 return true; } 00452 00453 bool CCoalesceImpl::InitializeGeneScores( const CPCL& PCL, const CFASTA& FASTA, vector<size_t>& veciPCL2FASTA, 00454 SCoalesceModifiers& sMods, CCoalesceGeneScores& GeneScores ) { 00455 size_t i, j; 00456 00457 if( FASTA.GetGenes( ) ) { 00458 veciPCL2FASTA.resize( PCL.GetGenes( ) ); 00459 for( i = 0; i < veciPCL2FASTA.size( ); ++i ) 00460 veciPCL2FASTA[ i ] = FASTA.GetGene( PCL.GetGene( i ) ); } 00461 sMods.Initialize( PCL ); 00462 if( FASTA.GetGenes( ) ) { 00463 vector<vector<float> > vecvecdCounts; 00464 vector<size_t> veciLengths; 00465 SCoalesceModifierCache sModifiers( sMods ); 00466 vector<float> vecdMotifs; 00467 uint32_t iMotif; 00468 size_t iCount, iType, iSubsequence, iGene; 00469 float* ad; 00470 00471 if( !m_pMotifs ) { 00472 m_fMotifs = true; 00473 m_pMotifs = new CCoalesceMotifLibrary( m_iK ); } 00474 GeneScores.SetGenes( PCL.GetGenes( ) ); 00475 for( i = 0; i < veciPCL2FASTA.size( ); ++i ) 00476 if( veciPCL2FASTA[ i ] != -1 ) { 00477 vector<SFASTASequence> vecsSequences; 00478 00479 if( FASTA.Get( veciPCL2FASTA[ i ], vecsSequences ) ) { 00480 sModifiers.Get( i ); 00481 for( j = 0; j < vecsSequences.size( ); ++j ) 00482 if( !GeneScores.Add( i, *m_pMotifs, vecsSequences[ j ], sModifiers, vecvecdCounts, 00483 veciLengths ) ) 00484 return false; } } 00485 00486 vecdMotifs.resize( GeneScores.GetMotifs( ) ); 00487 for( iCount = iType = 0; iType < GeneScores.GetTypes( ); ++iType ) 00488 for( iSubsequence = 0; iSubsequence < GeneScores.GetSubsequences( iType ); ++iSubsequence ) { 00489 for( iGene = 0; iGene < PCL.GetGenes( ); ++iGene ) { 00490 if( !( ad = GeneScores.Get( iType, (CCoalesceSequencerBase::ESubsequence)iSubsequence, iGene ) ) ) 00491 continue; 00492 iCount++; 00493 for( iMotif = 0; iMotif < GeneScores.GetMotifs( ); ++iMotif ) 00494 vecdMotifs[iMotif] += ad[iMotif]; } 00495 for( iMotif = 0; iMotif < vecdMotifs.size( ); ++iMotif ) 00496 vecdMotifs[iMotif] /= iCount; 00497 for( iGene = 0; iGene < PCL.GetGenes( ); ++iGene ) { 00498 if( ad = GeneScores.Get( iType, (CCoalesceSequencerBase::ESubsequence)iSubsequence, iGene ) ) 00499 for( iMotif = 0; iMotif < GeneScores.GetMotifs( ); ++iMotif ) 00500 ad[iMotif] += vecdMotifs[iMotif]; } } } 00501 00502 if( !GeneScores.GetMotifs( ) ) 00503 Clear( ); 00504 00505 // Disabled because it doesn't really work very well at all. 00506 return true; } // GeneScores.CalculateWeights( ); } 00507 00539 bool CCoalesce::Cluster( const CPCL& PCL, const CFASTA& FASTA, vector<CCoalesceCluster>& vecClusters ) { 00540 static const float c_dEpsilon = 1e-10f; 00541 CPCL PCLCopy; 00542 size_t i; 00543 vector<size_t> veciPCL2FASTA; 00544 CCoalesceGeneScores GeneScores; 00545 set<pair<size_t, size_t> > setpriiSeeds; 00546 SCoalesceModifiers sModifiers; 00547 float dFailure, dProbability; 00548 00549 for( i = 0; i < m_vecpWiggles.size( ); ++i ) 00550 sModifiers.Add( m_vecpWiggles[ i ] ); 00551 PCLCopy.Open( PCL ); 00552 if( GetNormalize( ) ) 00553 Normalize( PCLCopy ); 00554 if( !( InitializeDatasets( PCLCopy ) && InitializeGeneScores( PCLCopy, FASTA, veciPCL2FASTA, sModifiers, 00555 GeneScores ) ) ) 00556 return false; 00557 if( g_CatSleipnir( ).isNoticeEnabled( ) ) { 00558 size_t iSequences; 00559 ostringstream ossm; 00560 00561 for( iSequences = i = 0; i < veciPCL2FASTA.size( ); ++i ) 00562 if( veciPCL2FASTA[ i ] != -1 ) 00563 iSequences++; 00564 g_CatSleipnir( ).notice( "CCoalesce::Cluster( ) running with %d genes, %d conditions, and %d sequences", 00565 PCL.GetGenes( ), PCL.GetExperiments( ), iSequences ); 00566 for( i = 0; i < PCL.GetExperiments( ); ++i ) 00567 g_CatSleipnir( ).notice( PCL.GetExperiment( i ).c_str( ) ); 00568 // ossm << ( i ? "\t" : "" ) << PCL.GetExperiment( i ); 00569 // g_CatSleipnir( ).notice( ossm.str( ).c_str( ) ); 00570 g_CatSleipnir( ).notice( "k %d, P gene %g, p condition %g, z condition %g, p motif %g, z motif %g, p correlation %g", GetK( ), 00571 GetProbabilityGene( ), GetPValueCondition( ), GetZScoreCondition( ), GetPValueMotif( ), 00572 GetZScoreMotif( ), GetPValueCorrelation( ) ); 00573 g_CatSleipnir( ).notice( "p merge %g, cutoff merge %g, penalty gap %g, penalty mismatch %g", 00574 GetPValueMerge( ), GetCutoffMerge( ), GetMotifs( ) ? GetMotifs( )->GetPenaltyGap( ) : 0, 00575 GetMotifs( ) ? GetMotifs( )->GetPenaltyMismatch( ) : 0 ); 00576 g_CatSleipnir( ).notice( "correlation pairs %d, bases %d, min size %d, merge size %d, max size %d", 00577 GetNumberCorrelation( ), GetBasesPerMatch( ), GetSizeMinimum( ), GetSizeMerge( ), 00578 GetSizeMaximum( ) ); } 00579 for( dFailure = 1; dFailure >= c_dEpsilon; dFailure *= max( pow( c_dEpsilon, 0.125 ), (double)GetPValueCorrelation( ) ) ) { 00580 CCoalesceCluster Cluster, Pot; 00581 CCoalesceGroupHistograms HistsCluster( GetBins( ), 1.0f / GetBasesPerMatch( ) ); 00582 CCoalesceGroupHistograms HistsPot( GetBins( ), 1.0f / GetBasesPerMatch( ) ); 00583 00584 Pot.SetGenes( PCLCopy.GetGenes( ) ); 00585 Pot.CalculateHistograms( GeneScores, HistsPot, NULL ); 00586 if( !Cluster.Initialize( PCLCopy, Pot, m_vecsDatasets, setpriiSeeds, m_vecdSeed, 00587 GetNumberCorrelation( ), GetPValueCorrelation( ), GetProbabilityGene( ), GetThreads( ) ) ) 00588 continue; 00589 m_vecdSeed.clear( ); 00590 g_CatSleipnir( ).notice( "CCoalesce::Cluster( ) initialized %d genes", Cluster.GetGenes( ).size( ) ); 00591 if( g_CatSleipnir( ).isDebugEnabled( ) ) { 00592 ostringstream ossm; 00593 set<size_t>::const_iterator iterGene; 00594 00595 ossm << "CCoalesce::Cluster( ) initialized:"; 00596 for( iterGene = Cluster.GetGenes( ).begin( ); iterGene != Cluster.GetGenes( ).end( ); ++iterGene ) 00597 ossm << ' ' << PCL.GetGene( *iterGene ); 00598 g_CatSleipnir( ).debug( ossm.str( ).c_str( ) ); } 00599 if( Cluster.GetGenes( ).size( ) < GetSizeMinimum( ) ) 00600 continue; 00601 dProbability = 1 - GetProbabilityGene( ); 00602 while( !( Cluster.IsConverged( ) || Cluster.IsEmpty( ) ) ) { 00603 Cluster.CalculateHistograms( GeneScores, HistsCluster, &HistsPot ); 00604 Cluster.Snapshot( GeneScores, HistsCluster ); 00605 Pot.Snapshot( GeneScores, HistsPot ); 00606 if( !Cluster.SelectConditions( PCLCopy, Pot, GetThreads( ), GetPValueCondition( ), 00607 GetZScoreCondition( ) ) ) 00608 return false; 00609 if( Cluster.IsEmpty( ) ) { 00610 g_CatSleipnir( ).notice( "CCoalesce::Cluster( ) selected no conditions" ); 00611 break; } 00612 if( ( Cluster.GetGenes( ).size( ) >= GetSizeMinimum( ) ) && 00613 !( CombineMotifs( FASTA, veciPCL2FASTA, sModifiers, Cluster, GetThreads( ), GeneScores, 00614 HistsCluster, HistsPot ) && Cluster.SelectMotifs( HistsCluster, HistsPot, GetPValueMotif( ), 00615 GetZScoreMotif( ),GetSizeMaximum( ), GetThreads( ), GetMotifs( ) ) ) ) 00616 return false; 00617 if( !Cluster.SelectGenes( PCLCopy, GeneScores, HistsCluster, HistsPot, GetSizeMinimum( ), 00618 GetThreads( ), Pot, 1 - dProbability, GetMotifs( ) ) ) 00619 return false; 00620 dProbability *= GetProbabilityGene( ); 00621 g_CatSleipnir( ).notice( "CCoalesce::Cluster( ) processed %d genes, %d datasets, %d motifs", 00622 Cluster.GetGenes( ).size( ), Cluster.GetDatasets( ).size( ), Cluster.GetMotifs( ).size( ) ); } 00623 if( Cluster.IsConverged( ) && ( Cluster.GetGenes( ).size( ) >= GetSizeMinimum( ) ) ) { 00624 g_CatSleipnir( ).notice( "CCoalesce::Cluster( ) finalizing cluster" ); 00625 setpriiSeeds.clear( ); 00626 if( Cluster.GetMotifs( ).size( ) >= GetSizeMaximum( ) ) { 00627 if( !Cluster.SelectMotifs( HistsCluster, HistsPot, GetPValueMotif( ), GetZScoreMotif( ), -1, 00628 GetThreads( ), GetMotifs( ) ) ) 00629 return false; 00630 g_CatSleipnir( ).notice( "CCoalesce::Cluster( ) finalized %d motifs", 00631 Cluster.GetMotifs( ).size( ) ); } 00632 if( IsDirectoryIntermediate( ) ) 00633 Cluster.Save( GetDirectoryIntermediate( ), vecClusters.size( ), PCLCopy, GetMotifs( ) ); 00634 for( i = 0; i < m_vecpostm.size( ); ++i ) 00635 if( m_vecpostm[ i ] ) 00636 Cluster.Save( *m_vecpostm[ i ], vecClusters.size( ), PCLCopy, GetMotifs( ) ); 00637 vecClusters.push_back( Cluster ); 00638 Cluster.Subtract( PCLCopy, Pot ); 00639 Cluster.Subtract( GeneScores ); 00640 for( i = 0; i < m_vecsDatasets.size( ); ++i ) 00641 m_vecsDatasets[ i ].CalculateCovariance( PCLCopy ); 00642 dFailure = 1; } 00643 else 00644 g_CatSleipnir( ).notice( "CCoalesce::Cluster( ) discarding cluster (failure %g)", dFailure ); } 00645 00646 return true; } 00647 00648 }