Sleipnir
src/coalesce.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "coalesce.h"
00024 #include "fasta.h"
00025 #include "pcl.h"
00026 #include "halfmatrix.h"
00027 
00028 namespace Sleipnir {
00029 
00030 const char* CCoalesceSequencerBase::c_aszSubsequences[] = {"total", "introns", "exons", NULL};
00031 
00032 // CCoalesceGeneScores
00033 
00034 bool CCoalesceGeneScores::Add( size_t iGene, const CCoalesceMotifLibrary& Motifs,
00035     const SFASTASequence& sSequence, SCoalesceModifierCache& sModifiers, vector<vector<float> >& vecvecdCounts,
00036     vector<size_t>& veciLengths ) {
00037     size_t      i, iType, iSubsequence, iLength, iOffset;
00038     uint32_t    iMotif;
00039     float       d;
00040 
00041     sModifiers.SetType( sSequence.m_strType );
00042     vecvecdCounts.resize( ESubsequenceEnd );
00043     for( i = 0; i < vecvecdCounts.size( ); ++i )
00044         fill( vecvecdCounts[ i ].begin( ), vecvecdCounts[ i ].end( ), 0.0f );
00045     veciLengths.resize( ESubsequenceEnd );
00046     fill( veciLengths.begin( ), veciLengths.end( ), 0 );
00047 
00048     iType = AddType( sSequence.m_strType );
00049     for( iOffset = i = 0; i < sSequence.m_vecstrSequences.size( );
00050         iOffset += sSequence.m_vecstrSequences[ i++ ].length( ) )
00051         if( !Add( Motifs, sSequence.m_vecstrSequences[ i ], iType, sSequence.m_fIntronFirst == !( i % 2 ),
00052             vecvecdCounts, veciLengths, iOffset, sModifiers ) )
00053             return false;
00054     for( iSubsequence = ESubsequenceBegin; iSubsequence < vecvecdCounts.size( ); ++iSubsequence ) {
00055         const vector<float>&    vecdCounts  = vecvecdCounts[ iSubsequence ];
00056 
00057         if( iLength = veciLengths[ iSubsequence ] ) {
00058             Set( iType, (ESubsequence)iSubsequence, iGene, 0, 0, Motifs.GetMotifs( ) );
00059             for( iMotif = 0; iMotif < vecdCounts.size( ); ++iMotif )
00060                 if( d = vecdCounts[ iMotif ] )
00061                     Set( iType, (ESubsequence)iSubsequence, iGene, iMotif, d / iLength,
00062                         Motifs.GetMotifs( ) ); } }
00063 
00064     return true; }
00065 
00066 bool CCoalesceGeneScores::Add( const CCoalesceMotifLibrary& Motifs, const std::string& strSequence,
00067     size_t iType, bool fIntron, std::vector<std::vector<float> >& vecvecdCounts,
00068     std::vector<size_t>& veciLengths, size_t iOffset, SCoalesceModifierCache& sModifiers ) {
00069     size_t          i, j;
00070     ESubsequence    eSubsequence;
00071 
00072     eSubsequence = fIntron ? ESubsequenceIntrons : ESubsequenceExons;
00073     veciLengths[ ESubsequenceTotal ] += strSequence.length( );
00074     veciLengths[ eSubsequence ] += strSequence.length( );
00075 
00076     sModifiers.InitializeWeight( Motifs.GetK( ), iOffset );
00077 // BUGBUG: make me span intron/exon boundaries
00078     for( i = 0; ( i + Motifs.GetK( ) ) <= strSequence.size( ); ++i ) {
00079         string              strKMer = strSequence.substr( i, Motifs.GetK( ) );
00080         vector<uint32_t>    veciMotifs;
00081 
00082         if( !Motifs.GetMatches( strKMer, veciMotifs ) ) {
00083             g_CatSleipnir( ).error( "CCoalesceHistograms::Add( %s, %d, %d ) unrecognized kmer: %s",
00084                 strSequence.c_str( ), iType, fIntron, strKMer.c_str( ) );
00085             return false; }
00086         for( j = 0; j < veciMotifs.size( ); ++j )
00087             Add( eSubsequence, veciMotifs[ j ], Motifs.GetMotifs( ), vecvecdCounts,
00088                 sModifiers.GetWeight( Motifs.GetK( ) ) / Motifs.GetK( ) );
00089         sModifiers.AddWeight( Motifs.GetK( ), iOffset, i ); }
00090 
00091     return true; }
00092 
00093 bool CCoalesceGeneScores::Add( size_t iGene, const CCoalesceMotifLibrary& Motifs,
00094     const SFASTASequence& sSequence, SCoalesceModifierCache& sModifiers, uint32_t iMotif,
00095     vector<float>& vecdScores, vector<size_t>& veciLengths ) {
00096     size_t  i, iType, iSubsequence, iLength, iOffset;
00097     float   dScore;
00098 
00099     sModifiers.SetType( sSequence.m_strType );
00100     vecdScores.resize( ESubsequenceEnd );
00101     fill( vecdScores.begin( ), vecdScores.end( ), 0.0f );
00102     veciLengths.resize( ESubsequenceEnd );
00103     fill( veciLengths.begin( ), veciLengths.end( ), 0 );
00104 
00105     iType = AddType( sSequence.m_strType );
00106     for( iOffset = i = 0; i < sSequence.m_vecstrSequences.size( );
00107         iOffset += sSequence.m_vecstrSequences[ i++ ].length( ) )
00108         if( !Add( Motifs, sSequence.m_vecstrSequences[ i ], iType, sSequence.m_fIntronFirst == !( i % 2 ),
00109             iMotif, vecdScores, veciLengths, iOffset, sModifiers ) )
00110             return false;
00111     for( iSubsequence = ESubsequenceBegin; iSubsequence < vecdScores.size( ); ++iSubsequence )
00112         if( ( iLength = veciLengths[ iSubsequence ] ) && ( dScore = vecdScores[ iSubsequence ] ) ) {
00113 //          dScore *= m_vecvecdWeights[ iType ][ iGene ];
00114             Set( iType, (ESubsequence)iSubsequence, iGene, iMotif, dScore / iLength ); }
00115 
00116     return true; }
00117 
00118 bool CCoalesceGeneScores::Add( const CCoalesceMotifLibrary& Motifs, const std::string& strSequence,
00119     size_t iType, bool fIntron, uint32_t iMotif, std::vector<float>& vecdScores,
00120     std::vector<size_t>& veciLengths, size_t iOffset, SCoalesceModifierCache& sModifiers ) {
00121     ESubsequence    eSubsequence;
00122     float           dScore;
00123 
00124     eSubsequence = fIntron ? ESubsequenceIntrons : ESubsequenceExons;
00125     veciLengths[ ESubsequenceTotal ] += strSequence.length( );
00126     veciLengths[ eSubsequence ] += strSequence.length( );
00127 
00128 // BUGBUG: make me span intron/exon boundaries
00129     if( CMeta::IsNaN( dScore = Motifs.GetMatch( strSequence, iMotif, iOffset, sModifiers ) ) )
00130         return false;
00131     vecdScores[ ESubsequenceTotal ] += dScore;
00132     vecdScores[ eSubsequence ] += dScore;
00133 
00134     return true; }
00135 
00136 void CCoalesceGeneScores::Subtract( const SMotifMatch& sMotif, size_t iGene ) {
00137     size_t  iType;
00138     float*  adScores;
00139 
00140     if( ( ( iType = GetType( sMotif.m_strType ) ) != -1 ) &&
00141         ( adScores = Get( iType, sMotif.m_eSubsequence, iGene ) ) )
00142         adScores[ sMotif.m_iMotif ] -= sMotif.m_dAverage; }
00143 
00144 bool CCoalesceGeneScores::CalculateWeights( ) {
00145     static const float      c_dCutoff       = 0.95f;
00146     static const size_t     c_iSubsample    = 100;
00147     size_t                  i, j, k, iBin, iType;
00148     CMeasureQuickPearson    MeasurePearson;
00149     uint32_t                iMotif;
00150     vector<float>           vecdSampleOne, vecdSampleTwo;
00151     vector<size_t>          veciBins, veciCounts;
00152     float                   dOne, dTwo, dMaxOne, dMaxTwo;
00153 
00154     g_CatSleipnir( ).notice( "CCoalesceGeneScores::CalculateWeights( ) calculating %d types for %d genes",
00155         GetTypes( ), m_iGenes );
00156     veciBins.resize( m_iGenes );
00157     veciCounts.resize( m_iGenes );
00158     vecdSampleOne.resize( c_iSubsample );
00159     vecdSampleTwo.resize( c_iSubsample );
00160     m_vecvecdWeights.resize( GetTypes( ) );
00161     for( iType = 0; iType < m_vecvecdWeights.size( ); ++iType ) {
00162         vector<float>&  vecdWeights = m_vecvecdWeights[ iType ];
00163 
00164         vecdWeights.resize( m_iGenes );
00165         for( i = 0; i < veciBins.size( ); ++i )
00166             veciBins[ i ] = i;
00167         for( i = 0; i < m_iGenes; ++i ) {
00168             const float*    adOne   = Get( iType, ESubsequenceTotal, i );
00169 
00170             if( !( i % 1000 ) )
00171                 g_CatSleipnir( ).info( "CCoalesceGeneScores::CalculateWeights( ) calculating type %d/%d, gene %d/%d",
00172                     iType, m_vecvecdWeights.size( ), i, m_iGenes );
00173             if( !adOne )
00174                 continue;
00175             for( j = ( i + 1 ); j < m_iGenes; ++j ) {
00176                 const float*    adTwo   = Get( iType, ESubsequenceTotal, j );
00177 
00178                 if( !adTwo )
00179                     continue;
00180                 dMaxOne = dMaxTwo = 0;
00181                 for( iMotif = k = 0; ( k < vecdSampleOne.size( ) ) && ( iMotif < m_iMotifs ); ++iMotif ) {
00182                     dOne = adOne[ iMotif ];
00183                     dTwo = adTwo[ iMotif ];
00184                     if( dOne || dTwo ) {
00185                         if( dOne > dMaxOne )
00186                             dMaxOne = dOne;
00187                         if( dTwo > dMaxTwo )
00188                             dMaxTwo = dTwo;
00189                         vecdSampleOne[ k ] = dOne;
00190                         vecdSampleTwo[ k++ ] = dTwo; } }
00191                 if( !k || !dMaxOne || !dMaxTwo || ( MeasurePearson.Measure( &vecdSampleOne[ 0 ], k,
00192                     &vecdSampleTwo[ 0 ], k, IMeasure::EMapNone, NULL, NULL ) < c_dCutoff ) )
00193                     continue;
00194 
00195                 iBin = veciBins[ j ];
00196                 for( k = 0; k < veciBins.size( ); ++k )
00197                     if( veciBins[ k ] == iBin )
00198                         veciBins[ k ] = veciBins[ i ]; } }
00199         fill( veciCounts.begin( ), veciCounts.end( ), 0 );
00200         for( i = 0; i < veciBins.size( ); ++i )
00201             veciCounts[ veciBins[ i ] ]++;
00202         for( i = 0; i < veciBins.size( ); ++i )
00203             vecdWeights[ i ] = 1.0f / veciCounts[ veciBins[ i ] ];
00204 
00205         for( i = 0; i < m_iGenes; ++i ) {
00206             if( vecdWeights[ i ] == 1 )
00207                 continue;
00208             g_CatSleipnir( ).info( "CCoalesceGeneScores::CalculateWeights( ) weighting gene %d, type %d to %g",
00209                 i, iType, vecdWeights[ i ] );
00210             for( j = ESubsequenceBegin; j < ESubsequenceEnd; ++j ) {
00211                 float*  adOne   = Get( iType, (ESubsequence)j, i );
00212 
00213                 if( adOne )
00214                     for( iMotif = 0; iMotif < GetMotifs( ); ++iMotif )
00215                         adOne[ iMotif ] *= vecdWeights[ i ]; } } }
00216 
00217     return true; }
00218 
00219 // CCoalesceGroupHistograms
00220 
00221 bool CCoalesceGroupHistograms::Add( const CCoalesceGeneScores& GeneScores, size_t iGene, bool fSubtract,
00222     uint32_t iMotifThem ) {
00223     size_t          iTypeThem, iTypeUs, iSubsequence;
00224     uint32_t        iMotifUs;
00225     unsigned short  sDelta;
00226     float           dValue;
00227 
00228     sDelta = fSubtract ? -1 : 1;
00229     for( iTypeThem = 0; iTypeThem < GeneScores.GetTypes( ); ++iTypeThem ) {
00230 // lock
00231         iTypeUs = AddType( GeneScores.GetType( iTypeThem ) );
00232 // unlock
00233         for( iSubsequence = ESubsequenceBegin; iSubsequence < ESubsequenceEnd; ++iSubsequence ) {
00234             CCoalesceHistogramSet<>&    Histograms  = Get( iTypeUs, (ESubsequence)iSubsequence );
00235             const float*                adScores    = GeneScores.Get( iTypeThem, (ESubsequence)iSubsequence,
00236                                                         iGene );
00237 
00238             if( !adScores )
00239                 continue;
00240 // lock
00241             if( !Histograms.GetMembers( ) ) {
00242                 vector<float>   vecdEdges;
00243 
00244                 vecdEdges.resize( m_iBins );
00245                 for( int i = 0; (size_t)i < vecdEdges.size( ); ++i )
00246                     vecdEdges[ i ] = ( i * m_dStep ) - ( m_dStep / 2 );
00247                 Histograms.Initialize( GeneScores.GetMotifs( ), vecdEdges ); }
00248 // unlock
00249             if( iMotifThem == -1 ) {
00250                 for( iMotifUs = 0; iMotifUs < GeneScores.GetMotifs( ); ++iMotifUs )
00251                     if( dValue = adScores[ iMotifUs ] )
00252                         Histograms.Add( iMotifUs, dValue, sDelta ); }
00253             else if( dValue = adScores[ iMotifThem ] )
00254                 Histograms.Add( iMotifThem, dValue, sDelta ); } }
00255 
00256     return true; }
00257 
00258 void CCoalesceGroupHistograms::Save( std::ostream& ostm, const CCoalesceMotifLibrary* pMotifs ) const {
00259     size_t      iType, iSubsequence;
00260     uint32_t    iMotif;
00261 
00262     for( iType = 0; iType < GetTypes( ); ++iType )
00263         for( iSubsequence = 0; iSubsequence < GetSubsequences( iType ); ++iSubsequence ) {
00264             const CCoalesceHistogramSet<>&  Histograms  = Get( iType, (ESubsequence)iSubsequence );
00265 
00266             if( !Histograms.GetTotal( ) )
00267                 continue;
00268             ostm << GetType( iType ) << '\t' << GetSubsequence( (ESubsequence)iSubsequence ) << endl;
00269             for( iMotif = 0; iMotif < ( pMotifs ? pMotifs->GetMotifs( ) : Histograms.GetMembers( ) ); ++iMotif ) {
00270                 if( Histograms.Get( iMotif, 0.0f ) == Histograms.GetTotal( ) )
00271                     continue;
00272                 if( pMotifs )
00273                     ostm << pMotifs->GetMotif( iMotif );
00274                 else
00275                     ostm << iMotif;
00276                 ostm << endl << Histograms.Save( iMotif ) << endl; } } }
00277 
00278 bool CCoalesceGroupHistograms::IsSimilar( const CCoalesceMotifLibrary* pMotifs, const SMotifMatch& sOne,
00279     const SMotifMatch& sTwo, float dPValue ) const {
00280     size_t  iTypeOne, iTypeTwo;
00281     double  dAveOne, dAverage, dZ, dP;
00282 
00283     if( ( ( iTypeOne = GetType( sOne.m_strType ) ) == -1 ) ||
00284         ( ( iTypeTwo = GetType( sTwo.m_strType ) ) == -1 ) )
00285         return false;
00286     {
00287         const CCoalesceHistogramSet<>&  HistOne = Get( iTypeOne, sOne.m_eSubsequence );
00288         const CCoalesceHistogramSet<>&  HistTwo = Get( iTypeTwo, sTwo.m_eSubsequence );
00289 
00290         dP = 1 - HistOne.CohensD( sOne.m_iMotif, HistTwo, sTwo.m_iMotif, false, dAveOne, dAverage, dZ );
00291         if( g_CatSleipnir( ).isDebugEnabled( ) )
00292             g_CatSleipnir( ).debug( "CCoalesceGroupHistograms::IsSimilar( %s, %s, %g ) got p-value %g",
00293                 sOne.Save( pMotifs ).c_str( ), sTwo.Save( pMotifs ).c_str( ), dPValue, dP );
00294         return ( dP < dPValue );
00295     } }
00296 
00297 // CCoalesce
00298 
00313 void CCoalesce::SetSeed( const CPCL& PCL ) {
00314     size_t  i;
00315 
00316     m_vecdSeed.resize( PCL.GetExperiments( ) );
00317     for( i = 0; i < m_vecdSeed.size( ); ++i )
00318         m_vecdSeed[i] = PCL.Get( 0, i ); }
00319 
00320 void CCoalesceImpl::Normalize( CPCL& PCL ) {
00321     static const double c_dLog2     = log( 2.0 );
00322     vector<float>       vecdMedian;
00323     vector<size_t>      veciSingle;
00324     size_t              i, j, k;
00325     float               d, dMedian;
00326 
00327     for( i = 0; i < PCL.GetExperiments( ); ++i ) {
00328         for( j = 0; j < PCL.GetGenes( ); ++j )
00329             if( !CMeta::IsNaN( d = PCL.Get( j, i ) ) && ( d < 0 ) )
00330                 break;
00331         if( j >= PCL.GetGenes( ) )
00332             veciSingle.push_back( i ); }
00333     if( veciSingle.empty( ) )
00334         return;
00335 
00336     vecdMedian.resize( veciSingle.size( ) );
00337     for( i = 0; i < PCL.GetGenes( ); ++i ) {
00338         for( j = k = 0; j < veciSingle.size( ); ++j )
00339             if( !CMeta::IsNaN( d = PCL.Get( i, veciSingle[ j ] ) ) )
00340                 vecdMedian[ k++ ] = d;
00341         if( !k )
00342             continue;
00343         dMedian = (float)CStatistics::Percentile( vecdMedian.begin( ), vecdMedian.begin( ) + k, 0.5 );
00344         for( j = 0; j < veciSingle.size( ); ++j )
00345             if( !CMeta::IsNaN( d = PCL.Get( i, k = veciSingle[ j ] ) ) )
00346                 PCL.Set( i, k, (float)( log( d / dMedian ) / c_dLog2 ) ); } }
00347 
00348 CCoalesceImpl::~CCoalesceImpl( ) {
00349 
00350     Clear( ); }
00351 
00352 void CCoalesceImpl::Clear( ) {
00353 
00354     if( m_fMotifs && m_pMotifs )
00355         delete m_pMotifs;
00356     m_pMotifs = NULL; }
00357 
00358 size_t CCoalesceImpl::GetMotifCount( ) const {
00359 
00360     return ( m_pMotifs ? m_pMotifs->GetMotifs( ) : 0 ); }
00361 
00362 void* CCoalesceImpl::ThreadCombineMotif( void* pData ) {
00363     SThreadCombineMotif*    psData;
00364     size_t                  i, j;
00365     vector<float>           vecdScores;
00366     vector<size_t>          veciLengths;
00367 
00368     psData = (SThreadCombineMotif*)pData;
00369     SCoalesceModifierCache  sModifiers( *psData->m_psModifiers );
00370 
00371     for( i = psData->m_iOffset; i < psData->m_pveciPCL2FASTA->size( ); i += psData->m_iStep )
00372         if( (*psData->m_pveciPCL2FASTA)[ i ] != -1 ) {
00373             vector<SFASTASequence>  vecsSequences;
00374 
00375             if( psData->m_pFASTA->Get( (*psData->m_pveciPCL2FASTA)[ i ], vecsSequences ) ) {
00376                 sModifiers.Get( i );
00377                 for( j = 0; j < vecsSequences.size( ); ++j )
00378                     if( !psData->m_pGeneScores->Add( i, *psData->m_pMotifs, vecsSequences[ j ],
00379                         sModifiers, psData->m_iMotif, vecdScores, veciLengths ) )
00380                         break; } }
00381 
00382     return NULL; }
00383 
00384 bool CCoalesceImpl::CombineMotifs( const CFASTA& FASTA, const vector<size_t>& veciPCL2FASTA,
00385     SCoalesceModifiers& sModifiers, const CCoalesceCluster& Cluster, size_t iThreads,
00386     CCoalesceGeneScores& GeneScores, CCoalesceGroupHistograms& HistsCluster,
00387     CCoalesceGroupHistograms& HistsPot ) const {
00388     set<SMotifMatch>::const_iterator    iterMotifOne, iterMotifTwo;
00389     uint32_t                            iMotif;
00390     size_t                              i;
00391     vector<pthread_t>                   vecpthdThreads;
00392     vector<SThreadCombineMotif>         vecsThreads;
00393 
00394     if( !GeneScores.GetMotifs( ) || !m_pMotifs || ( Cluster.GetMotifs( ).size( ) > m_iSizeMerge ) )
00395         return true;
00396 
00397     for( iterMotifOne = Cluster.GetMotifs( ).begin( ); iterMotifOne != Cluster.GetMotifs( ).end( );
00398         ++iterMotifOne )
00399         for( iterMotifTwo = Cluster.GetMotifs( ).begin( ); iterMotifTwo != Cluster.GetMotifs( ).end( );
00400             ++iterMotifTwo ) {
00401             if( ( iterMotifOne->m_iMotif == iterMotifTwo->m_iMotif ) ||
00402                 !HistsCluster.IsSimilar( m_pMotifs, *iterMotifOne, *iterMotifTwo, m_dPValueMerge ) ||
00403                 ( ( iMotif = m_pMotifs->Merge( iterMotifOne->m_iMotif, iterMotifTwo->m_iMotif,
00404                 m_dCutoffMerge, false ) ) == -1 ) )
00405                 continue;
00406 
00407             vecpthdThreads.resize( iThreads );
00408             vecsThreads.resize( vecpthdThreads.size( ) );
00409             for( i = 0; i < vecsThreads.size( ); ++i ) {
00410                 vecsThreads[ i ].m_iOffset = i;
00411                 vecsThreads[ i ].m_iStep = vecsThreads.size( );
00412                 vecsThreads[ i ].m_pveciPCL2FASTA = &veciPCL2FASTA;
00413                 vecsThreads[ i ].m_pGeneScores = &GeneScores;
00414                 vecsThreads[ i ].m_pMotifs = m_pMotifs;
00415                 vecsThreads[ i ].m_iMotif = iMotif;
00416                 vecsThreads[ i ].m_pFASTA = &FASTA;
00417                 vecsThreads[ i ].m_psModifiers = &sModifiers;
00418                 if( pthread_create( &vecpthdThreads[ i ], NULL, ThreadCombineMotif, &vecsThreads[ i ] ) ) {
00419                     g_CatSleipnir( ).error( "CCoalesceImpl::CombineMotifs( %d ) could not combine motif: %s",
00420                         iThreads, m_pMotifs->GetMotif( iMotif ).c_str( ) );
00421                     return false; } }
00422             for( i = 0; i < vecpthdThreads.size( ); ++i )
00423                 pthread_join( vecpthdThreads[ i ], NULL );
00424             for( i = 0; i < veciPCL2FASTA.size( ); ++i )
00425                 if( veciPCL2FASTA[ i ] != -1 ) {
00426                     if( Cluster.IsGene( i ) )
00427                         HistsCluster.Add( GeneScores, i, false, iMotif );
00428                     else
00429                         HistsPot.Add( GeneScores, i, false, iMotif ); } }
00430 
00431     return true; }
00432 
00433 bool CCoalesceImpl::InitializeDatasets( const CPCL& PCL ) {
00434     size_t          i, j;
00435     vector<bool>    vecfDataset;
00436 
00437     vecfDataset.resize( PCL.GetExperiments( ) );
00438     for( i = 0; i < m_vecsDatasets.size( ); ++i ) {
00439         SCoalesceDataset&   sDataset    = m_vecsDatasets[ i ];
00440 
00441         for( j = 0; j < sDataset.GetConditions( ); ++j )
00442             vecfDataset[ sDataset.GetCondition( j ) ] = true; }
00443     for( i = 0; i < vecfDataset.size( ); ++i )
00444         if( !vecfDataset[ i ] )
00445             m_vecsDatasets.push_back( SCoalesceDataset( i ) );
00446 // This has to be done separately because of a weird optimization bug in GCC
00447 // If called in the same scope as the constructor, it's optimized to the wrong order and a double free
00448     for( i = 0; i < m_vecsDatasets.size( ); ++i )
00449         m_vecsDatasets[ i ].CalculateCovariance( PCL );
00450 
00451     return true; }
00452 
00453 bool CCoalesceImpl::InitializeGeneScores( const CPCL& PCL, const CFASTA& FASTA, vector<size_t>& veciPCL2FASTA,
00454     SCoalesceModifiers& sMods, CCoalesceGeneScores& GeneScores ) {
00455     size_t  i, j;
00456 
00457     if( FASTA.GetGenes( ) ) {
00458         veciPCL2FASTA.resize( PCL.GetGenes( ) );
00459         for( i = 0; i < veciPCL2FASTA.size( ); ++i )
00460             veciPCL2FASTA[ i ] = FASTA.GetGene( PCL.GetGene( i ) ); }
00461     sMods.Initialize( PCL );
00462     if( FASTA.GetGenes( ) ) {
00463         vector<vector<float> >  vecvecdCounts;
00464         vector<size_t>          veciLengths;
00465         SCoalesceModifierCache  sModifiers( sMods );
00466         vector<float>           vecdMotifs;
00467         uint32_t                iMotif;
00468         size_t                  iCount, iType, iSubsequence, iGene;
00469         float*                  ad;
00470 
00471         if( !m_pMotifs ) {
00472             m_fMotifs = true;
00473             m_pMotifs = new CCoalesceMotifLibrary( m_iK ); }
00474         GeneScores.SetGenes( PCL.GetGenes( ) );
00475         for( i = 0; i < veciPCL2FASTA.size( ); ++i )
00476             if( veciPCL2FASTA[ i ] != -1 ) {
00477                 vector<SFASTASequence>  vecsSequences;
00478 
00479                 if( FASTA.Get( veciPCL2FASTA[ i ], vecsSequences ) ) {
00480                     sModifiers.Get( i );
00481                     for( j = 0; j < vecsSequences.size( ); ++j )
00482                         if( !GeneScores.Add( i, *m_pMotifs, vecsSequences[ j ], sModifiers, vecvecdCounts,
00483                             veciLengths ) )
00484                             return false; } }
00485 
00486         vecdMotifs.resize( GeneScores.GetMotifs( ) );
00487         for( iCount = iType = 0; iType < GeneScores.GetTypes( ); ++iType )
00488             for( iSubsequence = 0; iSubsequence < GeneScores.GetSubsequences( iType ); ++iSubsequence ) {
00489                 for( iGene = 0; iGene < PCL.GetGenes( ); ++iGene ) {
00490                     if( !( ad = GeneScores.Get( iType, (CCoalesceSequencerBase::ESubsequence)iSubsequence, iGene ) ) )
00491                         continue;
00492                     iCount++;
00493                     for( iMotif = 0; iMotif < GeneScores.GetMotifs( ); ++iMotif )
00494                         vecdMotifs[iMotif] += ad[iMotif]; }
00495                 for( iMotif = 0; iMotif < vecdMotifs.size( ); ++iMotif )
00496                     vecdMotifs[iMotif] /= iCount;
00497                 for( iGene = 0; iGene < PCL.GetGenes( ); ++iGene ) {
00498                     if( ad = GeneScores.Get( iType, (CCoalesceSequencerBase::ESubsequence)iSubsequence, iGene ) )
00499                         for( iMotif = 0; iMotif < GeneScores.GetMotifs( ); ++iMotif )
00500                             ad[iMotif] += vecdMotifs[iMotif]; } } }
00501 
00502     if( !GeneScores.GetMotifs( ) )
00503         Clear( );
00504 
00505 // Disabled because it doesn't really work very well at all.
00506     return true; } // GeneScores.CalculateWeights( ); }
00507 
00539 bool CCoalesce::Cluster( const CPCL& PCL, const CFASTA& FASTA, vector<CCoalesceCluster>& vecClusters ) {
00540     static const float          c_dEpsilon  = 1e-10f;
00541     CPCL                        PCLCopy;
00542     size_t                      i;
00543     vector<size_t>              veciPCL2FASTA;
00544     CCoalesceGeneScores         GeneScores;
00545     set<pair<size_t, size_t> >  setpriiSeeds;
00546     SCoalesceModifiers          sModifiers;
00547     float                       dFailure, dProbability;
00548 
00549     for( i = 0; i < m_vecpWiggles.size( ); ++i )
00550         sModifiers.Add( m_vecpWiggles[ i ] );
00551     PCLCopy.Open( PCL );
00552     if( GetNormalize( ) )
00553         Normalize( PCLCopy );
00554     if( !( InitializeDatasets( PCLCopy ) && InitializeGeneScores( PCLCopy, FASTA, veciPCL2FASTA, sModifiers,
00555         GeneScores ) ) )
00556         return false;
00557     if( g_CatSleipnir( ).isNoticeEnabled( ) ) {
00558         size_t          iSequences;
00559         ostringstream   ossm;
00560 
00561         for( iSequences = i = 0; i < veciPCL2FASTA.size( ); ++i )
00562             if( veciPCL2FASTA[ i ] != -1 )
00563                 iSequences++;
00564         g_CatSleipnir( ).notice( "CCoalesce::Cluster( ) running with %d genes, %d conditions, and %d sequences",
00565             PCL.GetGenes( ), PCL.GetExperiments( ), iSequences );
00566         for( i = 0; i < PCL.GetExperiments( ); ++i )
00567             g_CatSleipnir( ).notice( PCL.GetExperiment( i ).c_str( ) );
00568 //          ossm << ( i ? "\t" : "" ) << PCL.GetExperiment( i );
00569 //      g_CatSleipnir( ).notice( ossm.str( ).c_str( ) );
00570         g_CatSleipnir( ).notice( "k %d, P gene %g, p condition %g, z condition %g, p motif %g, z motif %g, p correlation %g", GetK( ),
00571             GetProbabilityGene( ), GetPValueCondition( ), GetZScoreCondition( ), GetPValueMotif( ),
00572             GetZScoreMotif( ), GetPValueCorrelation( ) );
00573         g_CatSleipnir( ).notice( "p merge %g, cutoff merge %g, penalty gap %g, penalty mismatch %g",
00574             GetPValueMerge( ), GetCutoffMerge( ), GetMotifs( ) ? GetMotifs( )->GetPenaltyGap( ) : 0,
00575             GetMotifs( ) ? GetMotifs( )->GetPenaltyMismatch( ) : 0 );
00576         g_CatSleipnir( ).notice( "correlation pairs %d, bases %d, min size %d, merge size %d, max size %d",
00577             GetNumberCorrelation( ), GetBasesPerMatch( ), GetSizeMinimum( ), GetSizeMerge( ),
00578             GetSizeMaximum( ) ); }
00579     for( dFailure = 1; dFailure >= c_dEpsilon; dFailure *= max( pow( c_dEpsilon, 0.125 ), (double)GetPValueCorrelation( ) ) ) {
00580         CCoalesceCluster            Cluster, Pot;
00581         CCoalesceGroupHistograms    HistsCluster( GetBins( ), 1.0f / GetBasesPerMatch( ) );
00582         CCoalesceGroupHistograms    HistsPot( GetBins( ), 1.0f / GetBasesPerMatch( ) );
00583 
00584         Pot.SetGenes( PCLCopy.GetGenes( ) );
00585         Pot.CalculateHistograms( GeneScores, HistsPot, NULL );
00586         if( !Cluster.Initialize( PCLCopy, Pot, m_vecsDatasets, setpriiSeeds, m_vecdSeed,
00587             GetNumberCorrelation( ), GetPValueCorrelation( ), GetProbabilityGene( ), GetThreads( ) ) )
00588             continue;
00589         m_vecdSeed.clear( );
00590         g_CatSleipnir( ).notice( "CCoalesce::Cluster( ) initialized %d genes", Cluster.GetGenes( ).size( ) );
00591         if( g_CatSleipnir( ).isDebugEnabled( ) ) {
00592             ostringstream               ossm;
00593             set<size_t>::const_iterator iterGene;
00594 
00595             ossm << "CCoalesce::Cluster( ) initialized:";
00596             for( iterGene = Cluster.GetGenes( ).begin( ); iterGene != Cluster.GetGenes( ).end( ); ++iterGene )
00597                 ossm << ' ' << PCL.GetGene( *iterGene );
00598             g_CatSleipnir( ).debug( ossm.str( ).c_str( ) ); }
00599         if( Cluster.GetGenes( ).size( ) < GetSizeMinimum( ) )
00600             continue;
00601         dProbability = 1 - GetProbabilityGene( );
00602         while( !( Cluster.IsConverged( ) || Cluster.IsEmpty( ) ) ) {
00603             Cluster.CalculateHistograms( GeneScores, HistsCluster, &HistsPot );
00604             Cluster.Snapshot( GeneScores, HistsCluster );
00605             Pot.Snapshot( GeneScores, HistsPot );
00606             if( !Cluster.SelectConditions( PCLCopy, Pot, GetThreads( ), GetPValueCondition( ),
00607                 GetZScoreCondition( ) ) )
00608                 return false;
00609             if( Cluster.IsEmpty( ) ) {
00610                 g_CatSleipnir( ).notice( "CCoalesce::Cluster( ) selected no conditions" );
00611                 break; }
00612             if( ( Cluster.GetGenes( ).size( ) >= GetSizeMinimum( ) ) &&
00613                 !( CombineMotifs( FASTA, veciPCL2FASTA, sModifiers, Cluster, GetThreads( ), GeneScores,
00614                 HistsCluster, HistsPot ) && Cluster.SelectMotifs( HistsCluster, HistsPot, GetPValueMotif( ),
00615                 GetZScoreMotif( ),GetSizeMaximum( ), GetThreads( ), GetMotifs( ) ) ) )
00616                 return false;
00617             if( !Cluster.SelectGenes( PCLCopy, GeneScores, HistsCluster, HistsPot, GetSizeMinimum( ),
00618                 GetThreads( ), Pot, 1 - dProbability, GetMotifs( ) ) )
00619                 return false;
00620             dProbability *= GetProbabilityGene( );
00621             g_CatSleipnir( ).notice( "CCoalesce::Cluster( ) processed %d genes, %d datasets, %d motifs",
00622                 Cluster.GetGenes( ).size( ), Cluster.GetDatasets( ).size( ), Cluster.GetMotifs( ).size( ) ); }
00623         if( Cluster.IsConverged( ) && ( Cluster.GetGenes( ).size( ) >= GetSizeMinimum( ) ) ) {
00624             g_CatSleipnir( ).notice( "CCoalesce::Cluster( ) finalizing cluster" );
00625             setpriiSeeds.clear( );
00626             if( Cluster.GetMotifs( ).size( ) >= GetSizeMaximum( ) ) {
00627                 if( !Cluster.SelectMotifs( HistsCluster, HistsPot, GetPValueMotif( ), GetZScoreMotif( ), -1,
00628                     GetThreads( ), GetMotifs( ) ) )
00629                     return false;
00630                 g_CatSleipnir( ).notice( "CCoalesce::Cluster( ) finalized %d motifs",
00631                     Cluster.GetMotifs( ).size( ) ); }
00632             if( IsDirectoryIntermediate( ) )
00633                 Cluster.Save( GetDirectoryIntermediate( ), vecClusters.size( ), PCLCopy, GetMotifs( ) );
00634             for( i = 0; i < m_vecpostm.size( ); ++i )
00635                 if( m_vecpostm[ i ] )
00636                     Cluster.Save( *m_vecpostm[ i ], vecClusters.size( ), PCLCopy, GetMotifs( ) );
00637             vecClusters.push_back( Cluster );
00638             Cluster.Subtract( PCLCopy, Pot );
00639             Cluster.Subtract( GeneScores );
00640             for( i = 0; i < m_vecsDatasets.size( ); ++i )
00641                 m_vecsDatasets[ i ].CalculateCovariance( PCLCopy );
00642             dFailure = 1; }
00643         else
00644             g_CatSleipnir( ).notice( "CCoalesce::Cluster( ) discarding cluster (failure %g)", dFailure ); }
00645 
00646     return true; }
00647 
00648 }