Sleipnir
src/coalescemotifs.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "coalescemotifs.h"
00024 #include "coalescestructsi.h"
00025 #include "pst.h"
00026 #include "measure.h"
00027 #include "statistics.h"
00028 
00029 namespace Sleipnir {
00030 
00031 const char  CCoalesceMotifLibraryImpl::c_szBases[]          = "ACGT";
00032 const char  CCoalesceMotifLibraryImpl::c_szComplements[]    = "TGCA";
00033 
00034 void CCoalesceMotifLibraryImpl::SKnowns::Match( const std::vector<float>& vecdMotif,
00035     SMotifMatch::EType eMatchType, std::map<std::string, float>& mapstrdKnown ) const {
00036     static const size_t                 c_iOverlap  = 5;
00037     map<string, TVecPr>::const_iterator iterKnown;
00038     size_t                              i, j, k, iMin, iMax, iTests;
00039     float                               dP, dR, dMin;
00040     const float*                        adMotif;
00041     const float*                        adPWM;
00042     const float*                        adRC;
00043 
00044     for( iterKnown = m_mapstrvecKnown.begin( ); iterKnown != m_mapstrvecKnown.end( ); ++iterKnown ) {
00045         dMin = FLT_MAX;
00046         for( iTests = i = 0; i < iterKnown->second.size( ); ++i ) {
00047             const vector<float>&    vecdPWM = iterKnown->second[ i ].first;
00048             const vector<float>&    vecdRC  = iterKnown->second[ i ].second;
00049 
00050             iMin = strlen( c_szBases ) * c_iOverlap;
00051             iMax = vecdMotif.size( ) + vecdPWM.size( ) - iMin;
00052             if( iMax < iMin )
00053                 continue;
00054             for( j = iMin; j <= iMax; j += strlen( c_szBases ) ) {
00055                 if( j < vecdMotif.size( ) ) {
00056                     adMotif = &vecdMotif[ vecdMotif.size( ) - j - 1 ];
00057                     adPWM = &vecdPWM.front( );
00058                     adRC = &vecdRC.front( );
00059                     k = min( vecdPWM.size( ), j ); }
00060                 else {
00061                     adMotif = &vecdMotif.front( );
00062                     k = j - vecdMotif.size( );
00063                     adPWM = &vecdPWM[ k ];
00064                     adRC = &vecdRC[ k ];
00065                     k = min( vecdMotif.size( ), vecdPWM.size( ) - k ); }
00066                 switch( eMatchType ) {
00067                     case SMotifMatch::ETypeRMSE:
00068                         dP = (float)min( CStatistics::RootMeanSquareError( adMotif, adMotif + k, adPWM,
00069                             adPWM + k ), CStatistics::RootMeanSquareError( adMotif, adMotif + k, adRC,
00070                             adRC + k ) );
00071                         break;
00072 
00073                     case SMotifMatch::ETypeJensenShannon:
00074                         dP = (float)min( CStatistics::JensenShannonDivergence( adMotif, adMotif + k, adPWM,
00075                             adPWM + k ), CStatistics::JensenShannonDivergence( adMotif, adMotif + k, adRC,
00076                             adRC + k ) );
00077                         break;
00078 
00079                     case SMotifMatch::ETypePValue:
00080                         iTests += 2;
00081                         if( ( dR = (float)max( CMeasurePearson::Pearson( adMotif, k, adPWM, k,
00082                             IMeasure::EMapNone ), CMeasurePearson::Pearson( adMotif, k, adRC, k,
00083                             IMeasure::EMapNone ) ) ) > 0 )
00084                             dP = (float)CStatistics::PValuePearson( dR, k );
00085                         else
00086                             dP = FLT_MAX;
00087                         break; }
00088                 if( dP < dMin )
00089                     dMin = dP; } }
00090         if( dMin != FLT_MAX )
00091             mapstrdKnown[ iterKnown->first ] = dMin * max( 1, iTests ); } }
00092 
00118 bool CCoalesceMotifLibrary::Open( std::istream& istm, std::vector<SMotifMatch>& vecsMotifs,
00119     CCoalesceMotifLibrary* pMotifs ) {
00120     string  strBuffer;
00121     size_t  i;
00122 
00123     strBuffer.resize( CFile::GetBufferSize( ) );
00124     while( CFile::IsNewline( istm.peek( ) ) )
00125         istm.getline( &strBuffer[ 0 ], strBuffer.size( ) );
00126     while( !istm.eof( ) && ( istm.peek( ) != c_cCluster ) ) {
00127         SMotifMatch sMotif;
00128 
00129         if( pMotifs ) {
00130             if( !sMotif.Open( istm, *pMotifs ) )
00131                 break;
00132             vecsMotifs.push_back( sMotif );
00133             if( isdigit( istm.peek( ) ) )
00134                 for( i = 0; i < 4; ++i )
00135                     istm.getline( &strBuffer[ 0 ], strBuffer.size( ) ); }
00136         else
00137             istm.getline( &strBuffer[ 0 ], strBuffer.size( ) );
00138         while( CFile::IsNewline( istm.peek( ) ) )
00139             istm.getline( &strBuffer[ 0 ], strBuffer.size( ) ); }
00140 
00141     return true; }
00142 
00143 CCoalesceMotifLibraryImpl::~CCoalesceMotifLibraryImpl( ) {
00144     size_t  i;
00145 
00146     for( i = 0; i < m_vecpPSTs.size( ); ++i )
00147         delete m_vecpPSTs[ i ]; }
00148 
00178 float CCoalesceMotifLibrary::GetMatch( const std::string& strSequence, uint32_t iMotif, size_t iOffset,
00179     SCoalesceModifierCache& sModifiers ) const {
00180     size_t      i, iDepth;
00181     float       d, dRet;
00182     const CPST* pPST;
00183     string      strMotif, strRC;
00184     EType       eType;
00185 
00186     switch( eType = GetType( iMotif ) ) {
00187         case ETypePST:
00188             if( !( pPST = GetPST( iMotif ) ) ) {
00189                 g_CatSleipnir( ).error( "CCoalesceMotifLibrary::GetMatch( %s, %d, %d ) could not find PST",
00190                     strSequence.c_str( ), iMotif, iOffset );
00191                 return CMeta::GetNaN( ); }
00192             break;
00193 
00194         case ETypeKMer:
00195             strMotif = GetMotif( iMotif );
00196             break;
00197 
00198         case ETypeRC:
00199             strMotif = GetRCOne( iMotif );
00200             strRC = GetReverseComplement( strMotif );
00201             break; }
00202 
00203     iDepth = ( eType == ETypePST ) ? pPST->GetDepth( ) : GetK( );
00204     sModifiers.InitializeWeight( 0, 0 );
00205     dRet = 0;
00206     for( i = 1; i < iDepth; ++i ) {
00207         sModifiers.AddWeight( 0, iOffset, i - 1 );
00208         if( eType == ETypePST )
00209             dRet += pPST->GetMatch( strSequence, iDepth - i ) * sModifiers.GetWeight( i ) / i; }
00210     sModifiers.AddWeight( 0, iOffset, i - 1 );
00211     for( i = 0; i < strSequence.length( ); ++i ) {
00212         switch( eType ) {
00213             case ETypePST:
00214                 d = pPST->GetMatch( strSequence.substr( i ) ) * sModifiers.GetWeight( iDepth ) / iDepth;
00215                 break;
00216 
00217             case ETypeKMer:
00218             case ETypeRC:
00219                 d = ( strSequence.compare( i, iDepth, strMotif ) &&
00220                     ( strRC.empty( ) || strSequence.compare( i, iDepth, strRC ) ) ) ? 0 :
00221                     sModifiers.GetWeight( iDepth );
00222                 break; }
00223         dRet += d;
00224         sModifiers.AddWeight( iDepth, iOffset, i ); }
00225     if( CMeta::IsNaN( dRet ) || ( dRet < 0 ) ) {
00226         g_CatSleipnir( ).error( "CCoalesceMotifLibrary::GetMatch( %s, %d, %d ) found negative score: %g",
00227             strSequence.c_str( ), iMotif, iOffset, dRet );
00228         return CMeta::GetNaN( ); }
00229 
00230     return dRet; }
00231 
00232 std::string CCoalesceMotifLibraryImpl::GetMotif( uint32_t iMotif ) const {
00233     std::string strKMer;
00234 
00235 // kmer
00236     if( iMotif < GetKMers( ) )
00237         return ID2KMer( iMotif, m_iK );
00238 // reverse complement
00239     if( iMotif < GetBasePSTs( ) ) {
00240         strKMer = GetRCOne( iMotif );
00241         return ( strKMer + c_cSeparator + GetReverseComplement( strKMer ) ); }
00242 // pst
00243     return GetPST( iMotif )->GetMotif( ); }
00244 
00245 CPST* CCoalesceMotifLibraryImpl::CreatePST( uint32_t& iMotif ) {
00246     CPST*   pRet;
00247 
00248     iMotif = GetBasePSTs( ) + GetPSTs( );
00249     m_vecpPSTs.push_back( pRet = new CPST( strlen( c_szBases ) ) );
00250     return pRet; }
00251 
00265 uint32_t CCoalesceMotifLibrary::Open( const std::string& strMotif ) {
00266     uint32_t    iMotif;
00267     CPST*       pPST;
00268 
00269     if( strMotif.length( ) == GetK( ) && !IsIgnorableKMer( strMotif ) )
00270         return KMer2ID( strMotif );
00271     if( ( strMotif.length( ) == ( ( 2 * GetK( ) ) + 1 ) ) &&
00272         !IsIgnorableKMer( strMotif.substr( 0, GetK( ) ) ) &&
00273         ( strMotif.substr( 0, GetK( ) ) == GetReverseComplement( strMotif.substr( GetK( ) + 1 ) ) ) &&
00274         ( ( iMotif = KMer2ID( strMotif.substr( 0, GetK( ) ) ) ) != -1 ) )
00275         return ( GetBaseRCs( ) + m_veciKMer2RC[ iMotif ] );
00276 
00277     pPST = CreatePST( iMotif );
00278     if( !pPST->Open( strMotif ) ) {
00279         delete pPST;
00280         m_vecpPSTs.pop_back( );
00281         return -1; }
00282 
00283     return iMotif; }
00284 
00285 float CCoalesceMotifLibraryImpl::AlignKMers( const std::string& strOne, const std::string& strTwo,
00286     float dCutoff ) const {
00287     int iOffset;
00288 
00289     return Align( strOne, strTwo, dCutoff, iOffset ); }
00290 
00291 uint32_t CCoalesceMotifLibraryImpl::MergeKMers( const std::string& strOne, const std::string& strTwo,
00292     float dCutoff, bool fAllowDuplicates ) {
00293     int         iOffset;
00294     float       dScore;
00295     uint32_t    iRet;
00296     CPST*       pPST;
00297 
00298     if( ( ( dScore = Align( strOne, strTwo, dCutoff, iOffset ) ) > dCutoff ) ||
00299         !( fAllowDuplicates || dScore ) )
00300         return -1;
00301 
00302     pPST = CreatePST( iRet );
00303     pPST->Add( strOne, strTwo, iOffset );
00304     if( g_CatSleipnir( ).isInfoEnabled( ) )
00305         g_CatSleipnir( ).info( "CCoalesceMotifLibraryImpl::MergeKMers( %s, %s, %g ) merged at %g to %s",
00306             strOne.c_str( ), strTwo.c_str( ), dCutoff, dScore, pPST->GetMotif( ).c_str( ) );
00307     return iRet; }
00308 
00309 float CCoalesceMotifLibraryImpl::AlignKMerRC( const std::string& strKMer, uint32_t iRC, float dCutoff ) const {
00310     string  strOne, strTwo;
00311     float   dOne, dTwo;
00312     int     iOne, iTwo;
00313 
00314     strOne = GetRCOne( iRC );
00315     strTwo = GetReverseComplement( strOne );
00316     dOne = Align( strKMer, strOne, dCutoff, iOne );
00317     dTwo = Align( strKMer, strTwo, dCutoff, iTwo );
00318 
00319     return min( dOne, dTwo ); }
00320 
00321 uint32_t CCoalesceMotifLibraryImpl::MergeKMerRC( uint32_t iKMer, uint32_t iRC, float dCutoff,
00322     bool fAllowDuplicates ) {
00323     string      strKMer, strOne, strTwo;
00324     float       dOne, dTwo, dMin;
00325     int         iOne, iTwo;
00326     uint32_t    iRet;
00327     CPST*       pPST;
00328 
00329     if( m_veciKMer2RC[ iKMer ] == iRC )
00330         return -1;
00331     strKMer = GetMotif( iKMer );
00332     strOne = GetRCOne( iRC );
00333     strTwo = GetReverseComplement( strOne );
00334     dOne = Align( strKMer, strOne, dCutoff, iOne );
00335     dTwo = Align( strKMer, strTwo, dCutoff, iTwo );
00336     dMin = min( dOne, dTwo );
00337     if( ( dMin > dCutoff ) || !( fAllowDuplicates || dMin ) )
00338         return -1;
00339 
00340     pPST = CreatePST( iRet );
00341     if( dOne < dTwo ) {
00342         pPST->Add( strKMer, strOne, iOne );
00343         pPST->Add( strTwo ); }
00344     else {
00345         pPST->Add( strKMer, strTwo, iTwo );
00346         pPST->Add( strOne ); }
00347     if( g_CatSleipnir( ).isInfoEnabled( ) )
00348         g_CatSleipnir( ).info( "CCoalesceMotifLibraryImpl::MergeKMerRC( %s, %s, %g ) merged at %g to %s",
00349             strKMer.c_str( ), GetMotif( iRC ).c_str( ), dCutoff, min( dOne, dTwo ),
00350             pPST->GetMotif( ).c_str( ) );
00351     return iRet; }
00352 
00353 struct SCrossRCs {
00354     string  m_strOne;
00355     string  m_strTwo;
00356     float   m_dScore;
00357     int     m_iOffset;
00358 };
00359 
00360 float CCoalesceMotifLibraryImpl::AlignRCs( uint32_t iOne, uint32_t iTwo, float dCutoff ) const {
00361     SCrossRCs   asCrosses[ 4 ];
00362     size_t      i;
00363     float       dMin;
00364 
00365     asCrosses[ 0 ].m_strOne = asCrosses[ 1 ].m_strOne = GetRCOne( iOne );
00366     asCrosses[ 0 ].m_strTwo = asCrosses[ 2 ].m_strTwo = GetRCOne( iTwo );
00367     asCrosses[ 1 ].m_strTwo = asCrosses[ 3 ].m_strTwo = GetReverseComplement( asCrosses[ 0 ].m_strTwo );
00368     asCrosses[ 2 ].m_strOne = asCrosses[ 3 ].m_strOne = GetReverseComplement( asCrosses[ 0 ].m_strOne );
00369     dMin = FLT_MAX;
00370     for( i = 0; i < ARRAYSIZE(asCrosses); ++i ) {
00371         asCrosses[ i ].m_dScore = Align( asCrosses[ i ].m_strOne, asCrosses[ i ].m_strTwo, dCutoff,
00372             asCrosses[ i ].m_iOffset );
00373         if( asCrosses[ i ].m_dScore < dMin )
00374             dMin = asCrosses[ i ].m_dScore; }
00375 
00376     return dMin; }
00377 
00378 uint32_t CCoalesceMotifLibraryImpl::MergeRCs( uint32_t iOne, uint32_t iTwo, float dCutoff,
00379     bool fAllowDuplicates ) {
00380     SCrossRCs   asCrosses[ 4 ];
00381     uint32_t    iRet;
00382     CPST*       pPST;
00383     size_t      i, iMin;
00384     float       dMin;
00385 
00386     asCrosses[ 0 ].m_strOne = asCrosses[ 1 ].m_strOne = GetRCOne( iOne );
00387     asCrosses[ 0 ].m_strTwo = asCrosses[ 2 ].m_strTwo = GetRCOne( iTwo );
00388     asCrosses[ 1 ].m_strTwo = asCrosses[ 3 ].m_strTwo = GetReverseComplement( asCrosses[ 0 ].m_strTwo );
00389     asCrosses[ 2 ].m_strOne = asCrosses[ 3 ].m_strOne = GetReverseComplement( asCrosses[ 0 ].m_strOne );
00390     dMin = FLT_MAX;
00391     for( iMin = i = 0; i < ARRAYSIZE(asCrosses); ++i ) {
00392         asCrosses[ i ].m_dScore = Align( asCrosses[ i ].m_strOne, asCrosses[ i ].m_strTwo, dCutoff,
00393             asCrosses[ i ].m_iOffset );
00394         if( asCrosses[ i ].m_dScore < dMin ) {
00395             dMin = asCrosses[ i ].m_dScore;
00396             iMin = i; } }
00397     if( ( dMin > dCutoff ) || !( fAllowDuplicates || dMin ) )
00398         return -1;
00399 
00400     pPST = CreatePST( iRet );
00401     pPST->Add( asCrosses[ iMin ].m_strOne, asCrosses[ iMin ].m_strTwo, asCrosses[ iMin ].m_iOffset );
00402     {
00403         CPST    PST( strlen( c_szBases ) );
00404 
00405         PST.Add( asCrosses[ ( iMin + 1 ) % ARRAYSIZE(asCrosses) ].m_strTwo,
00406             asCrosses[ ( iMin + 2 ) % ARRAYSIZE(asCrosses) ].m_strOne, asCrosses[ iMin ].m_iOffset );
00407         pPST->Add( PST );
00408     }
00409     if( g_CatSleipnir( ).isInfoEnabled( ) )
00410         g_CatSleipnir( ).info( "CCoalesceMotifLibraryImpl::MergeRCs( %s, %s, %g ) merged at %g to %s",
00411             GetMotif( iOne ).c_str( ), GetMotif( iTwo ).c_str( ), dCutoff, dMin, pPST->GetMotif( ).c_str( ) );
00412     return iRet; }
00413 
00414 float CCoalesceMotifLibraryImpl::AlignKMerPST( const std::string& strKMer, const CPST& PSTIn,
00415     float dCutoff ) const {
00416     int iOffset;
00417 
00418     return PSTIn.Align( strKMer, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ); }
00419 
00420 uint32_t CCoalesceMotifLibraryImpl::MergeKMerPST( const std::string& strKMer, const CPST& PSTIn,
00421     float dCutoff, bool fAllowDuplicates ) {
00422     int         iOffset;
00423     float       dScore;
00424     uint32_t    iRet;
00425     CPST*       pPSTOut;
00426 
00427     if( ( ( dScore = PSTIn.Align( strKMer, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ) ) >
00428         dCutoff ) || !( fAllowDuplicates || dScore ) )
00429         return -1;
00430 
00431     pPSTOut = CreatePST( iRet );
00432     pPSTOut->Add( strKMer, PSTIn, iOffset );
00433     if( g_CatSleipnir( ).isInfoEnabled( ) ) {
00434         ostringstream   ossm;
00435 
00436         ossm << "CCoalesceMotifLibraryImpl::MergeKMerPST( " << strKMer << ", " << PSTIn.GetMotif( ) <<
00437             ", " << dCutoff << " ) merged at " << dScore << " to " << pPSTOut->GetMotif( );
00438         g_CatSleipnir( ).info( ossm.str( ).c_str( ) ); }
00439     return iRet; }
00440 
00441 float CCoalesceMotifLibraryImpl::AlignRCPST( uint32_t iRC, const CPST& PSTIn, float dCutoff ) const {
00442     int     iOne, iTwo;
00443     string  strOne, strTwo;
00444     float   dOne, dTwo;
00445 
00446     strOne = GetRCOne( iRC );
00447     strTwo = GetReverseComplement( strOne );
00448     dOne = PSTIn.Align( strOne, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOne );
00449     dTwo = PSTIn.Align( strTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iTwo );
00450 
00451     return min( dOne, dTwo ); }
00452 
00453 uint32_t CCoalesceMotifLibraryImpl::MergeRCPST( uint32_t iRC, const CPST& PSTIn, float dCutoff,
00454     bool fAllowDuplicates ) {
00455     int         iOne, iTwo;
00456     uint32_t    iRet;
00457     CPST*       pPSTOut;
00458     string      strOne, strTwo;
00459     float       dOne, dTwo, dMin;
00460 
00461     strOne = GetRCOne( iRC );
00462     strTwo = GetReverseComplement( strOne );
00463     dOne = PSTIn.Align( strOne, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOne );
00464     dTwo = PSTIn.Align( strTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iTwo );
00465     dMin = min( dOne, dTwo );
00466     if( ( dMin > dCutoff ) || !( fAllowDuplicates || dMin ) )
00467         return -1;
00468 
00469     pPSTOut = CreatePST( iRet );
00470     if( dOne < dTwo ) {
00471         pPSTOut->Add( strOne, PSTIn, iOne );
00472         pPSTOut->Add( strTwo ); }
00473     else {
00474         pPSTOut->Add( strTwo, PSTIn, iTwo );
00475         pPSTOut->Add( strOne ); }
00476     if( g_CatSleipnir( ).isInfoEnabled( ) ) {
00477         ostringstream   ossm;
00478 
00479         ossm << "CCoalesceMotifLibraryImpl::MergeRCPST( " << GetMotif( iRC ) << ", " << PSTIn.GetMotif( ) <<
00480             ", " << dCutoff << " ) merged at " << min( dOne, dTwo ) << " to " << pPSTOut->GetMotif( );
00481         g_CatSleipnir( ).info( ossm.str( ).c_str( ) ); }
00482     return iRet; }
00483 
00484 float CCoalesceMotifLibraryImpl::AlignPSTs( const CPST& PSTOne, const CPST& PSTTwo, float dCutoff ) const {
00485     int iOffset;
00486 
00487     return PSTOne.Align( PSTTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ); }
00488 
00489 uint32_t CCoalesceMotifLibraryImpl::MergePSTs( const CPST& PSTOne, const CPST& PSTTwo, float dCutoff,
00490     bool fAllowDuplicates ) {
00491     int         iOffset;
00492     uint32_t    iRet;
00493     CPST*       pPSTOut;
00494     float       dScore;
00495 
00496     if( ( ( dScore = PSTOne.Align( PSTTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ) ) >
00497         dCutoff ) || !( fAllowDuplicates || dScore ) )
00498         return -1;
00499 
00500     pPSTOut = CreatePST( iRet );
00501     if( iOffset < 0 ) {
00502         pPSTOut->Add( PSTOne );
00503         pPSTOut->Add( PSTTwo, -iOffset ); }
00504     else {
00505         pPSTOut->Add( PSTTwo );
00506         pPSTOut->Add( PSTOne, iOffset ); }
00507     if( g_CatSleipnir( ).isInfoEnabled( ) ) {
00508         ostringstream   ossm;
00509 
00510         ossm << "CCoalesceMotifLibraryImpl::MergePSTs( " << PSTOne.GetMotif( ) << ", " <<
00511             PSTTwo.GetMotif( ) << ", " << dCutoff << " ) merged at " << dScore << " to " <<
00512             pPSTOut->GetMotif( );
00513         g_CatSleipnir( ).info( ossm.str( ).c_str( ) ); }
00514 
00515     return iRet; }
00516 
00517 uint32_t CCoalesceMotifLibraryImpl::RemoveRCs( const CPST& PST, float dPenaltyGap, float dPenaltyMismatch ) {
00518     CPST*       pPST;
00519     uint32_t    iRet;
00520 
00521     if( !( pPST = CreatePST( iRet ) ) )
00522         return -1;
00523     PST.RemoveRCs( dPenaltyGap, dPenaltyMismatch, *pPST );
00524     return iRet; }
00525 
00553 string CCoalesceMotifLibrary::GetPWM( uint32_t iMotif, float dCutoffPWMs, float dPenaltyGap,
00554     float dPenaltyMismatch, bool fNoRCs ) const {
00555     CFullMatrix<uint16_t>   MatPWM;
00556     size_t                  i, j;
00557     ostringstream           ossm;
00558 
00559     if( !CCoalesceMotifLibraryImpl::GetPWM( iMotif, dCutoffPWMs, dPenaltyGap, dPenaltyMismatch, fNoRCs,
00560         MatPWM ) )
00561         return "";
00562     for( i = 0; i < MatPWM.GetRows( ); ++i ) {
00563         for( j = 0; j < MatPWM.GetColumns( ); ++j )
00564             ossm << ( j ? "\t" : "" ) << MatPWM.Get( i, j );
00565         ossm << endl; }
00566 
00567     return ossm.str( ); }
00568 
00569 bool CCoalesceMotifLibraryImpl::GetPWM( uint32_t iMotif, float dCutoffPWMs, float dPenaltyGap,
00570     float dPenaltyMismatch, bool fNoRCs, CFullMatrix<uint16_t>& MatPWM ) const {
00571     string  strMotif;
00572     float   d;
00573 
00574     if( fNoRCs )
00575         iMotif = ((CCoalesceMotifLibrary*)this)->RemoveRCs( iMotif, dPenaltyGap, dPenaltyMismatch );
00576     switch( GetType( iMotif ) ) {
00577         case ETypeKMer:
00578             if( !CCoalesceMotifLibraryImpl::GetPWM( GetMotif( iMotif ), MatPWM ) )
00579                 return false;
00580             break;
00581 
00582         case ETypeRC:
00583             if( !( CCoalesceMotifLibraryImpl::GetPWM( strMotif = GetRCOne( iMotif ), MatPWM ) &&
00584                 CCoalesceMotifLibraryImpl::GetPWM( GetReverseComplement( strMotif ), MatPWM ) ) )
00585                 return false;
00586             break;
00587 
00588         case ETypePST:
00589             GetPST( iMotif )->GetPWM( MatPWM, c_szBases );
00590             break; }
00591 
00592     if( dCutoffPWMs ) {
00593         d = GetInformation( MatPWM );
00594         if( d < dCutoffPWMs ) {
00595             if( g_CatSleipnir( ).isInfoEnabled( ) ) {
00596                 ostringstream   ossm;
00597 
00598                 ossm << "CCoalesceMotifLibraryImpl::GetPWM( " << iMotif << ", " << dCutoffPWMs << ", " <<
00599                     fNoRCs << " ) rejected (" << d << "):" << endl;
00600                 MatPWM.Save( ossm, false );
00601                 g_CatSleipnir( ).info( ossm.str( ).c_str( ) ); }
00602             return false; }
00603         if( g_CatSleipnir( ).isDebugEnabled( ) ) {
00604             ostringstream   ossm;
00605 
00606             ossm << "CCoalesceMotifLibraryImpl::GetPWM( " << iMotif << ", " << dCutoffPWMs << ", " <<
00607                 fNoRCs << " ) got information (" << d << "):" << endl;
00608             MatPWM.Save( ossm, false );
00609             g_CatSleipnir( ).debug( ossm.str( ).c_str( ) ); } }
00610 
00611     return true; }
00612 
00626 bool CCoalesceMotifLibrary::Simplify( uint32_t iMotif ) const {
00627 
00628     return ( ( GetType( iMotif ) == ETypePST ) ? CCoalesceMotifLibraryImpl::GetPST( iMotif )->Simplify( ) :
00629         false ); }
00630 
00651 bool CCoalesceMotifLibrary::OpenKnown( std::istream& istm ) {
00652     char*           szBuffer;
00653     vector<string>  vecstrLine;
00654 
00655     szBuffer = new char[ CFile::GetBufferSize( ) ];
00656     while( !istm.eof( ) ) {
00657         istm.getline( szBuffer, CFile::GetBufferSize( ) );
00658         if( !szBuffer[ 0 ] )
00659             continue;
00660         vecstrLine.clear( );
00661         CMeta::Tokenize( szBuffer, vecstrLine );
00662         if( vecstrLine.empty( ) )
00663             continue;
00664         if( ( vecstrLine.size( ) - 1 ) % 4 ) {
00665             g_CatSleipnir( ).warn( "CCoalesceMotifLibrary::OpenKnown( ) invalid line: %s" );
00666             continue; }
00667 
00668         m_sKnowns.Add( vecstrLine[ 0 ], vecstrLine ); }
00669     delete[] szBuffer;
00670 
00671     return true; }
00672 
00705 bool CCoalesceMotifLibrary::GetKnown( uint32_t iMotif, SMotifMatch::EType eMatchType, float dPenaltyGap,
00706     float dPenaltyMismatch, std::vector<std::pair<std::string, float> >& vecprstrdKnown, float dPValue ) const {
00707     size_t                          i, j, k, iSum;
00708     vector<float>                   vecdMotif;
00709     CFullMatrix<uint16_t>           MatPWM;
00710     map<string, float>              mapstrdKnown;
00711     map<string, float>::iterator    iterKnown;
00712 
00713     if( !m_sKnowns.GetSize( ) )
00714         return true;
00715     if( !CCoalesceMotifLibraryImpl::GetPWM( iMotif, 0, dPenaltyGap, dPenaltyMismatch, true, MatPWM ) )
00716         return false;
00717     vecdMotif.resize( MatPWM.GetRows( ) * MatPWM.GetColumns( ) );
00718     for( k = i = 0; i < MatPWM.GetColumns( ); ++i ) {
00719         for( iSum = j = 0; j < MatPWM.GetRows( ); ++j )
00720             iSum += MatPWM.Get( j, i );
00721         if( !iSum )
00722             iSum = 1;
00723         for( j = 0; j < MatPWM.GetRows( ); ++j )
00724             vecdMotif[ k++ ] = (float)MatPWM.Get( j, i ) / iSum; }
00725 
00726     m_sKnowns.Match( vecdMotif, eMatchType, mapstrdKnown );
00727     if( eMatchType == SMotifMatch::ETypePValue )
00728         dPValue /= m_sKnowns.GetSize( );
00729     for( iterKnown = mapstrdKnown.begin( ); iterKnown != mapstrdKnown.end( ); ++iterKnown )
00730         if( iterKnown->second < dPValue )
00731             vecprstrdKnown.push_back( *iterKnown );
00732 
00733     return true; }
00734 }