Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "coalescemotifs.h" 00024 #include "coalescestructsi.h" 00025 #include "pst.h" 00026 #include "measure.h" 00027 #include "statistics.h" 00028 00029 namespace Sleipnir { 00030 00031 const char CCoalesceMotifLibraryImpl::c_szBases[] = "ACGT"; 00032 const char CCoalesceMotifLibraryImpl::c_szComplements[] = "TGCA"; 00033 00034 void CCoalesceMotifLibraryImpl::SKnowns::Match( const std::vector<float>& vecdMotif, 00035 SMotifMatch::EType eMatchType, std::map<std::string, float>& mapstrdKnown ) const { 00036 static const size_t c_iOverlap = 5; 00037 map<string, TVecPr>::const_iterator iterKnown; 00038 size_t i, j, k, iMin, iMax, iTests; 00039 float dP, dR, dMin; 00040 const float* adMotif; 00041 const float* adPWM; 00042 const float* adRC; 00043 00044 for( iterKnown = m_mapstrvecKnown.begin( ); iterKnown != m_mapstrvecKnown.end( ); ++iterKnown ) { 00045 dMin = FLT_MAX; 00046 for( iTests = i = 0; i < iterKnown->second.size( ); ++i ) { 00047 const vector<float>& vecdPWM = iterKnown->second[ i ].first; 00048 const vector<float>& vecdRC = iterKnown->second[ i ].second; 00049 00050 iMin = strlen( c_szBases ) * c_iOverlap; 00051 iMax = vecdMotif.size( ) + vecdPWM.size( ) - iMin; 00052 if( iMax < iMin ) 00053 continue; 00054 for( j = iMin; j <= iMax; j += strlen( c_szBases ) ) { 00055 if( j < vecdMotif.size( ) ) { 00056 adMotif = &vecdMotif[ vecdMotif.size( ) - j - 1 ]; 00057 adPWM = &vecdPWM.front( ); 00058 adRC = &vecdRC.front( ); 00059 k = min( vecdPWM.size( ), j ); } 00060 else { 00061 adMotif = &vecdMotif.front( ); 00062 k = j - vecdMotif.size( ); 00063 adPWM = &vecdPWM[ k ]; 00064 adRC = &vecdRC[ k ]; 00065 k = min( vecdMotif.size( ), vecdPWM.size( ) - k ); } 00066 switch( eMatchType ) { 00067 case SMotifMatch::ETypeRMSE: 00068 dP = (float)min( CStatistics::RootMeanSquareError( adMotif, adMotif + k, adPWM, 00069 adPWM + k ), CStatistics::RootMeanSquareError( adMotif, adMotif + k, adRC, 00070 adRC + k ) ); 00071 break; 00072 00073 case SMotifMatch::ETypeJensenShannon: 00074 dP = (float)min( CStatistics::JensenShannonDivergence( adMotif, adMotif + k, adPWM, 00075 adPWM + k ), CStatistics::JensenShannonDivergence( adMotif, adMotif + k, adRC, 00076 adRC + k ) ); 00077 break; 00078 00079 case SMotifMatch::ETypePValue: 00080 iTests += 2; 00081 if( ( dR = (float)max( CMeasurePearson::Pearson( adMotif, k, adPWM, k, 00082 IMeasure::EMapNone ), CMeasurePearson::Pearson( adMotif, k, adRC, k, 00083 IMeasure::EMapNone ) ) ) > 0 ) 00084 dP = (float)CStatistics::PValuePearson( dR, k ); 00085 else 00086 dP = FLT_MAX; 00087 break; } 00088 if( dP < dMin ) 00089 dMin = dP; } } 00090 if( dMin != FLT_MAX ) 00091 mapstrdKnown[ iterKnown->first ] = dMin * max( 1, iTests ); } } 00092 00118 bool CCoalesceMotifLibrary::Open( std::istream& istm, std::vector<SMotifMatch>& vecsMotifs, 00119 CCoalesceMotifLibrary* pMotifs ) { 00120 string strBuffer; 00121 size_t i; 00122 00123 strBuffer.resize( CFile::GetBufferSize( ) ); 00124 while( CFile::IsNewline( istm.peek( ) ) ) 00125 istm.getline( &strBuffer[ 0 ], strBuffer.size( ) ); 00126 while( !istm.eof( ) && ( istm.peek( ) != c_cCluster ) ) { 00127 SMotifMatch sMotif; 00128 00129 if( pMotifs ) { 00130 if( !sMotif.Open( istm, *pMotifs ) ) 00131 break; 00132 vecsMotifs.push_back( sMotif ); 00133 if( isdigit( istm.peek( ) ) ) 00134 for( i = 0; i < 4; ++i ) 00135 istm.getline( &strBuffer[ 0 ], strBuffer.size( ) ); } 00136 else 00137 istm.getline( &strBuffer[ 0 ], strBuffer.size( ) ); 00138 while( CFile::IsNewline( istm.peek( ) ) ) 00139 istm.getline( &strBuffer[ 0 ], strBuffer.size( ) ); } 00140 00141 return true; } 00142 00143 CCoalesceMotifLibraryImpl::~CCoalesceMotifLibraryImpl( ) { 00144 size_t i; 00145 00146 for( i = 0; i < m_vecpPSTs.size( ); ++i ) 00147 delete m_vecpPSTs[ i ]; } 00148 00178 float CCoalesceMotifLibrary::GetMatch( const std::string& strSequence, uint32_t iMotif, size_t iOffset, 00179 SCoalesceModifierCache& sModifiers ) const { 00180 size_t i, iDepth; 00181 float d, dRet; 00182 const CPST* pPST; 00183 string strMotif, strRC; 00184 EType eType; 00185 00186 switch( eType = GetType( iMotif ) ) { 00187 case ETypePST: 00188 if( !( pPST = GetPST( iMotif ) ) ) { 00189 g_CatSleipnir( ).error( "CCoalesceMotifLibrary::GetMatch( %s, %d, %d ) could not find PST", 00190 strSequence.c_str( ), iMotif, iOffset ); 00191 return CMeta::GetNaN( ); } 00192 break; 00193 00194 case ETypeKMer: 00195 strMotif = GetMotif( iMotif ); 00196 break; 00197 00198 case ETypeRC: 00199 strMotif = GetRCOne( iMotif ); 00200 strRC = GetReverseComplement( strMotif ); 00201 break; } 00202 00203 iDepth = ( eType == ETypePST ) ? pPST->GetDepth( ) : GetK( ); 00204 sModifiers.InitializeWeight( 0, 0 ); 00205 dRet = 0; 00206 for( i = 1; i < iDepth; ++i ) { 00207 sModifiers.AddWeight( 0, iOffset, i - 1 ); 00208 if( eType == ETypePST ) 00209 dRet += pPST->GetMatch( strSequence, iDepth - i ) * sModifiers.GetWeight( i ) / i; } 00210 sModifiers.AddWeight( 0, iOffset, i - 1 ); 00211 for( i = 0; i < strSequence.length( ); ++i ) { 00212 switch( eType ) { 00213 case ETypePST: 00214 d = pPST->GetMatch( strSequence.substr( i ) ) * sModifiers.GetWeight( iDepth ) / iDepth; 00215 break; 00216 00217 case ETypeKMer: 00218 case ETypeRC: 00219 d = ( strSequence.compare( i, iDepth, strMotif ) && 00220 ( strRC.empty( ) || strSequence.compare( i, iDepth, strRC ) ) ) ? 0 : 00221 sModifiers.GetWeight( iDepth ); 00222 break; } 00223 dRet += d; 00224 sModifiers.AddWeight( iDepth, iOffset, i ); } 00225 if( CMeta::IsNaN( dRet ) || ( dRet < 0 ) ) { 00226 g_CatSleipnir( ).error( "CCoalesceMotifLibrary::GetMatch( %s, %d, %d ) found negative score: %g", 00227 strSequence.c_str( ), iMotif, iOffset, dRet ); 00228 return CMeta::GetNaN( ); } 00229 00230 return dRet; } 00231 00232 std::string CCoalesceMotifLibraryImpl::GetMotif( uint32_t iMotif ) const { 00233 std::string strKMer; 00234 00235 // kmer 00236 if( iMotif < GetKMers( ) ) 00237 return ID2KMer( iMotif, m_iK ); 00238 // reverse complement 00239 if( iMotif < GetBasePSTs( ) ) { 00240 strKMer = GetRCOne( iMotif ); 00241 return ( strKMer + c_cSeparator + GetReverseComplement( strKMer ) ); } 00242 // pst 00243 return GetPST( iMotif )->GetMotif( ); } 00244 00245 CPST* CCoalesceMotifLibraryImpl::CreatePST( uint32_t& iMotif ) { 00246 CPST* pRet; 00247 00248 iMotif = GetBasePSTs( ) + GetPSTs( ); 00249 m_vecpPSTs.push_back( pRet = new CPST( strlen( c_szBases ) ) ); 00250 return pRet; } 00251 00265 uint32_t CCoalesceMotifLibrary::Open( const std::string& strMotif ) { 00266 uint32_t iMotif; 00267 CPST* pPST; 00268 00269 if( strMotif.length( ) == GetK( ) && !IsIgnorableKMer( strMotif ) ) 00270 return KMer2ID( strMotif ); 00271 if( ( strMotif.length( ) == ( ( 2 * GetK( ) ) + 1 ) ) && 00272 !IsIgnorableKMer( strMotif.substr( 0, GetK( ) ) ) && 00273 ( strMotif.substr( 0, GetK( ) ) == GetReverseComplement( strMotif.substr( GetK( ) + 1 ) ) ) && 00274 ( ( iMotif = KMer2ID( strMotif.substr( 0, GetK( ) ) ) ) != -1 ) ) 00275 return ( GetBaseRCs( ) + m_veciKMer2RC[ iMotif ] ); 00276 00277 pPST = CreatePST( iMotif ); 00278 if( !pPST->Open( strMotif ) ) { 00279 delete pPST; 00280 m_vecpPSTs.pop_back( ); 00281 return -1; } 00282 00283 return iMotif; } 00284 00285 float CCoalesceMotifLibraryImpl::AlignKMers( const std::string& strOne, const std::string& strTwo, 00286 float dCutoff ) const { 00287 int iOffset; 00288 00289 return Align( strOne, strTwo, dCutoff, iOffset ); } 00290 00291 uint32_t CCoalesceMotifLibraryImpl::MergeKMers( const std::string& strOne, const std::string& strTwo, 00292 float dCutoff, bool fAllowDuplicates ) { 00293 int iOffset; 00294 float dScore; 00295 uint32_t iRet; 00296 CPST* pPST; 00297 00298 if( ( ( dScore = Align( strOne, strTwo, dCutoff, iOffset ) ) > dCutoff ) || 00299 !( fAllowDuplicates || dScore ) ) 00300 return -1; 00301 00302 pPST = CreatePST( iRet ); 00303 pPST->Add( strOne, strTwo, iOffset ); 00304 if( g_CatSleipnir( ).isInfoEnabled( ) ) 00305 g_CatSleipnir( ).info( "CCoalesceMotifLibraryImpl::MergeKMers( %s, %s, %g ) merged at %g to %s", 00306 strOne.c_str( ), strTwo.c_str( ), dCutoff, dScore, pPST->GetMotif( ).c_str( ) ); 00307 return iRet; } 00308 00309 float CCoalesceMotifLibraryImpl::AlignKMerRC( const std::string& strKMer, uint32_t iRC, float dCutoff ) const { 00310 string strOne, strTwo; 00311 float dOne, dTwo; 00312 int iOne, iTwo; 00313 00314 strOne = GetRCOne( iRC ); 00315 strTwo = GetReverseComplement( strOne ); 00316 dOne = Align( strKMer, strOne, dCutoff, iOne ); 00317 dTwo = Align( strKMer, strTwo, dCutoff, iTwo ); 00318 00319 return min( dOne, dTwo ); } 00320 00321 uint32_t CCoalesceMotifLibraryImpl::MergeKMerRC( uint32_t iKMer, uint32_t iRC, float dCutoff, 00322 bool fAllowDuplicates ) { 00323 string strKMer, strOne, strTwo; 00324 float dOne, dTwo, dMin; 00325 int iOne, iTwo; 00326 uint32_t iRet; 00327 CPST* pPST; 00328 00329 if( m_veciKMer2RC[ iKMer ] == iRC ) 00330 return -1; 00331 strKMer = GetMotif( iKMer ); 00332 strOne = GetRCOne( iRC ); 00333 strTwo = GetReverseComplement( strOne ); 00334 dOne = Align( strKMer, strOne, dCutoff, iOne ); 00335 dTwo = Align( strKMer, strTwo, dCutoff, iTwo ); 00336 dMin = min( dOne, dTwo ); 00337 if( ( dMin > dCutoff ) || !( fAllowDuplicates || dMin ) ) 00338 return -1; 00339 00340 pPST = CreatePST( iRet ); 00341 if( dOne < dTwo ) { 00342 pPST->Add( strKMer, strOne, iOne ); 00343 pPST->Add( strTwo ); } 00344 else { 00345 pPST->Add( strKMer, strTwo, iTwo ); 00346 pPST->Add( strOne ); } 00347 if( g_CatSleipnir( ).isInfoEnabled( ) ) 00348 g_CatSleipnir( ).info( "CCoalesceMotifLibraryImpl::MergeKMerRC( %s, %s, %g ) merged at %g to %s", 00349 strKMer.c_str( ), GetMotif( iRC ).c_str( ), dCutoff, min( dOne, dTwo ), 00350 pPST->GetMotif( ).c_str( ) ); 00351 return iRet; } 00352 00353 struct SCrossRCs { 00354 string m_strOne; 00355 string m_strTwo; 00356 float m_dScore; 00357 int m_iOffset; 00358 }; 00359 00360 float CCoalesceMotifLibraryImpl::AlignRCs( uint32_t iOne, uint32_t iTwo, float dCutoff ) const { 00361 SCrossRCs asCrosses[ 4 ]; 00362 size_t i; 00363 float dMin; 00364 00365 asCrosses[ 0 ].m_strOne = asCrosses[ 1 ].m_strOne = GetRCOne( iOne ); 00366 asCrosses[ 0 ].m_strTwo = asCrosses[ 2 ].m_strTwo = GetRCOne( iTwo ); 00367 asCrosses[ 1 ].m_strTwo = asCrosses[ 3 ].m_strTwo = GetReverseComplement( asCrosses[ 0 ].m_strTwo ); 00368 asCrosses[ 2 ].m_strOne = asCrosses[ 3 ].m_strOne = GetReverseComplement( asCrosses[ 0 ].m_strOne ); 00369 dMin = FLT_MAX; 00370 for( i = 0; i < ARRAYSIZE(asCrosses); ++i ) { 00371 asCrosses[ i ].m_dScore = Align( asCrosses[ i ].m_strOne, asCrosses[ i ].m_strTwo, dCutoff, 00372 asCrosses[ i ].m_iOffset ); 00373 if( asCrosses[ i ].m_dScore < dMin ) 00374 dMin = asCrosses[ i ].m_dScore; } 00375 00376 return dMin; } 00377 00378 uint32_t CCoalesceMotifLibraryImpl::MergeRCs( uint32_t iOne, uint32_t iTwo, float dCutoff, 00379 bool fAllowDuplicates ) { 00380 SCrossRCs asCrosses[ 4 ]; 00381 uint32_t iRet; 00382 CPST* pPST; 00383 size_t i, iMin; 00384 float dMin; 00385 00386 asCrosses[ 0 ].m_strOne = asCrosses[ 1 ].m_strOne = GetRCOne( iOne ); 00387 asCrosses[ 0 ].m_strTwo = asCrosses[ 2 ].m_strTwo = GetRCOne( iTwo ); 00388 asCrosses[ 1 ].m_strTwo = asCrosses[ 3 ].m_strTwo = GetReverseComplement( asCrosses[ 0 ].m_strTwo ); 00389 asCrosses[ 2 ].m_strOne = asCrosses[ 3 ].m_strOne = GetReverseComplement( asCrosses[ 0 ].m_strOne ); 00390 dMin = FLT_MAX; 00391 for( iMin = i = 0; i < ARRAYSIZE(asCrosses); ++i ) { 00392 asCrosses[ i ].m_dScore = Align( asCrosses[ i ].m_strOne, asCrosses[ i ].m_strTwo, dCutoff, 00393 asCrosses[ i ].m_iOffset ); 00394 if( asCrosses[ i ].m_dScore < dMin ) { 00395 dMin = asCrosses[ i ].m_dScore; 00396 iMin = i; } } 00397 if( ( dMin > dCutoff ) || !( fAllowDuplicates || dMin ) ) 00398 return -1; 00399 00400 pPST = CreatePST( iRet ); 00401 pPST->Add( asCrosses[ iMin ].m_strOne, asCrosses[ iMin ].m_strTwo, asCrosses[ iMin ].m_iOffset ); 00402 { 00403 CPST PST( strlen( c_szBases ) ); 00404 00405 PST.Add( asCrosses[ ( iMin + 1 ) % ARRAYSIZE(asCrosses) ].m_strTwo, 00406 asCrosses[ ( iMin + 2 ) % ARRAYSIZE(asCrosses) ].m_strOne, asCrosses[ iMin ].m_iOffset ); 00407 pPST->Add( PST ); 00408 } 00409 if( g_CatSleipnir( ).isInfoEnabled( ) ) 00410 g_CatSleipnir( ).info( "CCoalesceMotifLibraryImpl::MergeRCs( %s, %s, %g ) merged at %g to %s", 00411 GetMotif( iOne ).c_str( ), GetMotif( iTwo ).c_str( ), dCutoff, dMin, pPST->GetMotif( ).c_str( ) ); 00412 return iRet; } 00413 00414 float CCoalesceMotifLibraryImpl::AlignKMerPST( const std::string& strKMer, const CPST& PSTIn, 00415 float dCutoff ) const { 00416 int iOffset; 00417 00418 return PSTIn.Align( strKMer, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ); } 00419 00420 uint32_t CCoalesceMotifLibraryImpl::MergeKMerPST( const std::string& strKMer, const CPST& PSTIn, 00421 float dCutoff, bool fAllowDuplicates ) { 00422 int iOffset; 00423 float dScore; 00424 uint32_t iRet; 00425 CPST* pPSTOut; 00426 00427 if( ( ( dScore = PSTIn.Align( strKMer, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ) ) > 00428 dCutoff ) || !( fAllowDuplicates || dScore ) ) 00429 return -1; 00430 00431 pPSTOut = CreatePST( iRet ); 00432 pPSTOut->Add( strKMer, PSTIn, iOffset ); 00433 if( g_CatSleipnir( ).isInfoEnabled( ) ) { 00434 ostringstream ossm; 00435 00436 ossm << "CCoalesceMotifLibraryImpl::MergeKMerPST( " << strKMer << ", " << PSTIn.GetMotif( ) << 00437 ", " << dCutoff << " ) merged at " << dScore << " to " << pPSTOut->GetMotif( ); 00438 g_CatSleipnir( ).info( ossm.str( ).c_str( ) ); } 00439 return iRet; } 00440 00441 float CCoalesceMotifLibraryImpl::AlignRCPST( uint32_t iRC, const CPST& PSTIn, float dCutoff ) const { 00442 int iOne, iTwo; 00443 string strOne, strTwo; 00444 float dOne, dTwo; 00445 00446 strOne = GetRCOne( iRC ); 00447 strTwo = GetReverseComplement( strOne ); 00448 dOne = PSTIn.Align( strOne, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOne ); 00449 dTwo = PSTIn.Align( strTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iTwo ); 00450 00451 return min( dOne, dTwo ); } 00452 00453 uint32_t CCoalesceMotifLibraryImpl::MergeRCPST( uint32_t iRC, const CPST& PSTIn, float dCutoff, 00454 bool fAllowDuplicates ) { 00455 int iOne, iTwo; 00456 uint32_t iRet; 00457 CPST* pPSTOut; 00458 string strOne, strTwo; 00459 float dOne, dTwo, dMin; 00460 00461 strOne = GetRCOne( iRC ); 00462 strTwo = GetReverseComplement( strOne ); 00463 dOne = PSTIn.Align( strOne, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOne ); 00464 dTwo = PSTIn.Align( strTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iTwo ); 00465 dMin = min( dOne, dTwo ); 00466 if( ( dMin > dCutoff ) || !( fAllowDuplicates || dMin ) ) 00467 return -1; 00468 00469 pPSTOut = CreatePST( iRet ); 00470 if( dOne < dTwo ) { 00471 pPSTOut->Add( strOne, PSTIn, iOne ); 00472 pPSTOut->Add( strTwo ); } 00473 else { 00474 pPSTOut->Add( strTwo, PSTIn, iTwo ); 00475 pPSTOut->Add( strOne ); } 00476 if( g_CatSleipnir( ).isInfoEnabled( ) ) { 00477 ostringstream ossm; 00478 00479 ossm << "CCoalesceMotifLibraryImpl::MergeRCPST( " << GetMotif( iRC ) << ", " << PSTIn.GetMotif( ) << 00480 ", " << dCutoff << " ) merged at " << min( dOne, dTwo ) << " to " << pPSTOut->GetMotif( ); 00481 g_CatSleipnir( ).info( ossm.str( ).c_str( ) ); } 00482 return iRet; } 00483 00484 float CCoalesceMotifLibraryImpl::AlignPSTs( const CPST& PSTOne, const CPST& PSTTwo, float dCutoff ) const { 00485 int iOffset; 00486 00487 return PSTOne.Align( PSTTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ); } 00488 00489 uint32_t CCoalesceMotifLibraryImpl::MergePSTs( const CPST& PSTOne, const CPST& PSTTwo, float dCutoff, 00490 bool fAllowDuplicates ) { 00491 int iOffset; 00492 uint32_t iRet; 00493 CPST* pPSTOut; 00494 float dScore; 00495 00496 if( ( ( dScore = PSTOne.Align( PSTTwo, m_dPenaltyGap, m_dPenaltyMismatch, dCutoff, iOffset ) ) > 00497 dCutoff ) || !( fAllowDuplicates || dScore ) ) 00498 return -1; 00499 00500 pPSTOut = CreatePST( iRet ); 00501 if( iOffset < 0 ) { 00502 pPSTOut->Add( PSTOne ); 00503 pPSTOut->Add( PSTTwo, -iOffset ); } 00504 else { 00505 pPSTOut->Add( PSTTwo ); 00506 pPSTOut->Add( PSTOne, iOffset ); } 00507 if( g_CatSleipnir( ).isInfoEnabled( ) ) { 00508 ostringstream ossm; 00509 00510 ossm << "CCoalesceMotifLibraryImpl::MergePSTs( " << PSTOne.GetMotif( ) << ", " << 00511 PSTTwo.GetMotif( ) << ", " << dCutoff << " ) merged at " << dScore << " to " << 00512 pPSTOut->GetMotif( ); 00513 g_CatSleipnir( ).info( ossm.str( ).c_str( ) ); } 00514 00515 return iRet; } 00516 00517 uint32_t CCoalesceMotifLibraryImpl::RemoveRCs( const CPST& PST, float dPenaltyGap, float dPenaltyMismatch ) { 00518 CPST* pPST; 00519 uint32_t iRet; 00520 00521 if( !( pPST = CreatePST( iRet ) ) ) 00522 return -1; 00523 PST.RemoveRCs( dPenaltyGap, dPenaltyMismatch, *pPST ); 00524 return iRet; } 00525 00553 string CCoalesceMotifLibrary::GetPWM( uint32_t iMotif, float dCutoffPWMs, float dPenaltyGap, 00554 float dPenaltyMismatch, bool fNoRCs ) const { 00555 CFullMatrix<uint16_t> MatPWM; 00556 size_t i, j; 00557 ostringstream ossm; 00558 00559 if( !CCoalesceMotifLibraryImpl::GetPWM( iMotif, dCutoffPWMs, dPenaltyGap, dPenaltyMismatch, fNoRCs, 00560 MatPWM ) ) 00561 return ""; 00562 for( i = 0; i < MatPWM.GetRows( ); ++i ) { 00563 for( j = 0; j < MatPWM.GetColumns( ); ++j ) 00564 ossm << ( j ? "\t" : "" ) << MatPWM.Get( i, j ); 00565 ossm << endl; } 00566 00567 return ossm.str( ); } 00568 00569 bool CCoalesceMotifLibraryImpl::GetPWM( uint32_t iMotif, float dCutoffPWMs, float dPenaltyGap, 00570 float dPenaltyMismatch, bool fNoRCs, CFullMatrix<uint16_t>& MatPWM ) const { 00571 string strMotif; 00572 float d; 00573 00574 if( fNoRCs ) 00575 iMotif = ((CCoalesceMotifLibrary*)this)->RemoveRCs( iMotif, dPenaltyGap, dPenaltyMismatch ); 00576 switch( GetType( iMotif ) ) { 00577 case ETypeKMer: 00578 if( !CCoalesceMotifLibraryImpl::GetPWM( GetMotif( iMotif ), MatPWM ) ) 00579 return false; 00580 break; 00581 00582 case ETypeRC: 00583 if( !( CCoalesceMotifLibraryImpl::GetPWM( strMotif = GetRCOne( iMotif ), MatPWM ) && 00584 CCoalesceMotifLibraryImpl::GetPWM( GetReverseComplement( strMotif ), MatPWM ) ) ) 00585 return false; 00586 break; 00587 00588 case ETypePST: 00589 GetPST( iMotif )->GetPWM( MatPWM, c_szBases ); 00590 break; } 00591 00592 if( dCutoffPWMs ) { 00593 d = GetInformation( MatPWM ); 00594 if( d < dCutoffPWMs ) { 00595 if( g_CatSleipnir( ).isInfoEnabled( ) ) { 00596 ostringstream ossm; 00597 00598 ossm << "CCoalesceMotifLibraryImpl::GetPWM( " << iMotif << ", " << dCutoffPWMs << ", " << 00599 fNoRCs << " ) rejected (" << d << "):" << endl; 00600 MatPWM.Save( ossm, false ); 00601 g_CatSleipnir( ).info( ossm.str( ).c_str( ) ); } 00602 return false; } 00603 if( g_CatSleipnir( ).isDebugEnabled( ) ) { 00604 ostringstream ossm; 00605 00606 ossm << "CCoalesceMotifLibraryImpl::GetPWM( " << iMotif << ", " << dCutoffPWMs << ", " << 00607 fNoRCs << " ) got information (" << d << "):" << endl; 00608 MatPWM.Save( ossm, false ); 00609 g_CatSleipnir( ).debug( ossm.str( ).c_str( ) ); } } 00610 00611 return true; } 00612 00626 bool CCoalesceMotifLibrary::Simplify( uint32_t iMotif ) const { 00627 00628 return ( ( GetType( iMotif ) == ETypePST ) ? CCoalesceMotifLibraryImpl::GetPST( iMotif )->Simplify( ) : 00629 false ); } 00630 00651 bool CCoalesceMotifLibrary::OpenKnown( std::istream& istm ) { 00652 char* szBuffer; 00653 vector<string> vecstrLine; 00654 00655 szBuffer = new char[ CFile::GetBufferSize( ) ]; 00656 while( !istm.eof( ) ) { 00657 istm.getline( szBuffer, CFile::GetBufferSize( ) ); 00658 if( !szBuffer[ 0 ] ) 00659 continue; 00660 vecstrLine.clear( ); 00661 CMeta::Tokenize( szBuffer, vecstrLine ); 00662 if( vecstrLine.empty( ) ) 00663 continue; 00664 if( ( vecstrLine.size( ) - 1 ) % 4 ) { 00665 g_CatSleipnir( ).warn( "CCoalesceMotifLibrary::OpenKnown( ) invalid line: %s" ); 00666 continue; } 00667 00668 m_sKnowns.Add( vecstrLine[ 0 ], vecstrLine ); } 00669 delete[] szBuffer; 00670 00671 return true; } 00672 00705 bool CCoalesceMotifLibrary::GetKnown( uint32_t iMotif, SMotifMatch::EType eMatchType, float dPenaltyGap, 00706 float dPenaltyMismatch, std::vector<std::pair<std::string, float> >& vecprstrdKnown, float dPValue ) const { 00707 size_t i, j, k, iSum; 00708 vector<float> vecdMotif; 00709 CFullMatrix<uint16_t> MatPWM; 00710 map<string, float> mapstrdKnown; 00711 map<string, float>::iterator iterKnown; 00712 00713 if( !m_sKnowns.GetSize( ) ) 00714 return true; 00715 if( !CCoalesceMotifLibraryImpl::GetPWM( iMotif, 0, dPenaltyGap, dPenaltyMismatch, true, MatPWM ) ) 00716 return false; 00717 vecdMotif.resize( MatPWM.GetRows( ) * MatPWM.GetColumns( ) ); 00718 for( k = i = 0; i < MatPWM.GetColumns( ); ++i ) { 00719 for( iSum = j = 0; j < MatPWM.GetRows( ); ++j ) 00720 iSum += MatPWM.Get( j, i ); 00721 if( !iSum ) 00722 iSum = 1; 00723 for( j = 0; j < MatPWM.GetRows( ); ++j ) 00724 vecdMotif[ k++ ] = (float)MatPWM.Get( j, i ) / iSum; } 00725 00726 m_sKnowns.Match( vecdMotif, eMatchType, mapstrdKnown ); 00727 if( eMatchType == SMotifMatch::ETypePValue ) 00728 dPValue /= m_sKnowns.GetSize( ); 00729 for( iterKnown = mapstrdKnown.begin( ); iterKnown != mapstrdKnown.end( ); ++iterKnown ) 00730 if( iterKnown->second < dPValue ) 00731 vecprstrdKnown.push_back( *iterKnown ); 00732 00733 return true; } 00734 }