Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #ifndef PSTI_H 00023 #define PSTI_H 00024 00025 #include <sstream> 00026 #include <stack> 00027 #include <string> 00028 #include <vector> 00029 00030 #include "fullmatrix.h" 00031 00032 namespace Sleipnir { 00033 00034 class CPSTImpl { 00035 protected: 00036 struct SNode { 00037 SNode( ) : m_iTotal(0), m_iCount(0) { } 00038 00039 SNode( unsigned char cCharacter, uint16_t iCount = 1 ) : m_cCharacter(cCharacter), m_iCount(iCount), 00040 m_iTotal(0) { } 00041 00042 unsigned char m_cCharacter; 00043 uint16_t m_iCount; 00044 uint16_t m_iTotal; 00045 std::vector<SNode> m_vecsChildren; 00046 }; 00047 00048 static void RemoveRCs( const SNode& sNode, float dCutoff, std::string& strSeq, 00049 std::vector<std::string>& vecstrOut ) { 00050 size_t i; 00051 00052 if( sNode.m_vecsChildren.empty( ) ) { 00053 vecstrOut.push_back( strSeq ); 00054 return; } 00055 00056 for( i = 0; i < sNode.m_vecsChildren.size( ); ++i ) { 00057 const SNode& sChild = sNode.m_vecsChildren[ i ]; 00058 00059 if( ( (float)sChild.m_iCount / sNode.m_iTotal ) < dCutoff ) 00060 continue; 00061 strSeq.push_back( sChild.m_cCharacter ); 00062 RemoveRCs( sChild, dCutoff, strSeq, vecstrOut ); 00063 strSeq.resize( strSeq.size( ) - 1 ); } } 00064 00065 static std::string GetMotif( const SNode& sNode ) { 00066 std::ostringstream ossm; 00067 size_t i; 00068 00069 switch( sNode.m_vecsChildren.size( ) ) { 00070 case 0: 00071 break; 00072 00073 case 1: 00074 ossm << sNode.m_vecsChildren[ 0 ].m_cCharacter << ':' << sNode.m_vecsChildren[ 0 ].m_iCount; 00075 if( !sNode.m_vecsChildren[ 0 ].m_vecsChildren.empty( ) ) 00076 ossm << ' '; 00077 ossm << GetMotif( sNode.m_vecsChildren[ 0 ] ); 00078 break; 00079 00080 default: 00081 for( i = 0; i < sNode.m_vecsChildren.size( ); ++i ) { 00082 const SNode& sChild = sNode.m_vecsChildren[ i ]; 00083 00084 ossm << ( i ? "|" : "" ) << '(' << sChild.m_cCharacter << ':' << sChild.m_iCount; 00085 if( !sChild.m_vecsChildren.empty( ) ) 00086 ossm << ' '; 00087 ossm << GetMotif( sChild ) << ')'; } } 00088 00089 return ossm.str( ); } 00090 00091 static size_t Add( unsigned char cCharacter, SNode& sNode, uint16_t iCount = 1 ) { 00092 size_t i; 00093 00094 for( i = 0; i < sNode.m_vecsChildren.size( ); ++i ) 00095 if( sNode.m_vecsChildren[ i ].m_cCharacter == cCharacter ) 00096 break; 00097 if( i < sNode.m_vecsChildren.size( ) ) 00098 sNode.m_vecsChildren[ i ].m_iCount += iCount; 00099 else 00100 sNode.m_vecsChildren.push_back( SNode( cCharacter, iCount ) ); 00101 sNode.m_iTotal += iCount; 00102 00103 return i; } 00104 00105 static size_t Add( const std::string& str, SNode& sNode ) { 00106 00107 return ( str.empty( ) ? 0 : 00108 ( 1 + Add( str.substr( 1 ), sNode.m_vecsChildren[ Add( str[ 0 ], sNode ) ] ) ) ); } 00109 00110 static size_t Add( const SNode& sIn, SNode& sOut ) { 00111 size_t i, j, iCur, iRet; 00112 00113 sOut.m_iCount += sIn.m_iCount; 00114 sOut.m_iTotal += sIn.m_iTotal; 00115 for( iRet = i = 0; i < sIn.m_vecsChildren.size( ); ++i ) { 00116 const SNode& sChildIn = sIn.m_vecsChildren[ i ]; 00117 00118 for( j = 0; j < sOut.m_vecsChildren.size( ); ++j ) 00119 if( sChildIn.m_cCharacter == sOut.m_vecsChildren[ j ].m_cCharacter ) 00120 break; 00121 if( j >= sOut.m_vecsChildren.size( ) ) { 00122 sOut.m_vecsChildren.push_back( SNode( sChildIn.m_cCharacter ) ); 00123 sOut.m_vecsChildren.back( ).m_iCount = 0; } 00124 if( ( iCur = ( 1 + Add( sChildIn, sOut.m_vecsChildren[ j ] ) ) ) > iRet ) 00125 iRet = iCur; } 00126 00127 return iRet; } 00128 00129 template<class tType> 00130 static size_t Add( const tType& Addition, size_t iOffset, SNode& sNode ) { 00131 size_t i, iRet, iCur; 00132 00133 if( !iOffset ) 00134 return Add( Addition, sNode ); 00135 00136 iOffset--; 00137 iRet = 0; 00138 for( i = 0; i < sNode.m_vecsChildren.size( ); ++i ) { 00139 iCur = ( 1 + Add( Addition, iOffset, sNode.m_vecsChildren[ i ] ) ); 00140 if( iCur > iRet ) 00141 iRet = iCur; } 00142 00143 return iRet; } 00144 00145 static size_t Open( const std::string& strPST, SNode& sNode ) { 00146 std::stack<SNode*> stckpsBranch; 00147 std::stack<size_t> stckiDepth; 00148 SNode* psNode; 00149 size_t i, iCur, iMax; 00150 char c; 00151 uint16_t iCount; 00152 00153 psNode = &sNode; 00154 iCount = 1; 00155 for( iCur = iMax = i = 0; i < strPST.size( ); ++i ) 00156 switch( c = strPST[ i ] ) { 00157 case '(': 00158 stckpsBranch.push( psNode ); 00159 stckiDepth.push( iCur ); 00160 break; 00161 00162 case ')': 00163 psNode = stckpsBranch.top( ); 00164 stckpsBranch.pop( ); 00165 iCur = stckiDepth.top( ); 00166 stckiDepth.pop( ); 00167 break; 00168 00169 case '|': 00170 case ' ': 00171 break; 00172 00173 default: 00174 if( ( ( i + 2 ) >= strPST.length( ) ) || ( strPST[ i + 1 ] != ':' ) ) 00175 return -1; 00176 iCount = atoi( strPST.c_str( ) + ++i + 1 ); 00177 while( isdigit( strPST[ ++i ] ) ); 00178 --i; 00179 psNode = &psNode->m_vecsChildren[ Add( c, *psNode, iCount ) ]; 00180 if( ++iCur > iMax ) 00181 iMax = iCur; } 00182 00183 return iMax; } 00184 00185 template<class tType> 00186 static float Align( const SNode& sRoot, size_t iDepth, const tType& Data, size_t iLength, 00187 float dPenaltyGap, float dPenaltyMismatch, float dCutoff, int& iOffset ) { 00188 size_t iBegin, iEnd, iCur, iMin, iGaps; 00189 float dRet, dCur; 00190 00191 dRet = FLT_MAX; 00192 dCur = dCutoff / dPenaltyGap; 00193 iBegin = ( dCur > iLength ) ? 0 : (size_t)ceil( iLength - dCur ); 00194 iEnd = iLength + iDepth - iBegin; 00195 for( iMin = 0,iCur = iBegin; iCur < iEnd; ++iCur ) { 00196 iGaps = 0; 00197 if( iCur < iLength ) 00198 iGaps += iLength - iCur; 00199 if( iCur > iDepth ) 00200 iGaps += iCur - iDepth; 00201 dCur = dPenaltyGap * iGaps; 00202 dCur += dPenaltyMismatch * CPSTImpl::Align( Data, iLength, iCur, sRoot ); 00203 if( dCur < dRet ) { 00204 dRet = dCur; 00205 iMin = iCur; } } 00206 if( dRet == FLT_MAX ) 00207 dRet = ( max( iLength, iDepth ) - min( iLength, iDepth ) ) * dPenaltyGap; 00208 00209 iOffset = (int)iLength - (int)iMin; 00210 return dRet; } 00211 00212 static size_t Align( const std::string& strSequence, size_t, size_t iOffset, const SNode& sNode ) { 00213 size_t i, iCur, iRet; 00214 00215 if( strSequence.empty( ) || !iOffset ) 00216 return 0; 00217 00218 iRet = strSequence.length( ); 00219 if( iOffset > iRet ) { 00220 iOffset--; 00221 for( i = 0; i < sNode.m_vecsChildren.size( ); ++i ) 00222 if( ( iCur = Align( strSequence, 0, iOffset, sNode.m_vecsChildren[ i ] ) ) < iRet ) 00223 iRet = iCur; 00224 return iRet; } 00225 00226 for( i = 0; i < sNode.m_vecsChildren.size( ); ++i ) { 00227 const SNode& sChild = sNode.m_vecsChildren[ i ]; 00228 00229 iCur = ( ( sChild.m_cCharacter == strSequence[ strSequence.length( ) - iOffset ] ) ? 0 : 1 ) + 00230 Align( strSequence.substr( strSequence.length( ) - iOffset + 1 ), 0, iOffset - 1, sChild ); 00231 if( iCur < iRet ) 00232 iRet = iCur; } 00233 00234 return iRet; } 00235 00236 static size_t Align( const SNode& sThem, size_t iDepth, size_t iOffset, const SNode& sUs ) { 00237 size_t iBest, iRet; 00238 00239 Align( sThem, iDepth, iOffset, sUs, iBest = -1, iRet = 0 ); 00240 // assert( iBest == iRet ); 00241 return iRet; } 00242 00243 static void Align( const SNode& sThem, size_t iDepth, size_t iOffset, const SNode& sUs, size_t& iBest, 00244 size_t& iOut ) { 00245 size_t i, j, iCur; 00246 00247 if( sThem.m_vecsChildren.empty( ) || sUs.m_vecsChildren.empty( ) ) { 00248 if( iOut < iBest ) 00249 iBest = iOut; 00250 return; } 00251 00252 if( iOffset > iDepth ) { 00253 iOffset--; 00254 for( i = 0; i < sUs.m_vecsChildren.size( ); ++i ) 00255 Align( sThem, iDepth, iOffset, sUs.m_vecsChildren[ i ], iBest, iOut ); 00256 return; } 00257 if( iOffset < iDepth ) { 00258 iDepth--; 00259 for( i = 0; i < sThem.m_vecsChildren.size( ); ++i ) 00260 Align( sThem.m_vecsChildren[ i ], iDepth, iOffset, sUs, iBest, iOut ); 00261 return; } 00262 00263 iDepth--; 00264 iOffset--; 00265 for( i = 0; i < sUs.m_vecsChildren.size( ); ++i ) { 00266 const SNode& sChildUs = sUs.m_vecsChildren[ i ]; 00267 00268 for( j = 0; j < sThem.m_vecsChildren.size( ); ++j ) { 00269 const SNode& sChildThem = sThem.m_vecsChildren[ j ]; 00270 00271 if( ( iCur = ( iOut + ( ( sChildUs.m_cCharacter == sChildThem.m_cCharacter ) ? 0 : 1 ) ) ) >= 00272 iBest ) 00273 continue; 00274 Align( sChildThem, iDepth, iOffset, sChildUs, iBest, iCur ); } } 00275 iOut = iBest; } 00276 00277 00278 static void Integrate( const SNode& sNode, size_t& iSum ) { 00279 size_t i; 00280 00281 iSum += sNode.m_iTotal; 00282 for( i = 0; i < sNode.m_vecsChildren.size( ); ++i ) 00283 Integrate( sNode.m_vecsChildren[ i ], iSum ); } 00284 00285 static bool Simplify( float dMinFrequency, SNode& sNode ) { 00286 size_t i; 00287 uint16_t iTotal; 00288 00289 iTotal = sNode.m_iTotal; 00290 sNode.m_iTotal = 0; 00291 for( i = 0; i < sNode.m_vecsChildren.size( ); ++i ) { 00292 SNode& sChild = sNode.m_vecsChildren[ i ]; 00293 00294 if( ( (float)sChild.m_iCount / iTotal ) < dMinFrequency ) 00295 sNode.m_vecsChildren.erase( sNode.m_vecsChildren.begin( ) + i-- ); 00296 else { 00297 if( !Simplify( dMinFrequency, sChild ) ) 00298 return false; 00299 sNode.m_iTotal += ( sChild.m_iCount = max( (uint16_t)1, (uint16_t)( sChild.m_iCount * 00300 dMinFrequency ) ) ); } } 00301 00302 return true; } 00303 00304 CPSTImpl( size_t iArity ) : m_iDepth(0), m_iArity(iArity) { } 00305 00306 void Clear( ) { 00307 00308 m_iDepth = m_sRoot.m_iTotal = m_sRoot.m_iCount = 0; 00309 m_sRoot.m_vecsChildren.clear( ); } 00310 00311 float GetMatch( const std::string& strTarget, const SNode& sNode, size_t iOffset, 00312 size_t& iMatched ) const { 00313 size_t i, iCur, iMax; 00314 float dRet, dCur; 00315 00316 if( strTarget.empty( ) || sNode.m_vecsChildren.empty( ) ) 00317 return 1; 00318 00319 if( iOffset ) { 00320 iOffset--; 00321 dRet = 0; 00322 for( iMax = i = 0; i < sNode.m_vecsChildren.size( ); ++i ) { 00323 iCur = 0; 00324 dCur = GetMatch( strTarget, sNode.m_vecsChildren[ i ], iOffset, iCur ); 00325 dCur *= (float)sNode.m_vecsChildren[ i ].m_iCount / sNode.m_iTotal; 00326 if( !iMax || ( ( dCur / iCur ) > ( dRet / iMax ) ) ) { 00327 dRet = dCur; 00328 iMax = iCur; } } 00329 iMatched = iMax; 00330 return dRet; } 00331 00332 for( i = 0; i < sNode.m_vecsChildren.size( ); ++i ) { 00333 const SNode& sChild = sNode.m_vecsChildren[ i ]; 00334 00335 if( strTarget[ 0 ] == sChild.m_cCharacter ) { 00336 iMatched++; 00337 return ( sChild.m_iCount * GetMatch( strTarget.substr( 1 ), sChild, 0, iMatched ) / 00338 sNode.m_iTotal ); } } 00339 00340 return 1; } 00341 00342 bool GetPWM( const SNode& sNode, size_t iDepth, std::map<unsigned char, size_t>& mapciCharacters, 00343 CFullMatrix<uint16_t>& MatPWM ) const { 00344 size_t i, iChar; 00345 std::map<unsigned char, size_t>::const_iterator iterChar; 00346 00347 for( i = 0; i < sNode.m_vecsChildren.size( ); ++i ) { 00348 const SNode& sChild = sNode.m_vecsChildren[ i ]; 00349 00350 if( ( iterChar = mapciCharacters.find( sChild.m_cCharacter ) ) == mapciCharacters.end( ) ) { 00351 iChar = mapciCharacters.size( ); 00352 mapciCharacters[ sChild.m_cCharacter ] = iChar; } 00353 else 00354 iChar = iterChar->second; 00355 MatPWM.Get( iChar, iDepth ) += sChild.m_iCount; 00356 if( !GetPWM( sChild, iDepth + 1, mapciCharacters, MatPWM ) ) 00357 return false; } 00358 00359 return true; } 00360 00361 SNode m_sRoot; 00362 size_t m_iDepth; 00363 size_t m_iArity; 00364 }; 00365 00366 } 00367 00368 #endif // PSTI_H