Sleipnir
src/psti.h
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #ifndef PSTI_H
00023 #define PSTI_H
00024 
00025 #include <sstream>
00026 #include <stack>
00027 #include <string>
00028 #include <vector>
00029 
00030 #include "fullmatrix.h"
00031 
00032 namespace Sleipnir {
00033 
00034 class CPSTImpl {
00035 protected:
00036     struct SNode {
00037         SNode( ) : m_iTotal(0), m_iCount(0) { }
00038 
00039         SNode( unsigned char cCharacter, uint16_t iCount = 1 ) : m_cCharacter(cCharacter), m_iCount(iCount),
00040             m_iTotal(0) { }
00041 
00042         unsigned char       m_cCharacter;
00043         uint16_t            m_iCount;
00044         uint16_t            m_iTotal;
00045         std::vector<SNode>  m_vecsChildren;
00046     };
00047 
00048     static void RemoveRCs( const SNode& sNode, float dCutoff, std::string& strSeq,
00049         std::vector<std::string>& vecstrOut ) {
00050         size_t  i;
00051 
00052         if( sNode.m_vecsChildren.empty( ) ) {
00053             vecstrOut.push_back( strSeq );
00054             return; }
00055 
00056         for( i = 0; i < sNode.m_vecsChildren.size( ); ++i ) {
00057             const SNode&    sChild  = sNode.m_vecsChildren[ i ];
00058 
00059             if( ( (float)sChild.m_iCount / sNode.m_iTotal ) < dCutoff )
00060                 continue;
00061             strSeq.push_back( sChild.m_cCharacter );
00062             RemoveRCs( sChild, dCutoff, strSeq, vecstrOut );
00063             strSeq.resize( strSeq.size( ) - 1 ); } }
00064 
00065     static std::string GetMotif( const SNode& sNode ) {
00066         std::ostringstream  ossm;
00067         size_t              i;
00068 
00069         switch( sNode.m_vecsChildren.size( ) ) {
00070             case 0:
00071                 break;
00072 
00073             case 1:
00074                 ossm << sNode.m_vecsChildren[ 0 ].m_cCharacter << ':' << sNode.m_vecsChildren[ 0 ].m_iCount;
00075                 if( !sNode.m_vecsChildren[ 0 ].m_vecsChildren.empty( ) )
00076                     ossm << ' ';
00077                 ossm << GetMotif( sNode.m_vecsChildren[ 0 ] );
00078                 break;
00079 
00080             default:
00081                 for( i = 0; i < sNode.m_vecsChildren.size( ); ++i ) {
00082                     const SNode&    sChild  = sNode.m_vecsChildren[ i ];
00083 
00084                     ossm << ( i ? "|" : "" ) << '(' << sChild.m_cCharacter << ':' << sChild.m_iCount;
00085                     if( !sChild.m_vecsChildren.empty( ) )
00086                         ossm << ' ';
00087                     ossm << GetMotif( sChild ) << ')'; } }
00088 
00089         return ossm.str( ); }
00090 
00091     static size_t Add( unsigned char cCharacter, SNode& sNode, uint16_t iCount = 1 ) {
00092         size_t  i;
00093 
00094         for( i = 0; i < sNode.m_vecsChildren.size( ); ++i )
00095             if( sNode.m_vecsChildren[ i ].m_cCharacter == cCharacter )
00096                 break;
00097         if( i < sNode.m_vecsChildren.size( ) )
00098             sNode.m_vecsChildren[ i ].m_iCount += iCount;
00099         else
00100             sNode.m_vecsChildren.push_back( SNode( cCharacter, iCount ) );
00101         sNode.m_iTotal += iCount;
00102 
00103         return i; }
00104 
00105     static size_t Add( const std::string& str, SNode& sNode ) {
00106 
00107         return ( str.empty( ) ? 0 :
00108             ( 1 + Add( str.substr( 1 ), sNode.m_vecsChildren[ Add( str[ 0 ], sNode ) ] ) ) ); }
00109 
00110     static size_t Add( const SNode& sIn, SNode& sOut ) {
00111         size_t  i, j, iCur, iRet;
00112 
00113         sOut.m_iCount += sIn.m_iCount;
00114         sOut.m_iTotal += sIn.m_iTotal;
00115         for( iRet = i = 0; i < sIn.m_vecsChildren.size( ); ++i ) {
00116             const SNode&    sChildIn    = sIn.m_vecsChildren[ i ];
00117 
00118             for( j = 0; j < sOut.m_vecsChildren.size( ); ++j )
00119                 if( sChildIn.m_cCharacter == sOut.m_vecsChildren[ j ].m_cCharacter )
00120                     break;
00121             if( j >= sOut.m_vecsChildren.size( ) ) {
00122                 sOut.m_vecsChildren.push_back( SNode( sChildIn.m_cCharacter ) );
00123                 sOut.m_vecsChildren.back( ).m_iCount = 0; }
00124             if( ( iCur = ( 1 + Add( sChildIn, sOut.m_vecsChildren[ j ] ) ) ) > iRet )
00125                 iRet = iCur; }
00126 
00127         return iRet; }
00128 
00129     template<class tType>
00130     static size_t Add( const tType& Addition, size_t iOffset, SNode& sNode ) {
00131         size_t  i, iRet, iCur;
00132 
00133         if( !iOffset )
00134             return Add( Addition, sNode );
00135 
00136         iOffset--;
00137         iRet = 0;
00138         for( i = 0; i < sNode.m_vecsChildren.size( ); ++i ) {
00139             iCur = ( 1 + Add( Addition, iOffset, sNode.m_vecsChildren[ i ] ) );
00140             if( iCur > iRet )
00141                 iRet = iCur; }
00142 
00143         return iRet; }
00144 
00145     static size_t Open( const std::string& strPST, SNode& sNode ) {
00146         std::stack<SNode*>  stckpsBranch;
00147         std::stack<size_t>  stckiDepth;
00148         SNode*              psNode;
00149         size_t              i, iCur, iMax;
00150         char                c;
00151         uint16_t            iCount;
00152 
00153         psNode = &sNode;
00154         iCount = 1;
00155         for( iCur = iMax = i = 0; i < strPST.size( ); ++i )
00156             switch( c = strPST[ i ] ) {
00157                 case '(':
00158                     stckpsBranch.push( psNode );
00159                     stckiDepth.push( iCur );
00160                     break;
00161 
00162                 case ')':
00163                     psNode = stckpsBranch.top( );
00164                     stckpsBranch.pop( );
00165                     iCur = stckiDepth.top( );
00166                     stckiDepth.pop( );
00167                     break;
00168 
00169                 case '|':
00170                 case ' ':
00171                     break;
00172 
00173                 default:
00174                     if( ( ( i + 2 ) >= strPST.length( ) ) || ( strPST[ i + 1 ] != ':' ) )
00175                         return -1;
00176                     iCount = atoi( strPST.c_str( ) + ++i + 1 );
00177                     while( isdigit( strPST[ ++i ] ) );
00178                     --i;
00179                     psNode = &psNode->m_vecsChildren[ Add( c, *psNode, iCount ) ];
00180                     if( ++iCur > iMax )
00181                         iMax = iCur; }
00182 
00183         return iMax; }
00184 
00185     template<class tType>
00186     static float Align( const SNode& sRoot, size_t iDepth, const tType& Data, size_t iLength,
00187         float dPenaltyGap, float dPenaltyMismatch, float dCutoff, int& iOffset ) {
00188         size_t  iBegin, iEnd, iCur, iMin, iGaps;
00189         float   dRet, dCur;
00190 
00191         dRet = FLT_MAX;
00192         dCur = dCutoff / dPenaltyGap;
00193         iBegin = ( dCur > iLength ) ? 0 : (size_t)ceil( iLength - dCur );
00194         iEnd = iLength + iDepth - iBegin;
00195         for( iMin = 0,iCur = iBegin; iCur < iEnd; ++iCur ) {
00196             iGaps = 0;
00197             if( iCur < iLength )
00198                 iGaps += iLength - iCur;
00199             if( iCur > iDepth )
00200                 iGaps += iCur - iDepth;
00201             dCur = dPenaltyGap * iGaps;
00202             dCur += dPenaltyMismatch * CPSTImpl::Align( Data, iLength, iCur, sRoot ); 
00203             if( dCur < dRet ) {
00204                 dRet = dCur;
00205                 iMin = iCur; } }
00206         if( dRet == FLT_MAX )
00207             dRet = ( max( iLength, iDepth ) - min( iLength, iDepth ) ) * dPenaltyGap;
00208 
00209         iOffset = (int)iLength - (int)iMin;
00210         return dRet; }
00211 
00212     static size_t Align( const std::string& strSequence, size_t, size_t iOffset, const SNode& sNode ) {
00213         size_t  i, iCur, iRet;
00214 
00215         if( strSequence.empty( ) || !iOffset )
00216             return 0;
00217 
00218         iRet = strSequence.length( );
00219         if( iOffset > iRet ) {
00220             iOffset--;
00221             for( i = 0; i < sNode.m_vecsChildren.size( ); ++i )
00222                 if( ( iCur = Align( strSequence, 0, iOffset, sNode.m_vecsChildren[ i ] ) ) < iRet )
00223                     iRet = iCur;
00224             return iRet; }
00225 
00226         for( i = 0; i < sNode.m_vecsChildren.size( ); ++i ) {
00227             const SNode&    sChild  = sNode.m_vecsChildren[ i ];
00228 
00229             iCur = ( ( sChild.m_cCharacter == strSequence[ strSequence.length( ) - iOffset ] ) ? 0 : 1 ) +
00230                 Align( strSequence.substr( strSequence.length( ) - iOffset + 1 ), 0, iOffset - 1, sChild );
00231             if( iCur < iRet )
00232                 iRet = iCur; }
00233 
00234         return iRet; }
00235 
00236     static size_t Align( const SNode& sThem, size_t iDepth, size_t iOffset, const SNode& sUs ) {
00237         size_t  iBest, iRet;
00238 
00239         Align( sThem, iDepth, iOffset, sUs, iBest = -1, iRet = 0 );
00240 //      assert( iBest == iRet );
00241         return iRet; }
00242 
00243     static void Align( const SNode& sThem, size_t iDepth, size_t iOffset, const SNode& sUs, size_t& iBest,
00244         size_t& iOut ) {
00245         size_t  i, j, iCur;
00246 
00247         if( sThem.m_vecsChildren.empty( ) || sUs.m_vecsChildren.empty( ) ) {
00248             if( iOut < iBest )
00249                 iBest = iOut;
00250             return; }
00251 
00252         if( iOffset > iDepth ) {
00253             iOffset--;
00254             for( i = 0; i < sUs.m_vecsChildren.size( ); ++i )
00255                 Align( sThem, iDepth, iOffset, sUs.m_vecsChildren[ i ], iBest, iOut );
00256             return; }
00257         if( iOffset < iDepth ) {
00258             iDepth--;
00259             for( i = 0; i < sThem.m_vecsChildren.size( ); ++i )
00260                 Align( sThem.m_vecsChildren[ i ], iDepth, iOffset, sUs, iBest, iOut );
00261             return; }
00262 
00263         iDepth--;
00264         iOffset--;
00265         for( i = 0; i < sUs.m_vecsChildren.size( ); ++i ) {
00266             const SNode&    sChildUs    = sUs.m_vecsChildren[ i ];
00267 
00268             for( j = 0; j < sThem.m_vecsChildren.size( ); ++j ) {
00269                 const SNode&    sChildThem  = sThem.m_vecsChildren[ j ];
00270 
00271                 if( ( iCur = ( iOut + ( ( sChildUs.m_cCharacter == sChildThem.m_cCharacter ) ? 0 : 1 ) ) ) >=
00272                     iBest )
00273                     continue;
00274                 Align( sChildThem, iDepth, iOffset, sChildUs, iBest, iCur ); } }
00275         iOut = iBest; }
00276 
00277 
00278     static void Integrate( const SNode& sNode, size_t& iSum ) {
00279         size_t  i;
00280 
00281         iSum += sNode.m_iTotal;
00282         for( i = 0; i < sNode.m_vecsChildren.size( ); ++i )
00283             Integrate( sNode.m_vecsChildren[ i ], iSum ); }
00284 
00285     static bool Simplify( float dMinFrequency, SNode& sNode ) {
00286         size_t      i;
00287         uint16_t    iTotal;
00288 
00289         iTotal = sNode.m_iTotal;
00290         sNode.m_iTotal = 0;
00291         for( i = 0; i < sNode.m_vecsChildren.size( ); ++i ) {
00292             SNode&  sChild  = sNode.m_vecsChildren[ i ];
00293 
00294             if( ( (float)sChild.m_iCount / iTotal ) < dMinFrequency )
00295                 sNode.m_vecsChildren.erase( sNode.m_vecsChildren.begin( ) + i-- );
00296             else {
00297                 if( !Simplify( dMinFrequency, sChild ) )
00298                     return false;
00299                 sNode.m_iTotal += ( sChild.m_iCount = max( (uint16_t)1, (uint16_t)( sChild.m_iCount *
00300                     dMinFrequency ) ) ); } }
00301 
00302         return true; }
00303 
00304     CPSTImpl( size_t iArity ) : m_iDepth(0), m_iArity(iArity) { }
00305 
00306     void Clear( ) {
00307 
00308         m_iDepth = m_sRoot.m_iTotal = m_sRoot.m_iCount = 0;
00309         m_sRoot.m_vecsChildren.clear( ); }
00310 
00311     float GetMatch( const std::string& strTarget, const SNode& sNode, size_t iOffset,
00312         size_t& iMatched ) const {
00313         size_t  i, iCur, iMax;
00314         float   dRet, dCur;
00315 
00316         if( strTarget.empty( ) || sNode.m_vecsChildren.empty( ) )
00317             return 1;
00318 
00319         if( iOffset ) {
00320             iOffset--;
00321             dRet = 0;
00322             for( iMax = i = 0; i < sNode.m_vecsChildren.size( ); ++i ) {
00323                 iCur = 0;
00324                 dCur = GetMatch( strTarget, sNode.m_vecsChildren[ i ], iOffset, iCur );
00325                 dCur *= (float)sNode.m_vecsChildren[ i ].m_iCount / sNode.m_iTotal;
00326                 if( !iMax || ( ( dCur / iCur ) > ( dRet / iMax ) ) ) {
00327                     dRet = dCur;
00328                     iMax = iCur; } }
00329             iMatched = iMax;
00330             return dRet; }
00331 
00332         for( i = 0; i < sNode.m_vecsChildren.size( ); ++i ) {
00333             const SNode&    sChild  = sNode.m_vecsChildren[ i ];
00334 
00335             if( strTarget[ 0 ] == sChild.m_cCharacter ) {
00336                 iMatched++;
00337                 return ( sChild.m_iCount * GetMatch( strTarget.substr( 1 ), sChild, 0, iMatched ) /
00338                     sNode.m_iTotal ); } }
00339 
00340         return 1; }
00341 
00342     bool GetPWM( const SNode& sNode, size_t iDepth, std::map<unsigned char, size_t>& mapciCharacters,
00343         CFullMatrix<uint16_t>& MatPWM ) const {
00344         size_t                                          i, iChar;
00345         std::map<unsigned char, size_t>::const_iterator iterChar;
00346 
00347         for( i = 0; i < sNode.m_vecsChildren.size( ); ++i ) {
00348             const SNode&    sChild  = sNode.m_vecsChildren[ i ];
00349 
00350             if( ( iterChar = mapciCharacters.find( sChild.m_cCharacter ) ) == mapciCharacters.end( ) ) {
00351                 iChar = mapciCharacters.size( );
00352                 mapciCharacters[ sChild.m_cCharacter ] = iChar; }
00353             else
00354                 iChar = iterChar->second;
00355             MatPWM.Get( iChar, iDepth ) += sChild.m_iCount;
00356             if( !GetPWM( sChild, iDepth + 1, mapciCharacters, MatPWM ) )
00357                 return false; }
00358 
00359         return true; }
00360 
00361     SNode   m_sRoot;
00362     size_t  m_iDepth;
00363     size_t  m_iArity;
00364 };
00365 
00366 }
00367 
00368 #endif // PSTI_H