Sleipnir
src/coalescemotifsi.h
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #ifndef COALESCEMOTIFSI_H
00023 #define COALESCEMOTIFSI_H
00024 
00025 #include <float.h>
00026 #include <math.h>
00027 
00028 #include <string.h>
00029 #include <map>
00030 
00031 #include "coalescestructsi.h"
00032 
00033 namespace Sleipnir {
00034 
00035 class CPST;
00036 
00037 class CCoalesceMotifLibraryImpl {
00038 protected:
00039     static const char   c_szBases[];
00040     static const char   c_szComplements[];
00041     static const size_t c_iShift        = 2; // ceil( log2( strlen( c_szBases ) ) )
00042     static const char   c_cSeparator    = '|';
00043     static const char   c_cCluster      = 'C';
00044 
00045     typedef std::map<std::pair<uint32_t, uint32_t>, uint32_t>   TMapPrIII;
00046 
00047     enum EType {
00048         ETypeKMer,
00049         ETypeRC,
00050         ETypePST
00051     };
00052 
00053     struct SKnowns {
00054         typedef std::vector<float>          TVecD;
00055         typedef std::pair<TVecD, TVecD>     TPrVecDVecD;
00056         typedef std::vector<TPrVecDVecD>    TVecPr;
00057 
00058         void Match( const std::vector<float>&, SMotifMatch::EType, std::map<std::string, float>& ) const;
00059 
00060         size_t GetSize( ) const {
00061 
00062             return m_mapstrvecKnown.size( ); }
00063 
00064         void Add( const std::string& strID, const std::vector<std::string>& vecstrLine ) {
00065             size_t  i, iFromPos, iFromBase, iToPos, iToBase;
00066             TVecPr& vecprMotif  = m_mapstrvecKnown[ strID ];
00067 
00068             vecprMotif.push_back( TPrVecDVecD( ) );
00069             {
00070                 TPrVecDVecD&    prvecdvecdMotif = vecprMotif.back( );
00071 
00072                 prvecdvecdMotif.first.resize( vecstrLine.size( ) - 1 );
00073                 for( i = 1; i < vecstrLine.size( ); ++i )
00074                     prvecdvecdMotif.first[ i - 1 ] = (float)atof( vecstrLine[ i ].c_str( ) );
00075 
00076                 prvecdvecdMotif.second.resize( prvecdvecdMotif.first.size( ) );
00077                 i = prvecdvecdMotif.second.size( ) / 4;
00078                 for( iFromPos = 0; iFromPos < i; ++iFromPos ) {
00079                     iToPos = i - iFromPos - 1;
00080                     for( iFromBase = 0; iFromBase < 4; ++iFromBase ) {
00081                         iToBase = 3 - iFromBase;
00082                         prvecdvecdMotif.second[ ( 4 * iToPos ) + iToBase ] =
00083                             prvecdvecdMotif.first[ ( 4 * iFromPos ) + iFromBase ]; } }
00084             } }
00085 
00086         std::map<std::string, TVecPr>   m_mapstrvecKnown;
00087     };
00088 
00089     static size_t CountKMers( size_t iK ) {
00090 
00091         return ( 1 << ( iK << 1 ) ); }
00092 
00093     static size_t CountRCs( size_t iK ) {
00094         size_t  iKMers;
00095 
00096         iKMers = CountKMers( iK ) >> 1;
00097         return ( iKMers - ( ( iK % 2 ) ? 0 : ( CountKMers( iK >> 1 ) >> 1 ) ) ); }
00098 
00099     static std::string GetComplement( const std::string& strKMer ) {
00100         std::string strRet;
00101         size_t      i;
00102 
00103         strRet.resize( strKMer.length( ) );
00104         for( i = 0; i < strRet.length( ); ++i )
00105             strRet[ i ] = GetComplement( strKMer[ i ] );
00106 
00107         return strRet; }
00108 
00109     static char GetComplement( char cBase ) {
00110         const char* pc;
00111 
00112         return ( ( pc = strchr( c_szBases, cBase ) ) ? c_szComplements[ pc - c_szBases ] : cBase ); }
00113 
00114     static uint32_t KMer2ID( const std::string& strKMer, bool fRC = false ) {
00115         size_t          i, iIndex;
00116         uint32_t        iRet;
00117         const char*     pc;
00118         unsigned char   c;
00119 
00120         for( i = iRet = 0; i < strKMer.length( ); ++i ) {
00121             iIndex = fRC ? ( strKMer.length( ) - i - 1 ) : i;
00122             if( !( pc = strchr( c_szBases, strKMer[ iIndex ] ) ) )
00123                 return -1;
00124             c = (unsigned char)( pc - c_szBases );
00125             if( fRC ) {
00126                 if( !( pc = strchr( c_szBases, c_szComplements[ c ] ) ) )
00127                     return -1;
00128                 c = (unsigned char)( pc - c_szBases ); }
00129             iRet = ( iRet << c_iShift ) | c; }
00130 
00131         return iRet; }
00132 
00133     static std::string ID2KMer( uint32_t iID, size_t iK ) {
00134         static const size_t c_iMask = ( 1 << c_iShift ) - 1;
00135         std::string strRet;
00136         size_t      i;
00137 
00138         strRet.resize( iK );
00139         for( i = 0; i < iK; ++i ) {
00140             strRet[ iK - i - 1 ] = c_szBases[ iID & c_iMask ];
00141             iID >>= c_iShift; }
00142 
00143         return strRet; }
00144 
00145     static bool IsIgnorableKMer( const std::string& strKMer ) {
00146         size_t  i;
00147 
00148         for( i = 0; i < strKMer.size( ); ++i )
00149             if( !strchr( c_szBases, strKMer[ i ] ) )
00150                 return true;
00151 
00152         return false; }
00153 
00154     static bool GetPWM( const std::string& strKMer, CFullMatrix<uint16_t>& MatPWM ) {
00155         size_t  i, j;
00156 
00157         if( ( MatPWM.GetColumns( ) != strlen( c_szBases ) ) || ( MatPWM.GetRows( ) != strKMer.length( ) ) ) {
00158             MatPWM.Initialize( strlen( c_szBases ), strKMer.length( ) );
00159             MatPWM.Clear( ); }
00160         for( i = 0; i < strKMer.length( ); ++i ) {
00161             for( j = 0; j < MatPWM.GetRows( ); ++j )
00162                 if( strKMer[ i ] == c_szBases[ j ] )
00163                     break;
00164             if( j >= MatPWM.GetRows( ) )
00165                 return false;
00166             MatPWM.Get( j, i )++; }
00167 
00168         return true; }
00169 
00170     static std::string GetReverseComplement( const std::string& strKMer ) {
00171         std::string strReverse;
00172 
00173         strReverse = strKMer;
00174         std::reverse( strReverse.begin( ), strReverse.end( ) );
00175         return GetComplement( strReverse ); }
00176 
00177     static float GetInformation( const CFullMatrix<uint16_t>& MatPWM ) {
00178         CDataMatrix         MatProbs;
00179         size_t              iPos, iFrom, iTo;
00180         std::vector<size_t> veciTotals;
00181         float               d, dIC, dFrom, dTo;
00182 
00183         if( MatPWM.GetColumns( ) < 2 )
00184             return 0;
00185 
00186         veciTotals.resize( MatPWM.GetColumns( ) );
00187         for( iPos = 0; iPos < MatPWM.GetColumns( ); ++iPos )
00188             for( iFrom = 0; iFrom < MatPWM.GetRows( ); ++iFrom )
00189                 veciTotals[ iPos ] += MatPWM.Get( iFrom, iPos );
00190 
00191         MatProbs.Initialize( MatPWM.GetRows( ), MatPWM.GetRows( ) );
00192         MatProbs.Clear( );
00193         for( iPos = 1; iPos < MatPWM.GetColumns( ); ++iPos )
00194             for( iFrom = 0; iFrom < MatPWM.GetRows( ); ++iFrom ) {
00195                 if( !( dFrom = ( (float)MatPWM.Get( iFrom, iPos - 1 ) / veciTotals[ iPos - 1 ] ) ) )
00196                     continue;
00197                 for( iTo = 0; iTo < MatPWM.GetRows( ); ++iTo )
00198                     if( dTo = ( (float)MatPWM.Get( iTo, iPos ) / veciTotals[ iPos ] ) )
00199                         MatProbs.Get( iFrom, iTo ) += dFrom * dTo; }
00200         for( iFrom = 0; iFrom < MatProbs.GetRows( ); ++iFrom ) {
00201             for( d = 0,iTo = 0; iTo < MatProbs.GetColumns( ); ++iTo )
00202                 d += MatProbs.Get( iFrom, iTo );
00203             if( !d )
00204                 continue;
00205             for( iTo = 0; iTo < MatProbs.GetColumns( ); ++iTo )
00206                 MatProbs.Get( iFrom, iTo ) /= d; }
00207         dIC = 0;
00208         for( iFrom = 0; iFrom < MatProbs.GetRows( ); ++iFrom )
00209             for( iTo = 0; iTo < MatProbs.GetColumns( ); ++iTo )
00210                 if( d = MatProbs.Get( iFrom, iTo ) / ( MatPWM.GetColumns( ) - 1 ) )
00211                     dIC += d * log( d );
00212 
00213         return ( dIC / ( -log( 2.0f ) * MatProbs.GetColumns( ) ) ); }
00214 
00215     CCoalesceMotifLibraryImpl( size_t iK ) : m_iK(iK), m_dPenaltyGap(1), m_dPenaltyMismatch(2) {
00216         uint32_t    i, j, iRC;
00217 
00218 // TODO: if I was smart, I could do this with a direct encoding...
00219         m_veciKMer2RC.resize( GetKMers( ) );
00220         m_veciRC2KMer.resize( GetRCs( ) );
00221         std::fill( m_veciKMer2RC.begin( ), m_veciKMer2RC.end( ), -1 );
00222         for( i = j = 0; i < m_veciKMer2RC.size( ); ++i )
00223             if( m_veciKMer2RC[ i ] == -1 ) {
00224                 iRC = KMer2ID( ID2KMer( i, m_iK ), true );
00225                 if( iRC != i ) {
00226                     m_veciKMer2RC[ i ] = m_veciKMer2RC[ iRC ] = j;
00227                     m_veciRC2KMer[ j++ ] = i; } } }
00228 
00229     virtual ~CCoalesceMotifLibraryImpl( );
00230 
00231     std::string GetMotif( uint32_t ) const;
00232     CPST* CreatePST( uint32_t& );
00233     uint32_t MergeKMers( const std::string&, const std::string&, float, bool );
00234     uint32_t MergeKMerRC( uint32_t, uint32_t, float, bool );
00235     uint32_t MergeKMerPST( const std::string&, const CPST&, float, bool );
00236     uint32_t MergeRCs( uint32_t, uint32_t, float, bool );
00237     uint32_t MergeRCPST( uint32_t, const CPST&, float, bool );
00238     uint32_t MergePSTs( const CPST&, const CPST&, float, bool );
00239     float AlignKMers( const std::string&, const std::string&, float ) const;
00240     float AlignKMerRC( const std::string&, uint32_t, float ) const;
00241     float AlignKMerPST( const std::string&, const CPST&, float ) const;
00242     float AlignRCs( uint32_t, uint32_t, float ) const;
00243     float AlignRCPST( uint32_t, const CPST&, float ) const;
00244     float AlignPSTs( const CPST&, const CPST&, float ) const;
00245     uint32_t RemoveRCs( const CPST&, float, float );
00246     bool GetPWM( uint32_t, float, float, float, bool, CFullMatrix<uint16_t>& ) const;
00247 
00248     EType GetType( uint32_t iMotif ) const {
00249 
00250         if( iMotif < GetKMers( ) )
00251             return ETypeKMer;
00252         if( iMotif < GetBasePSTs( ) )
00253             return ETypeRC;
00254 
00255         return ETypePST; }
00256 
00257     uint32_t GetKMers( ) const {
00258 
00259         return (uint32_t)CountKMers( m_iK ); }
00260 
00261     uint32_t GetRCs( ) const {
00262 
00263             return (uint32_t)CountRCs( m_iK ); }
00264 
00265     uint32_t GetBaseRCs( ) const {
00266 
00267         return GetKMers( ); }
00268 
00269     uint32_t GetBasePSTs( ) const {
00270 
00271         return ( GetBaseRCs( ) + GetRCs( ) ); }
00272 
00273     CPST* GetPST( uint32_t iMotif ) const {
00274 
00275         return m_vecpPSTs[ iMotif - GetBasePSTs( ) ]; }
00276 
00277     uint32_t GetPSTs( ) const {
00278 
00279         return (uint32_t)m_vecpPSTs.size( ); }
00280 
00281     float Align( const std::string& strFixed, const std::string& strMobile, float dCutoff, int& iRet ) const {
00282         const std::string&  strShort    = ( strFixed.length( ) < strMobile.length( ) ) ? strFixed : strMobile;
00283         const std::string&  strLong     = ( strFixed.length( ) < strMobile.length( ) ) ? strMobile : strFixed;
00284         size_t              iBegin, iEnd, iOffset, i, iLength, iMin;
00285         float               dRet, dCur;
00286 
00287         dRet = FLT_MAX;
00288         dCur = dCutoff / m_dPenaltyGap;
00289         iLength = strShort.length( ) + strLong.length( );
00290         iBegin = ( dCur < iLength ) ? (size_t)ceil( ( iLength - dCur ) / 2 ) : 0;
00291         iEnd = iLength + 1 - iBegin;
00292         for( iMin = 0,iOffset = iBegin; iOffset < iEnd; ++iOffset ) {
00293             i = ( iOffset <= strShort.length( ) ) ? iOffset : min( strShort.length( ), iLength - iOffset );
00294             dCur = m_dPenaltyGap * ( iLength - ( 2 * i ) );
00295             for( i = ( iOffset <= strShort.length( ) ) ? 0 : ( iOffset - strShort.length( ) );
00296                 i < min( strShort.length( ), iOffset ); ++i )
00297                 if( strShort[ i ] != strLong[ strLong.length( ) - iOffset + i ] )
00298                     dCur += m_dPenaltyMismatch;
00299             if( dCur < dRet ) {
00300                 dRet = dCur;
00301                 iMin = iOffset; } }
00302 
00303         iRet = (int)strLong.length( ) - (int)iMin;
00304         if( strFixed.length( ) < strMobile.length( ) )
00305             iRet = -iRet;
00306         return dRet; }
00307 
00308     std::string GetRCOne( uint32_t iMotif ) const {
00309 
00310         return ID2KMer( (uint32_t)m_veciRC2KMer[ iMotif - GetBaseRCs( ) ], m_iK ); }
00311 
00312     float                   m_dPenaltyGap;
00313     float                   m_dPenaltyMismatch;
00314     size_t                  m_iK;
00315     std::vector<uint32_t>   m_veciKMer2RC;
00316     std::vector<uint32_t>   m_veciRC2KMer;
00317     std::vector<CPST*>      m_vecpPSTs;
00318     TMapPrIII               m_mappriiiMerged;
00319     SKnowns                 m_sKnowns;
00320 };
00321 
00322 }
00323 
00324 #endif // COALESCEMOTIFSI_H