Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #ifndef COALESCEMOTIFSI_H 00023 #define COALESCEMOTIFSI_H 00024 00025 #include <float.h> 00026 #include <math.h> 00027 00028 #include <string.h> 00029 #include <map> 00030 00031 #include "coalescestructsi.h" 00032 00033 namespace Sleipnir { 00034 00035 class CPST; 00036 00037 class CCoalesceMotifLibraryImpl { 00038 protected: 00039 static const char c_szBases[]; 00040 static const char c_szComplements[]; 00041 static const size_t c_iShift = 2; // ceil( log2( strlen( c_szBases ) ) ) 00042 static const char c_cSeparator = '|'; 00043 static const char c_cCluster = 'C'; 00044 00045 typedef std::map<std::pair<uint32_t, uint32_t>, uint32_t> TMapPrIII; 00046 00047 enum EType { 00048 ETypeKMer, 00049 ETypeRC, 00050 ETypePST 00051 }; 00052 00053 struct SKnowns { 00054 typedef std::vector<float> TVecD; 00055 typedef std::pair<TVecD, TVecD> TPrVecDVecD; 00056 typedef std::vector<TPrVecDVecD> TVecPr; 00057 00058 void Match( const std::vector<float>&, SMotifMatch::EType, std::map<std::string, float>& ) const; 00059 00060 size_t GetSize( ) const { 00061 00062 return m_mapstrvecKnown.size( ); } 00063 00064 void Add( const std::string& strID, const std::vector<std::string>& vecstrLine ) { 00065 size_t i, iFromPos, iFromBase, iToPos, iToBase; 00066 TVecPr& vecprMotif = m_mapstrvecKnown[ strID ]; 00067 00068 vecprMotif.push_back( TPrVecDVecD( ) ); 00069 { 00070 TPrVecDVecD& prvecdvecdMotif = vecprMotif.back( ); 00071 00072 prvecdvecdMotif.first.resize( vecstrLine.size( ) - 1 ); 00073 for( i = 1; i < vecstrLine.size( ); ++i ) 00074 prvecdvecdMotif.first[ i - 1 ] = (float)atof( vecstrLine[ i ].c_str( ) ); 00075 00076 prvecdvecdMotif.second.resize( prvecdvecdMotif.first.size( ) ); 00077 i = prvecdvecdMotif.second.size( ) / 4; 00078 for( iFromPos = 0; iFromPos < i; ++iFromPos ) { 00079 iToPos = i - iFromPos - 1; 00080 for( iFromBase = 0; iFromBase < 4; ++iFromBase ) { 00081 iToBase = 3 - iFromBase; 00082 prvecdvecdMotif.second[ ( 4 * iToPos ) + iToBase ] = 00083 prvecdvecdMotif.first[ ( 4 * iFromPos ) + iFromBase ]; } } 00084 } } 00085 00086 std::map<std::string, TVecPr> m_mapstrvecKnown; 00087 }; 00088 00089 static size_t CountKMers( size_t iK ) { 00090 00091 return ( 1 << ( iK << 1 ) ); } 00092 00093 static size_t CountRCs( size_t iK ) { 00094 size_t iKMers; 00095 00096 iKMers = CountKMers( iK ) >> 1; 00097 return ( iKMers - ( ( iK % 2 ) ? 0 : ( CountKMers( iK >> 1 ) >> 1 ) ) ); } 00098 00099 static std::string GetComplement( const std::string& strKMer ) { 00100 std::string strRet; 00101 size_t i; 00102 00103 strRet.resize( strKMer.length( ) ); 00104 for( i = 0; i < strRet.length( ); ++i ) 00105 strRet[ i ] = GetComplement( strKMer[ i ] ); 00106 00107 return strRet; } 00108 00109 static char GetComplement( char cBase ) { 00110 const char* pc; 00111 00112 return ( ( pc = strchr( c_szBases, cBase ) ) ? c_szComplements[ pc - c_szBases ] : cBase ); } 00113 00114 static uint32_t KMer2ID( const std::string& strKMer, bool fRC = false ) { 00115 size_t i, iIndex; 00116 uint32_t iRet; 00117 const char* pc; 00118 unsigned char c; 00119 00120 for( i = iRet = 0; i < strKMer.length( ); ++i ) { 00121 iIndex = fRC ? ( strKMer.length( ) - i - 1 ) : i; 00122 if( !( pc = strchr( c_szBases, strKMer[ iIndex ] ) ) ) 00123 return -1; 00124 c = (unsigned char)( pc - c_szBases ); 00125 if( fRC ) { 00126 if( !( pc = strchr( c_szBases, c_szComplements[ c ] ) ) ) 00127 return -1; 00128 c = (unsigned char)( pc - c_szBases ); } 00129 iRet = ( iRet << c_iShift ) | c; } 00130 00131 return iRet; } 00132 00133 static std::string ID2KMer( uint32_t iID, size_t iK ) { 00134 static const size_t c_iMask = ( 1 << c_iShift ) - 1; 00135 std::string strRet; 00136 size_t i; 00137 00138 strRet.resize( iK ); 00139 for( i = 0; i < iK; ++i ) { 00140 strRet[ iK - i - 1 ] = c_szBases[ iID & c_iMask ]; 00141 iID >>= c_iShift; } 00142 00143 return strRet; } 00144 00145 static bool IsIgnorableKMer( const std::string& strKMer ) { 00146 size_t i; 00147 00148 for( i = 0; i < strKMer.size( ); ++i ) 00149 if( !strchr( c_szBases, strKMer[ i ] ) ) 00150 return true; 00151 00152 return false; } 00153 00154 static bool GetPWM( const std::string& strKMer, CFullMatrix<uint16_t>& MatPWM ) { 00155 size_t i, j; 00156 00157 if( ( MatPWM.GetColumns( ) != strlen( c_szBases ) ) || ( MatPWM.GetRows( ) != strKMer.length( ) ) ) { 00158 MatPWM.Initialize( strlen( c_szBases ), strKMer.length( ) ); 00159 MatPWM.Clear( ); } 00160 for( i = 0; i < strKMer.length( ); ++i ) { 00161 for( j = 0; j < MatPWM.GetRows( ); ++j ) 00162 if( strKMer[ i ] == c_szBases[ j ] ) 00163 break; 00164 if( j >= MatPWM.GetRows( ) ) 00165 return false; 00166 MatPWM.Get( j, i )++; } 00167 00168 return true; } 00169 00170 static std::string GetReverseComplement( const std::string& strKMer ) { 00171 std::string strReverse; 00172 00173 strReverse = strKMer; 00174 std::reverse( strReverse.begin( ), strReverse.end( ) ); 00175 return GetComplement( strReverse ); } 00176 00177 static float GetInformation( const CFullMatrix<uint16_t>& MatPWM ) { 00178 CDataMatrix MatProbs; 00179 size_t iPos, iFrom, iTo; 00180 std::vector<size_t> veciTotals; 00181 float d, dIC, dFrom, dTo; 00182 00183 if( MatPWM.GetColumns( ) < 2 ) 00184 return 0; 00185 00186 veciTotals.resize( MatPWM.GetColumns( ) ); 00187 for( iPos = 0; iPos < MatPWM.GetColumns( ); ++iPos ) 00188 for( iFrom = 0; iFrom < MatPWM.GetRows( ); ++iFrom ) 00189 veciTotals[ iPos ] += MatPWM.Get( iFrom, iPos ); 00190 00191 MatProbs.Initialize( MatPWM.GetRows( ), MatPWM.GetRows( ) ); 00192 MatProbs.Clear( ); 00193 for( iPos = 1; iPos < MatPWM.GetColumns( ); ++iPos ) 00194 for( iFrom = 0; iFrom < MatPWM.GetRows( ); ++iFrom ) { 00195 if( !( dFrom = ( (float)MatPWM.Get( iFrom, iPos - 1 ) / veciTotals[ iPos - 1 ] ) ) ) 00196 continue; 00197 for( iTo = 0; iTo < MatPWM.GetRows( ); ++iTo ) 00198 if( dTo = ( (float)MatPWM.Get( iTo, iPos ) / veciTotals[ iPos ] ) ) 00199 MatProbs.Get( iFrom, iTo ) += dFrom * dTo; } 00200 for( iFrom = 0; iFrom < MatProbs.GetRows( ); ++iFrom ) { 00201 for( d = 0,iTo = 0; iTo < MatProbs.GetColumns( ); ++iTo ) 00202 d += MatProbs.Get( iFrom, iTo ); 00203 if( !d ) 00204 continue; 00205 for( iTo = 0; iTo < MatProbs.GetColumns( ); ++iTo ) 00206 MatProbs.Get( iFrom, iTo ) /= d; } 00207 dIC = 0; 00208 for( iFrom = 0; iFrom < MatProbs.GetRows( ); ++iFrom ) 00209 for( iTo = 0; iTo < MatProbs.GetColumns( ); ++iTo ) 00210 if( d = MatProbs.Get( iFrom, iTo ) / ( MatPWM.GetColumns( ) - 1 ) ) 00211 dIC += d * log( d ); 00212 00213 return ( dIC / ( -log( 2.0f ) * MatProbs.GetColumns( ) ) ); } 00214 00215 CCoalesceMotifLibraryImpl( size_t iK ) : m_iK(iK), m_dPenaltyGap(1), m_dPenaltyMismatch(2) { 00216 uint32_t i, j, iRC; 00217 00218 // TODO: if I was smart, I could do this with a direct encoding... 00219 m_veciKMer2RC.resize( GetKMers( ) ); 00220 m_veciRC2KMer.resize( GetRCs( ) ); 00221 std::fill( m_veciKMer2RC.begin( ), m_veciKMer2RC.end( ), -1 ); 00222 for( i = j = 0; i < m_veciKMer2RC.size( ); ++i ) 00223 if( m_veciKMer2RC[ i ] == -1 ) { 00224 iRC = KMer2ID( ID2KMer( i, m_iK ), true ); 00225 if( iRC != i ) { 00226 m_veciKMer2RC[ i ] = m_veciKMer2RC[ iRC ] = j; 00227 m_veciRC2KMer[ j++ ] = i; } } } 00228 00229 virtual ~CCoalesceMotifLibraryImpl( ); 00230 00231 std::string GetMotif( uint32_t ) const; 00232 CPST* CreatePST( uint32_t& ); 00233 uint32_t MergeKMers( const std::string&, const std::string&, float, bool ); 00234 uint32_t MergeKMerRC( uint32_t, uint32_t, float, bool ); 00235 uint32_t MergeKMerPST( const std::string&, const CPST&, float, bool ); 00236 uint32_t MergeRCs( uint32_t, uint32_t, float, bool ); 00237 uint32_t MergeRCPST( uint32_t, const CPST&, float, bool ); 00238 uint32_t MergePSTs( const CPST&, const CPST&, float, bool ); 00239 float AlignKMers( const std::string&, const std::string&, float ) const; 00240 float AlignKMerRC( const std::string&, uint32_t, float ) const; 00241 float AlignKMerPST( const std::string&, const CPST&, float ) const; 00242 float AlignRCs( uint32_t, uint32_t, float ) const; 00243 float AlignRCPST( uint32_t, const CPST&, float ) const; 00244 float AlignPSTs( const CPST&, const CPST&, float ) const; 00245 uint32_t RemoveRCs( const CPST&, float, float ); 00246 bool GetPWM( uint32_t, float, float, float, bool, CFullMatrix<uint16_t>& ) const; 00247 00248 EType GetType( uint32_t iMotif ) const { 00249 00250 if( iMotif < GetKMers( ) ) 00251 return ETypeKMer; 00252 if( iMotif < GetBasePSTs( ) ) 00253 return ETypeRC; 00254 00255 return ETypePST; } 00256 00257 uint32_t GetKMers( ) const { 00258 00259 return (uint32_t)CountKMers( m_iK ); } 00260 00261 uint32_t GetRCs( ) const { 00262 00263 return (uint32_t)CountRCs( m_iK ); } 00264 00265 uint32_t GetBaseRCs( ) const { 00266 00267 return GetKMers( ); } 00268 00269 uint32_t GetBasePSTs( ) const { 00270 00271 return ( GetBaseRCs( ) + GetRCs( ) ); } 00272 00273 CPST* GetPST( uint32_t iMotif ) const { 00274 00275 return m_vecpPSTs[ iMotif - GetBasePSTs( ) ]; } 00276 00277 uint32_t GetPSTs( ) const { 00278 00279 return (uint32_t)m_vecpPSTs.size( ); } 00280 00281 float Align( const std::string& strFixed, const std::string& strMobile, float dCutoff, int& iRet ) const { 00282 const std::string& strShort = ( strFixed.length( ) < strMobile.length( ) ) ? strFixed : strMobile; 00283 const std::string& strLong = ( strFixed.length( ) < strMobile.length( ) ) ? strMobile : strFixed; 00284 size_t iBegin, iEnd, iOffset, i, iLength, iMin; 00285 float dRet, dCur; 00286 00287 dRet = FLT_MAX; 00288 dCur = dCutoff / m_dPenaltyGap; 00289 iLength = strShort.length( ) + strLong.length( ); 00290 iBegin = ( dCur < iLength ) ? (size_t)ceil( ( iLength - dCur ) / 2 ) : 0; 00291 iEnd = iLength + 1 - iBegin; 00292 for( iMin = 0,iOffset = iBegin; iOffset < iEnd; ++iOffset ) { 00293 i = ( iOffset <= strShort.length( ) ) ? iOffset : min( strShort.length( ), iLength - iOffset ); 00294 dCur = m_dPenaltyGap * ( iLength - ( 2 * i ) ); 00295 for( i = ( iOffset <= strShort.length( ) ) ? 0 : ( iOffset - strShort.length( ) ); 00296 i < min( strShort.length( ), iOffset ); ++i ) 00297 if( strShort[ i ] != strLong[ strLong.length( ) - iOffset + i ] ) 00298 dCur += m_dPenaltyMismatch; 00299 if( dCur < dRet ) { 00300 dRet = dCur; 00301 iMin = iOffset; } } 00302 00303 iRet = (int)strLong.length( ) - (int)iMin; 00304 if( strFixed.length( ) < strMobile.length( ) ) 00305 iRet = -iRet; 00306 return dRet; } 00307 00308 std::string GetRCOne( uint32_t iMotif ) const { 00309 00310 return ID2KMer( (uint32_t)m_veciRC2KMer[ iMotif - GetBaseRCs( ) ], m_iK ); } 00311 00312 float m_dPenaltyGap; 00313 float m_dPenaltyMismatch; 00314 size_t m_iK; 00315 std::vector<uint32_t> m_veciKMer2RC; 00316 std::vector<uint32_t> m_veciRC2KMer; 00317 std::vector<CPST*> m_vecpPSTs; 00318 TMapPrIII m_mappriiiMerged; 00319 SKnowns m_sKnowns; 00320 }; 00321 00322 } 00323 00324 #endif // COALESCEMOTIFSI_H