Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "coalescestructsi.h" 00024 #include "coalescemotifs.h" 00025 #include "fasta.h" 00026 #include "pcl.h" 00027 #include "statistics.h" 00028 #include "clusthierarchical.h" 00029 00030 namespace Sleipnir { 00031 00032 // SCoalesceModifiers 00033 00034 void SCoalesceModifiers::Initialize( const CPCL& PCL ) { 00035 size_t i, j; 00036 00037 m_vecveciPCL2Wiggles.resize( m_vecpWiggles.size( ) ); 00038 for( i = 0; i < m_vecveciPCL2Wiggles.size( ); ++i ) { 00039 m_vecveciPCL2Wiggles[ i ].resize( PCL.GetGenes( ) ); 00040 for( j = 0; j < m_vecveciPCL2Wiggles[ i ].size( ); ++j ) 00041 m_vecveciPCL2Wiggles[ i ][ j ] = m_vecpWiggles[ i ]->GetGene( PCL.GetGene( j ) ); } } 00042 00043 // SCoalesceModifierCache 00044 00045 void SCoalesceModifierCache::Get( size_t iPCL ) { 00046 size_t i; 00047 00048 m_vecvecsWiggles.resize( m_Modifiers.m_vecpWiggles.size( ) ); 00049 for( i = 0; i < m_vecvecsWiggles.size( ); ++i ) { 00050 m_vecvecsWiggles[ i ].clear( ); 00051 m_Modifiers.m_vecpWiggles[ i ]->Get( m_Modifiers.m_vecveciPCL2Wiggles[ i ][ iPCL ], 00052 m_vecvecsWiggles[ i ] ); } } 00053 00054 void SCoalesceModifierCache::SetType( const std::string& strType ) { 00055 size_t i, j; 00056 00057 m_veciWiggleTypes.resize( m_vecvecsWiggles.size( ) ); 00058 for( i = 0; i < m_vecvecsWiggles.size( ); ++i ) { 00059 for( j = 0; j < m_vecvecsWiggles[ i ].size( ); ++j ) 00060 if( strType == m_vecvecsWiggles[ i ][ j ].m_strType ) 00061 break; 00062 m_veciWiggleTypes[ i ] = ( j < m_vecvecsWiggles[ i ].size( ) ) ? j : -1; } } 00063 00064 void SCoalesceModifierCache::InitializeWeight( size_t iK, size_t iOffset ) { 00065 size_t i, j; 00066 00067 m_dWeight = 0; 00068 for( i = 0; i < m_vecvecsWiggles.size( ); ++i ) 00069 if( ( j = m_veciWiggleTypes[ i ] ) != -1 ) { 00070 const vector<float>& vecdWiggle = m_vecvecsWiggles[ i ][ j ].m_vecdValues; 00071 00072 for( j = 0; ( j < iK ) && ( ( iOffset + j ) < vecdWiggle.size( ) ); ++j ) 00073 m_dWeight += vecdWiggle[ iOffset + j ]; 00074 for( ; j < iK; ++j ) 00075 m_dWeight += 1; } } 00076 00077 void SCoalesceModifierCache::AddWeight( size_t iK, size_t iOffset, size_t iDelta ) { 00078 size_t i, j; 00079 float dSub, dAdd; 00080 00081 for( i = 0; i < m_vecvecsWiggles.size( ); ++i ) 00082 if( ( j = m_veciWiggleTypes[ i ] ) != -1 ) { 00083 const std::vector<float>& vecdWiggle = m_vecvecsWiggles[ i ][ j ].m_vecdValues; 00084 00085 dSub = ( ( iOffset + iDelta ) < vecdWiggle.size( ) ) ? vecdWiggle[ iOffset + iDelta ] : 1; 00086 if( iK ) { 00087 j = iOffset + iDelta + iK; 00088 dAdd = ( j < vecdWiggle.size( ) ) ? vecdWiggle[ j ] : 1; } 00089 else { 00090 dAdd = dSub; 00091 dSub = 0; } 00092 m_dWeight -= dSub; 00093 m_dWeight += dAdd; } } 00094 00095 // SCoalesceDataset 00096 00097 bool SCoalesceDataset::CalculateCovariance( const CPCL& PCL ) { 00098 size_t i, j, k; 00099 vector<bool> vecfDataset; 00100 float dOne, dTwo; 00101 vector<float> vecdAves; 00102 CDataMatrix MatSigma; 00103 vector<size_t> veciIndices, veciCounts; 00104 bool fEven; 00105 00106 if( GetConditions( ) == 1 ) 00107 return false; 00108 00109 m_vecdStdevs.resize( GetConditions( ) ); 00110 MatSigma.Initialize( GetConditions( ), GetConditions( ) ); 00111 MatSigma.Clear( ); 00112 vecdAves.resize( GetConditions( ) ); 00113 veciCounts.resize( GetConditions( ) ); 00114 for( i = 0; i < PCL.GetGenes( ); ++i ) 00115 for( j = 0; j < vecdAves.size( ); ++j ) 00116 if( !CMeta::IsNaN( dOne = PCL.Get( i, GetCondition( j ) ) ) ) { 00117 veciCounts[ j ]++; 00118 vecdAves[ j ] += dOne; } 00119 for( i = 0; i < vecdAves.size( ); ++i ) 00120 if( j = veciCounts[ i ] ) 00121 vecdAves[ i ] /= j; 00122 for( i = 0; i < PCL.GetGenes( ); ++i ) 00123 for( j = 0; j < GetConditions( ); ++j ) { 00124 if( CMeta::IsNaN( dOne = PCL.Get( i, GetCondition( j ) ) ) ) 00125 continue; 00126 dOne -= vecdAves[ j ]; 00127 for( k = j; k < GetConditions( ); ++k ) 00128 if( !CMeta::IsNaN( dTwo = PCL.Get( i, GetCondition( k ) ) ) ) 00129 MatSigma.Get( j, k ) += dOne * ( dTwo - vecdAves[ k ] ); } 00130 for( i = 0; i < MatSigma.GetRows( ); ++i ) { 00131 for( j = i; j < MatSigma.GetColumns( ); ++j ) 00132 MatSigma.Set( j, i, MatSigma.Get( i, j ) /= PCL.GetGenes( ) ); 00133 m_vecdStdevs[ i ] = sqrt( MatSigma.Get( i, i ) ); } 00134 00135 m_MatSigmaChol.Open( MatSigma ); 00136 CStatistics::CholeskyDecomposition( m_MatSigmaChol ); 00137 00138 CStatistics::MatrixLUDecompose( MatSigma, veciIndices, fEven ); 00139 CStatistics::MatrixLUInvert( MatSigma, veciIndices, m_MatSigmaInv ); 00140 m_dSigmaDetSqrt = sqrt( CStatistics::MatrixLUDeterminant( MatSigma, fEven ) ); 00141 00142 return true; } 00143 00144 // SMotifMatch 00145 00146 bool SMotifMatch::Open( std::istream& istm, CCoalesceMotifLibrary& Motifs ) { 00147 string strLine; 00148 vector<string> vecstrLine; 00149 size_t i; 00150 00151 m_vecprstrdKnown.clear( ); 00152 strLine.resize( CFile::GetBufferSize( ) ); 00153 istm.getline( &strLine[ 0 ], strLine.size( ) - 1 ); 00154 CMeta::Tokenize( strLine.c_str( ), vecstrLine ); 00155 if( vecstrLine.size( ) < 3 ) { 00156 g_CatSleipnir( ).error( "SMotifMatch::Open( ) invalid line: %s", strLine.c_str( ) ); 00157 return false; } 00158 m_strType = vecstrLine[ 0 ]; 00159 if( ( m_eSubsequence = CCoalesceSequencerBase::GetSubsequence( vecstrLine[ 1 ] ) ) == 00160 CCoalesceSequencerBase::ESubsequenceEnd ) { 00161 g_CatSleipnir( ).error( "SMotifMatch::Open( ) invalid subsequence: %s", vecstrLine[ 1 ].c_str( ) ); 00162 return false; } 00163 m_dZ = (float)atof( vecstrLine[ 2 ].c_str( ) ); 00164 00165 if( vecstrLine.size( ) > 3 ) { 00166 vector<string> vecstrKnowns; 00167 00168 CMeta::Tokenize( vecstrLine[ 3 ].c_str( ), vecstrKnowns, "|" ); 00169 for( i = 0; i < vecstrKnowns.size( ); ++i ) { 00170 vector<string> vecstrKnown; 00171 00172 CMeta::Tokenize( vecstrKnowns[ i ].c_str( ), vecstrKnown, ":" ); 00173 if( vecstrKnown.size( ) != 2 ) { 00174 g_CatSleipnir( ).error( "SMotifMatch::Open( ) invalid known: %s", vecstrKnowns[ i ].c_str( ) ); 00175 return false; } 00176 m_vecprstrdKnown.push_back( pair<string, float>( vecstrKnown[ 0 ], 00177 (float)atof( vecstrKnown[ 1 ].c_str( ) ) ) ); } } 00178 00179 istm.getline( &strLine[ 0 ], strLine.size( ) - 1 ); 00180 if( ( m_iMotif = Motifs.Open( strLine.c_str( ) ) ) == -1 ) { 00181 g_CatSleipnir( ).error( "SMotifMatch::Open( ) invalid motif: %s", strLine.c_str( ) ); 00182 return false; } 00183 00184 return true; } 00185 00186 uint32_t SMotifMatch::Open( const CHierarchy& Hier, const vector<SMotifMatch>& vecsMotifs, 00187 CCoalesceMotifLibrary& Motifs, size_t& iCount ) { 00188 uint32_t iLeft, iRight; 00189 00190 if( Hier.IsGene( ) ) { 00191 const SMotifMatch& sMotif = vecsMotifs[ Hier.GetID( ) ]; 00192 00193 m_eSubsequence = sMotif.m_eSubsequence; 00194 m_strType = sMotif.m_strType; 00195 m_dZ += sMotif.m_dZ; 00196 iCount++; 00197 return ( m_iMotif = sMotif.m_iMotif ); } 00198 00199 return ( ( ( ( iLeft = Open( Hier.Get( false ), vecsMotifs, Motifs, iCount ) ) == -1 ) || 00200 ( ( iRight = Open( Hier.Get( true ), vecsMotifs, Motifs, iCount ) ) == -1 ) ) ? -1 : 00201 ( m_iMotif = Motifs.Merge( iLeft, iRight, FLT_MAX, true ) ) ); } 00202 00203 uint32_t SMotifMatch::Open( const SMotifMatch& sOne, const SMotifMatch& sTwo, 00204 CCoalesceMotifLibrary& Motifs ) { 00205 00206 m_eSubsequence = sOne.m_eSubsequence; 00207 m_strType = sOne.m_strType; 00208 m_dZ = ( sOne.m_dZ + sTwo.m_dZ ) / 2; 00209 return ( m_iMotif = Motifs.Merge( sOne.m_iMotif, sTwo.m_iMotif, FLT_MAX, true ) ); } 00210 00211 string SMotifMatch::Save( const CCoalesceMotifLibrary* pMotifs, bool fPWM, float dCutoffPWMs, 00212 float dPenaltyGap, float dPenaltyMismatch, bool fNoRCs ) const { 00213 ostringstream ossm; 00214 string strPWM; 00215 size_t i; 00216 00217 ossm << m_strType << '\t' << CCoalesceSequencerBase::GetSubsequence( m_eSubsequence ) << '\t' << m_dZ; 00218 for( i = 0; i < m_vecprstrdKnown.size( ); ++i ) { 00219 const pair<string, float>& prstrdKnown = m_vecprstrdKnown[ i ]; 00220 00221 ossm << ( i ? "|" : "\t" ) << prstrdKnown.first << ':' << prstrdKnown.second; } 00222 ossm << endl; 00223 if( pMotifs ) { 00224 ossm << pMotifs->GetMotif( m_iMotif ); 00225 if( fPWM ) { 00226 if( ( strPWM = pMotifs->GetPWM( m_iMotif, dCutoffPWMs, dPenaltyGap, dPenaltyMismatch, 00227 fNoRCs ) ).empty( ) ) 00228 return ""; 00229 ossm << endl << strPWM; } } 00230 else 00231 ossm << m_iMotif; 00232 00233 return ossm.str( ); } 00234 00235 bool SMotifMatch::Label( const CCoalesceMotifLibrary& Motifs, EType eMatchType, float dPenaltyGap, 00236 float dPenaltyMismatch, float dPValue ) { 00237 00238 m_vecprstrdKnown.clear( ); 00239 return Motifs.GetKnown( m_iMotif, eMatchType, dPenaltyGap, dPenaltyMismatch, m_vecprstrdKnown, dPValue ); } 00240 00241 }