Sleipnir
src/coalescestructs.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "coalescestructsi.h"
00024 #include "coalescemotifs.h"
00025 #include "fasta.h"
00026 #include "pcl.h"
00027 #include "statistics.h"
00028 #include "clusthierarchical.h"
00029 
00030 namespace Sleipnir {
00031 
00032 // SCoalesceModifiers
00033 
00034 void SCoalesceModifiers::Initialize( const CPCL& PCL ) {
00035     size_t  i, j;
00036 
00037     m_vecveciPCL2Wiggles.resize( m_vecpWiggles.size( ) );
00038     for( i = 0; i < m_vecveciPCL2Wiggles.size( ); ++i ) {
00039         m_vecveciPCL2Wiggles[ i ].resize( PCL.GetGenes( ) );
00040         for( j = 0; j < m_vecveciPCL2Wiggles[ i ].size( ); ++j )
00041             m_vecveciPCL2Wiggles[ i ][ j ] = m_vecpWiggles[ i ]->GetGene( PCL.GetGene( j ) ); } }
00042 
00043 // SCoalesceModifierCache
00044 
00045 void SCoalesceModifierCache::Get( size_t iPCL ) {
00046     size_t  i;
00047 
00048     m_vecvecsWiggles.resize( m_Modifiers.m_vecpWiggles.size( ) );
00049     for( i = 0; i < m_vecvecsWiggles.size( ); ++i ) {
00050         m_vecvecsWiggles[ i ].clear( );
00051         m_Modifiers.m_vecpWiggles[ i ]->Get( m_Modifiers.m_vecveciPCL2Wiggles[ i ][ iPCL ],
00052             m_vecvecsWiggles[ i ] ); } }
00053 
00054 void SCoalesceModifierCache::SetType( const std::string& strType ) {
00055     size_t  i, j;
00056 
00057     m_veciWiggleTypes.resize( m_vecvecsWiggles.size( ) );
00058     for( i = 0; i < m_vecvecsWiggles.size( ); ++i ) {
00059         for( j = 0; j < m_vecvecsWiggles[ i ].size( ); ++j )
00060             if( strType == m_vecvecsWiggles[ i ][ j ].m_strType )
00061                 break;
00062         m_veciWiggleTypes[ i ] = ( j < m_vecvecsWiggles[ i ].size( ) ) ? j : -1; } }
00063 
00064 void SCoalesceModifierCache::InitializeWeight( size_t iK, size_t iOffset ) {
00065     size_t  i, j;
00066 
00067     m_dWeight = 0;
00068     for( i = 0; i < m_vecvecsWiggles.size( ); ++i )
00069         if( ( j = m_veciWiggleTypes[ i ] ) != -1 ) {
00070             const vector<float>&    vecdWiggle  = m_vecvecsWiggles[ i ][ j ].m_vecdValues;
00071 
00072             for( j = 0; ( j < iK ) && ( ( iOffset + j ) < vecdWiggle.size( ) ); ++j )
00073                 m_dWeight += vecdWiggle[ iOffset + j ];
00074             for( ; j < iK; ++j )
00075                 m_dWeight += 1; } }
00076 
00077 void SCoalesceModifierCache::AddWeight( size_t iK, size_t iOffset, size_t iDelta ) {
00078     size_t  i, j;
00079     float   dSub, dAdd;
00080 
00081     for( i = 0; i < m_vecvecsWiggles.size( ); ++i )
00082         if( ( j = m_veciWiggleTypes[ i ] ) != -1 ) {
00083             const std::vector<float>&   vecdWiggle  = m_vecvecsWiggles[ i ][ j ].m_vecdValues;
00084 
00085             dSub = ( ( iOffset + iDelta ) < vecdWiggle.size( ) ) ? vecdWiggle[ iOffset + iDelta ] : 1;
00086             if( iK ) {
00087                 j = iOffset + iDelta + iK;
00088                 dAdd = ( j < vecdWiggle.size( ) ) ? vecdWiggle[ j ] : 1; }
00089             else {
00090                 dAdd = dSub;
00091                 dSub = 0; }
00092             m_dWeight -= dSub;
00093             m_dWeight += dAdd; } }
00094 
00095 // SCoalesceDataset
00096 
00097 bool SCoalesceDataset::CalculateCovariance( const CPCL& PCL ) {
00098     size_t          i, j, k;
00099     vector<bool>    vecfDataset;
00100     float           dOne, dTwo;
00101     vector<float>   vecdAves;
00102     CDataMatrix     MatSigma;
00103     vector<size_t>  veciIndices, veciCounts;
00104     bool            fEven;
00105 
00106     if( GetConditions( ) == 1 )
00107         return false;
00108 
00109     m_vecdStdevs.resize( GetConditions( ) );
00110     MatSigma.Initialize( GetConditions( ), GetConditions( ) );
00111     MatSigma.Clear( );
00112     vecdAves.resize( GetConditions( ) );
00113     veciCounts.resize( GetConditions( ) );
00114     for( i = 0; i < PCL.GetGenes( ); ++i )
00115         for( j = 0; j < vecdAves.size( ); ++j )
00116             if( !CMeta::IsNaN( dOne = PCL.Get( i, GetCondition( j ) ) ) ) {
00117                 veciCounts[ j ]++;
00118                 vecdAves[ j ] += dOne; }
00119     for( i = 0; i < vecdAves.size( ); ++i )
00120         if( j = veciCounts[ i ] )
00121             vecdAves[ i ] /= j;
00122     for( i = 0; i < PCL.GetGenes( ); ++i )
00123         for( j = 0; j < GetConditions( ); ++j ) {
00124             if( CMeta::IsNaN( dOne = PCL.Get( i, GetCondition( j ) ) ) )
00125                 continue;
00126             dOne -= vecdAves[ j ];
00127             for( k = j; k < GetConditions( ); ++k )
00128                 if( !CMeta::IsNaN( dTwo = PCL.Get( i, GetCondition( k ) ) ) )
00129                     MatSigma.Get( j, k ) += dOne * ( dTwo - vecdAves[ k ] ); }
00130     for( i = 0; i < MatSigma.GetRows( ); ++i ) {
00131         for( j = i; j < MatSigma.GetColumns( ); ++j )
00132             MatSigma.Set( j, i, MatSigma.Get( i, j ) /= PCL.GetGenes( ) );
00133         m_vecdStdevs[ i ] = sqrt( MatSigma.Get( i, i ) ); }
00134 
00135     m_MatSigmaChol.Open( MatSigma );
00136     CStatistics::CholeskyDecomposition( m_MatSigmaChol );
00137 
00138     CStatistics::MatrixLUDecompose( MatSigma, veciIndices, fEven );
00139     CStatistics::MatrixLUInvert( MatSigma, veciIndices, m_MatSigmaInv );
00140     m_dSigmaDetSqrt = sqrt( CStatistics::MatrixLUDeterminant( MatSigma, fEven ) );
00141 
00142     return true; }
00143 
00144 // SMotifMatch
00145 
00146 bool SMotifMatch::Open( std::istream& istm, CCoalesceMotifLibrary& Motifs ) {
00147     string          strLine;
00148     vector<string>  vecstrLine;
00149     size_t          i;
00150 
00151     m_vecprstrdKnown.clear( );
00152     strLine.resize( CFile::GetBufferSize( ) );
00153     istm.getline( &strLine[ 0 ], strLine.size( ) - 1 );
00154     CMeta::Tokenize( strLine.c_str( ), vecstrLine );
00155     if( vecstrLine.size( ) < 3 ) {
00156         g_CatSleipnir( ).error( "SMotifMatch::Open( ) invalid line: %s", strLine.c_str( ) );
00157         return false; }
00158     m_strType = vecstrLine[ 0 ];
00159     if( ( m_eSubsequence = CCoalesceSequencerBase::GetSubsequence( vecstrLine[ 1 ] ) ) ==
00160         CCoalesceSequencerBase::ESubsequenceEnd ) {
00161         g_CatSleipnir( ).error( "SMotifMatch::Open( ) invalid subsequence: %s", vecstrLine[ 1 ].c_str( ) );
00162         return false; }
00163     m_dZ = (float)atof( vecstrLine[ 2 ].c_str( ) );
00164 
00165     if( vecstrLine.size( ) > 3 ) {
00166         vector<string>  vecstrKnowns;
00167 
00168         CMeta::Tokenize( vecstrLine[ 3 ].c_str( ), vecstrKnowns, "|" );
00169         for( i = 0; i < vecstrKnowns.size( ); ++i ) {
00170             vector<string>  vecstrKnown;
00171 
00172             CMeta::Tokenize( vecstrKnowns[ i ].c_str( ), vecstrKnown, ":" );
00173             if( vecstrKnown.size( ) != 2 ) {
00174                 g_CatSleipnir( ).error( "SMotifMatch::Open( ) invalid known: %s", vecstrKnowns[ i ].c_str( ) );
00175                 return false; }
00176             m_vecprstrdKnown.push_back( pair<string, float>( vecstrKnown[ 0 ],
00177                 (float)atof( vecstrKnown[ 1 ].c_str( ) ) ) ); } }
00178 
00179     istm.getline( &strLine[ 0 ], strLine.size( ) - 1 );
00180     if( ( m_iMotif = Motifs.Open( strLine.c_str( ) ) ) == -1 ) {
00181         g_CatSleipnir( ).error( "SMotifMatch::Open( ) invalid motif: %s", strLine.c_str( ) );
00182         return false; }
00183 
00184     return true; }
00185 
00186 uint32_t SMotifMatch::Open( const CHierarchy& Hier, const vector<SMotifMatch>& vecsMotifs,
00187     CCoalesceMotifLibrary& Motifs, size_t& iCount ) {
00188     uint32_t    iLeft, iRight;
00189 
00190     if( Hier.IsGene( ) ) {
00191         const SMotifMatch&  sMotif  = vecsMotifs[ Hier.GetID( ) ];
00192 
00193         m_eSubsequence = sMotif.m_eSubsequence;
00194         m_strType = sMotif.m_strType;
00195         m_dZ += sMotif.m_dZ;
00196         iCount++;
00197         return ( m_iMotif = sMotif.m_iMotif ); }
00198 
00199     return ( ( ( ( iLeft = Open( Hier.Get( false ), vecsMotifs, Motifs, iCount ) ) == -1 ) ||
00200         ( ( iRight = Open( Hier.Get( true ), vecsMotifs, Motifs, iCount ) ) == -1 ) ) ? -1 :
00201         ( m_iMotif = Motifs.Merge( iLeft, iRight, FLT_MAX, true ) ) ); }
00202 
00203 uint32_t SMotifMatch::Open( const SMotifMatch& sOne, const SMotifMatch& sTwo,
00204     CCoalesceMotifLibrary& Motifs ) {
00205 
00206     m_eSubsequence = sOne.m_eSubsequence;
00207     m_strType = sOne.m_strType;
00208     m_dZ = ( sOne.m_dZ + sTwo.m_dZ ) / 2;
00209     return ( m_iMotif = Motifs.Merge( sOne.m_iMotif, sTwo.m_iMotif, FLT_MAX, true ) ); }
00210 
00211 string SMotifMatch::Save( const CCoalesceMotifLibrary* pMotifs, bool fPWM, float dCutoffPWMs,
00212     float dPenaltyGap, float dPenaltyMismatch, bool fNoRCs ) const {
00213     ostringstream   ossm;
00214     string          strPWM;
00215     size_t          i;
00216 
00217     ossm << m_strType << '\t' << CCoalesceSequencerBase::GetSubsequence( m_eSubsequence ) << '\t' << m_dZ;
00218     for( i = 0; i < m_vecprstrdKnown.size( ); ++i ) {
00219         const pair<string, float>&  prstrdKnown = m_vecprstrdKnown[ i ];
00220 
00221         ossm << ( i ? "|" : "\t" ) << prstrdKnown.first << ':' << prstrdKnown.second; }
00222     ossm << endl;
00223     if( pMotifs ) {
00224         ossm << pMotifs->GetMotif( m_iMotif );
00225         if( fPWM ) {
00226             if( ( strPWM = pMotifs->GetPWM( m_iMotif, dCutoffPWMs, dPenaltyGap, dPenaltyMismatch,
00227                 fNoRCs ) ).empty( ) )
00228                 return "";
00229             ossm << endl << strPWM; } }
00230     else
00231         ossm << m_iMotif;
00232 
00233     return ossm.str( ); }
00234 
00235 bool SMotifMatch::Label( const CCoalesceMotifLibrary& Motifs, EType eMatchType, float dPenaltyGap,
00236     float dPenaltyMismatch, float dPValue ) {
00237 
00238     m_vecprstrdKnown.clear( );
00239     return Motifs.GetKnown( m_iMotif, eMatchType, dPenaltyGap, dPenaltyMismatch, m_vecprstrdKnown, dPValue ); }
00240 
00241 }