Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "annotation.h" 00024 #include "genome.h" 00025 #include "statistics.h" 00026 00027 namespace Sleipnir { 00028 00029 COntologyImpl::SNode::SNode( ) : m_iParents(0), m_aiParents(NULL), m_iChildren(0), 00030 m_aiChildren(NULL), m_iGenes(0), m_apGenes(NULL), m_iCacheGenes(-1), 00031 m_apCacheGenes(NULL) { } 00032 00033 void COntologyImpl::SNode::Reset( ) { 00034 00035 m_strID.clear( ); 00036 m_iParents = m_iChildren = m_iGenes = 0; 00037 if( m_aiParents ) 00038 delete[] m_aiParents; 00039 if( m_aiChildren ) 00040 delete[] m_aiChildren; 00041 if( m_apGenes ) 00042 delete[] m_apGenes; 00043 if( m_apCacheGenes ) 00044 delete[] m_apCacheGenes; } 00045 00046 COntologyImpl::SParser::SParser( std::istream& istm, CGenome& Genome ) : m_istm(istm), 00047 m_Genome(Genome), m_iLine(0) { 00048 00049 m_szLine[ 0 ] = 0; } 00050 00051 bool COntologyImpl::SParser::GetLine( ) { 00052 00053 m_iLine++; 00054 m_istm.getline( m_szLine, c_iBuffer - 1 ); 00055 g_CatSleipnir( ).debug( "COntologyImpl::SParser::GetLine( ) %s", m_szLine ); 00056 return true; } 00057 00058 bool COntologyImpl::SParser::IsStart( const char* szStart ) const { 00059 00060 return !strncmp( m_szLine, szStart, strlen( szStart ) ); } 00061 00062 const CGene& COntologyImpl::GetGene( size_t iNode, size_t iGene ) const { 00063 size_t i; 00064 00065 if( iGene < ( i = m_aNodes[ iNode ].m_iGenes ) ) 00066 return *m_aNodes[ iNode ].m_apGenes[ iGene ]; 00067 00068 CollectGenes( iNode ); 00069 return *m_aNodes[ iNode ].m_apCacheGenes[ iGene - i ]; } 00070 00071 void COntologyImpl::Reset( ) { 00072 size_t i; 00073 00074 m_mapNodes.clear( ); 00075 if( !m_aNodes ) 00076 return; 00077 00078 for( i = 0; i < m_iNodes; ++i ) 00079 m_aNodes[ i ].Reset( ); 00080 m_iNodes = 0; 00081 delete[] m_aNodes; 00082 m_aNodes = NULL; } 00083 00084 bool COntologyImpl::IsAnnotated( size_t iNode, const CGene& Gene, bool fKids ) const { 00085 size_t i; 00086 00087 for( i = 0; i < m_aNodes[ iNode ].m_iGenes; ++i ) 00088 if( m_aNodes[ iNode ].m_apGenes[ i ] == &Gene ) 00089 return true; 00090 if( fKids ) { 00091 CollectGenes( iNode ); 00092 for( i = 0; i < m_aNodes[ iNode ].m_iCacheGenes; ++i ) 00093 if( m_aNodes[ iNode ].m_apCacheGenes[ i ] == &Gene ) 00094 return true; } 00095 00096 return false; } 00097 00098 void COntologyImpl::CollectGenes( size_t iNode, TSetPGenes& setpGenes ) { 00099 SNode& sNode = m_aNodes[ iNode ]; 00100 size_t i; 00101 TSetPGenes setpSelf, setpKids; 00102 TSetPGenes::const_iterator iterGenes; 00103 00104 if( sNode.m_iCacheGenes != -1 ) { 00105 for( i = 0; i < sNode.m_iGenes; ++i ) 00106 setpGenes.insert( sNode.m_apGenes[ i ] ); 00107 for( i = 0; i < sNode.m_iCacheGenes; ++i ) 00108 setpGenes.insert( sNode.m_apCacheGenes[ i ] ); 00109 return; } 00110 00111 for( i = 0; i < sNode.m_iGenes; ++i ) { 00112 setpSelf.insert( sNode.m_apGenes[ i ] ); 00113 setpGenes.insert( sNode.m_apGenes[ i ] ); } 00114 for( i = 0; i < sNode.m_iChildren; ++i ) 00115 CollectGenes( sNode.m_aiChildren[ i ], setpKids ); 00116 sNode.m_iCacheGenes = 0; 00117 for( iterGenes = setpKids.begin( ); iterGenes != setpKids.end( ); ++iterGenes ) 00118 if( setpSelf.find( *iterGenes ) == setpSelf.end( ) ) 00119 sNode.m_iCacheGenes++; 00120 if( sNode.m_iCacheGenes ) { 00121 sNode.m_apCacheGenes = new const CGene*[ sNode.m_iCacheGenes ]; 00122 for( i = 0,iterGenes = setpKids.begin( ); iterGenes != setpKids.end( ); ++iterGenes ) 00123 if( setpSelf.find( *iterGenes ) == setpSelf.end( ) ) { 00124 sNode.m_apCacheGenes[ i++ ] = *iterGenes; 00125 setpGenes.insert( *iterGenes ); } } } 00126 00127 size_t COntologyImpl::GetNode( const std::string& strID ) const { 00128 TMapStrI::const_iterator iterNode; 00129 00130 iterNode = m_mapNodes.find( strID ); 00131 return ( ( iterNode == m_mapNodes.end( ) ) ? -1 : iterNode->second ); } 00132 00133 void COntologyImpl::GetGeneNames( std::vector<std::string>& vecstrGenes ) const { 00134 set<string> setstrGenes; 00135 set<string>::const_iterator iterGene; 00136 size_t i, j; 00137 00138 for( i = 0; i < m_iNodes; ++i ) 00139 for( j = 0; j < m_aNodes[ i ].m_iGenes; ++j ) 00140 setstrGenes.insert( m_aNodes[ i ].m_apGenes[ j ]->GetName( ) ); 00141 00142 for( iterGene = setstrGenes.begin( ); iterGene != setstrGenes.end( ); ++iterGene ) 00143 vecstrGenes.push_back( *iterGene ); } 00144 00145 void COntologyImpl::TermFinder( const CGenes& Genes, vector<STermFound>& vecsTerms, bool fBon, 00146 bool fKids, bool fBack, float dPValue, const CGenes* pBkg ) const { 00147 size_t i, j, iMult, iBkg, iGenes, iGiven; 00148 double d; 00149 vector<size_t> veciAnno; 00150 00151 for( iGiven = i = 0; i < Genes.GetGenes( ); ++i ) 00152 if( Genes.GetGene( i ).IsAnnotated( m_pOntology ) ) 00153 iGiven++; 00154 iBkg = pBkg ? pBkg->GetGenes( ) : 00155 ( fBack ? Genes.GetGenome( ).GetGenes( ) : Genes.GetGenome( ).CountGenes( m_pOntology ) ); 00156 veciAnno.resize( m_iNodes ); 00157 for( iMult = i = 0; i < veciAnno.size( ); ++i ) 00158 if( veciAnno[ i ] = Genes.CountAnnotations( m_pOntology, i, fKids, pBkg ) ) 00159 iMult++; 00160 for( i = 0; i < m_iNodes; ++i ) { 00161 if( !veciAnno[ i ] ) 00162 continue; 00163 if( pBkg ) { 00164 for( iGenes = j = 0; j < pBkg->GetGenes( ); ++j ) 00165 if( IsAnnotated( i, pBkg->GetGene( j ), fKids ) ) 00166 iGenes++; } 00167 else 00168 iGenes = GetGenes( i, fKids ); 00169 d = CStatistics::HypergeometricCDF( veciAnno[ i ], iGiven, iGenes, iBkg ); 00170 if( fBon ) { 00171 if( ( d *= iMult ) > 1 ) 00172 d = 1; } 00173 if( d <= dPValue ) 00174 vecsTerms.push_back( STermFound( i, d, veciAnno[ i ], iGiven, iGenes, iBkg ) ); } } 00175 00176 void CSlimImpl::Reset( const IOntology* pOntology ) { 00177 00178 m_pOntology = pOntology; 00179 m_vecstrSlims.clear( ); 00180 m_vecveciTerms.clear( ); 00181 m_vecvecpGenes.clear( ); } 00182 00208 bool CSlim::Open( std::istream& istmSlim, const IOntology* pOntology ) { 00209 static const size_t c_iBuffer = 1024; 00210 char szBuf[ c_iBuffer ]; 00211 size_t i, j, k, iNode; 00212 string str; 00213 set<const CGene*> setiGenes; 00214 set<const CGene*>::const_iterator iterGene; 00215 00216 g_CatSleipnir( ).info( "CSlim::Open( %s )", pOntology->GetID( ).c_str( ) ); 00217 00218 Reset( pOntology ); 00219 while( istmSlim.peek( ) != EOF ) { 00220 i = m_vecveciTerms.size( ); 00221 m_vecveciTerms.resize( i + 1 ); 00222 istmSlim.getline( szBuf, c_iBuffer - 1 ); 00223 { 00224 istringstream issm( szBuf ); 00225 00226 while( issm.peek( ) != EOF ) { 00227 str = OpenToken( issm ); 00228 cout << str << endl; 00229 if( !str.length( ) ) 00230 break; 00231 if( m_vecstrSlims.size( ) <= i ) 00232 m_vecstrSlims.push_back( str ); 00233 else { 00234 if( ( j = m_pOntology->GetNode( str ) ) == -1 ) { 00235 g_CatSleipnir( ).error( "CSlim::Open( %s ) unknown node: %s", 00236 m_pOntology->GetID( ).c_str( ), str.c_str( ) ); 00237 return false; } 00238 m_vecveciTerms[ i ].push_back( j ); } } 00239 } } 00240 00241 m_vecvecpGenes.resize( m_vecveciTerms.size( ) ); 00242 for( i = 0; i < m_vecveciTerms.size( ); ++i ) { 00243 setiGenes.clear( ); 00244 for( j = 0; j < m_vecveciTerms[ i ].size( ); ++j ) { 00245 iNode = m_vecveciTerms[ i ][ j ]; 00246 for( k = 0; k < m_pOntology->GetGenes( iNode, true ); ++k ) 00247 setiGenes.insert( &m_pOntology->GetGene( iNode, k ) ); } 00248 for( iterGene = setiGenes.begin( ); iterGene != setiGenes.end( ); ++iterGene ) 00249 m_vecvecpGenes[ i ].push_back( *iterGene ); } 00250 00251 return true; } 00252 00260 void CSlim::GetGeneNames( std::vector<std::string>& vecstrGenes ) const { 00261 set<const CGene*> setpGenes; 00262 set<const CGene*>::const_iterator iterGene; 00263 size_t i, j; 00264 00265 for( i = 0; i < m_vecvecpGenes.size( ); ++i ) 00266 for( j = 0; j < m_vecvecpGenes[ i ].size( ); ++j ) 00267 setpGenes.insert( m_vecvecpGenes[ i ][ j ] ); 00268 00269 for( iterGene = setpGenes.begin( ); iterGene != setpGenes.end( ); ++iterGene ) 00270 vecstrGenes.push_back( (*iterGene)->GetName( ) ); } 00271 00272 }