Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "annotation.h" 00024 #include "genome.h" 00025 #include "meta.h" 00026 00027 namespace Sleipnir { 00028 00029 COntologyKEGG::COntologyKEGG( ) { 00030 00031 m_pOntology = this; } 00032 00069 bool COntologyKEGG::Open( std::istream& istm, CGenome& Genome, const std::string& strOrganism, 00070 bool fSynonyms ) { 00071 SParserKEGG sParser( istm, Genome, strOrganism, fSynonyms ); 00072 size_t i, j, iNode; 00073 TMapStrI::const_iterator iterNode; 00074 vector<set<CGene*> > vecsetpGenes; 00075 set<CGene*>::iterator iterGene; 00076 00077 g_CatSleipnir( ).info( "COntologyKEGG::Open( %s )", strOrganism.c_str( ) ); 00078 Reset( ); 00079 sParser.GetLine( ); 00080 while( istm.peek( ) != EOF ) { 00081 if( !COntologyKEGGImpl::Open( sParser ) ) 00082 return false; 00083 for( i = 0; i < sParser.m_vecstrIDs.size( ); ++i ) { 00084 if( ( iterNode = m_mapNodes.find( sParser.m_vecstrIDs[ i ] ) ) == 00085 m_mapNodes.end( ) ) 00086 m_mapNodes[ sParser.m_vecstrIDs[ i ] ] = iNode = m_mapNodes.size( ); 00087 else 00088 iNode = iterNode->second; 00089 if( vecsetpGenes.size( ) <= iNode ) 00090 vecsetpGenes.resize( iNode + 1 ); 00091 for( j = 0; j < sParser.m_vecpGenes.size( ); ++j ) 00092 vecsetpGenes[ iNode ].insert( sParser.m_vecpGenes[ j ] ); } } 00093 00094 m_aNodes = new SNode[ m_iNodes = vecsetpGenes.size( ) ]; 00095 for( iterNode = m_mapNodes.begin( ); iterNode != m_mapNodes.end( ); ++iterNode ) { 00096 i = iterNode->second; 00097 m_aNodes[ i ].m_strID = iterNode->first; 00098 m_aNodes[ i ].m_strGloss = sParser.m_mapGlosses[ iterNode->first ]; 00099 m_aNodes[ i ].m_iGenes = vecsetpGenes[ i ].size( ); 00100 m_aNodes[ i ].m_apGenes = new const CGene*[ m_aNodes[ i ].m_iGenes ]; 00101 for( j = 0,iterGene = vecsetpGenes[ i ].begin( ); 00102 iterGene != vecsetpGenes[ i ].end( ); ++j,++iterGene ) { 00103 (*iterGene)->AddAnnotation( this, i ); 00104 m_aNodes[ i ].m_apGenes[ j ] = *iterGene; } } 00105 00106 return true; } 00107 00108 const char COntologyKEGGImpl::c_szKEGG[] = "KEGG"; 00109 const char COntologyKEGGImpl::c_szEntry[] = "ENTRY"; 00110 const char COntologyKEGGImpl::c_szName[] = "NAME"; 00111 const char COntologyKEGGImpl::c_szDefinition[] = "DEFINITION"; 00112 const char COntologyKEGGImpl::c_szClass[] = "CLASS"; 00113 const char COntologyKEGGImpl::c_szPath[] = "PATH:"; 00114 const char COntologyKEGGImpl::c_szReference[] = "REFERENCE"; 00115 const char COntologyKEGGImpl::c_szDisease[] = "DISEASE"; 00116 const char COntologyKEGGImpl::c_szPathway[] = "PATHWAY"; 00117 const char COntologyKEGGImpl::c_szModule[] = "MODULE"; 00118 const char COntologyKEGGImpl::c_szBR[] = "BR:"; 00119 const char COntologyKEGGImpl::c_szDBLinks[] = "DBLINKS"; 00120 const char COntologyKEGGImpl::c_szGenes[] = "GENES"; 00121 const char COntologyKEGGImpl::c_szEnd[] = "///"; 00122 00123 COntologyKEGGImpl::SParserKEGG::SParserKEGG( std::istream& istm, CGenome& Genome, 00124 const std::string& strOrganism, bool fSynonyms ) : m_fSynonyms(fSynonyms), m_fOrganism(false), 00125 m_strOrganism(strOrganism), SParser( istm, Genome ) { } 00126 00127 void COntologyKEGGImpl::SParserKEGG::Reset( ) { 00128 00129 m_vecpGenes.clear( ); 00130 m_vecstrIDs.clear( ); } 00131 00132 COntologyKEGGImpl::COntologyKEGGImpl( ) : COntologyImpl( c_szKEGG ) { } 00133 00134 bool COntologyKEGGImpl::Open( SParserKEGG& sParser ) { 00135 00136 sParser.Reset( ); 00137 return ( OpenEntry( sParser ) && OpenName( sParser ) && 00138 OpenDefinition( sParser ) && OpenPathway( sParser ) && 00139 OpenModule( sParser ) && OpenDisease( sParser ) && 00140 OpenClass( sParser ) && OpenDBLinks( sParser ) && 00141 OpenGenes( sParser ) && OpenReferences( sParser ) && 00142 OpenEnd( sParser ) ); } 00143 00144 bool COntologyKEGGImpl::OpenEntry( SParserKEGG& sParser ) { 00145 00146 return ( sParser.IsStart( c_szEntry ) && sParser.GetLine( ) ); } 00147 00148 bool COntologyKEGGImpl::OpenName( SParserKEGG& sParser ) { 00149 00150 g_CatSleipnir( ).debug( "COntologyKEGGImpl::OpenName( ) %s", sParser.m_szLine ); 00151 return ( sParser.IsStart( c_szName ) ? sParser.GetLine( ) : true ); } 00152 00153 bool COntologyKEGGImpl::OpenPathway( SParserKEGG& sParser ) { 00154 00155 if( !sParser.IsStart( c_szPathway ) ) 00156 return true; 00157 00158 do 00159 if( !sParser.GetLine( ) ) 00160 return false; 00161 while( isspace( sParser.m_szLine[ 0 ] ) ); 00162 00163 return true; } 00164 00165 bool COntologyKEGGImpl::OpenReferences( SParserKEGG& sParser ) { 00166 00167 while( OpenReference( sParser ) ); 00168 00169 return true; } 00170 00171 bool COntologyKEGGImpl::OpenReference( SParserKEGG& sParser ) { 00172 00173 if( !sParser.IsStart( c_szReference ) ) 00174 return false; 00175 00176 do 00177 if( !sParser.GetLine( ) ) 00178 return false; 00179 while( isspace( sParser.m_szLine[ 0 ] ) ); 00180 00181 return true; } 00182 00183 bool COntologyKEGGImpl::OpenDisease( SParserKEGG& sParser ) { 00184 00185 if( !sParser.IsStart( c_szDisease ) ) 00186 return true; 00187 00188 do 00189 if( !sParser.GetLine( ) ) 00190 return false; 00191 while( isspace( sParser.m_szLine[ 0 ] ) ); 00192 00193 return true; } 00194 00195 bool COntologyKEGGImpl::OpenModule( SParserKEGG& sParser ) { 00196 00197 if( !sParser.IsStart( c_szModule ) ) 00198 return true; 00199 00200 do 00201 if( !sParser.GetLine( ) ) 00202 return false; 00203 while( isspace( sParser.m_szLine[ 0 ] ) ); 00204 00205 return true; } 00206 00207 bool COntologyKEGGImpl::OpenDefinition( SParserKEGG& sParser ) { 00208 00209 if( !sParser.IsStart( c_szDefinition ) ) 00210 return true; 00211 00212 do 00213 if( !sParser.GetLine( ) ) 00214 return false; 00215 while( isspace( sParser.m_szLine[ 0 ] ) ); 00216 00217 return true; } 00218 00219 bool COntologyKEGGImpl::OpenClass( SParserKEGG& sParser ) { 00220 size_t i; 00221 00222 if( !sParser.IsStart( c_szClass ) ) 00223 return false; 00224 00225 sParser.m_strGloss.clear( ); 00226 i = strlen( c_szClass ); 00227 memmove( sParser.m_szLine, sParser.m_szLine + i, strlen( sParser.m_szLine ) - i + 1 ); 00228 sParser.m_fPathing = false; 00229 do 00230 if( !OpenGloss( sParser ) ) 00231 return false; 00232 while( isspace( sParser.m_szLine[ 0 ] ) ); 00233 00234 return true; } 00235 00236 bool COntologyKEGGImpl::OpenGloss( SParserKEGG& sParser ) { 00237 char* pchStartGloss; 00238 char* pchEndGloss; 00239 char* pchStartPath; 00240 char* pchEndPath; 00241 vector<string> vecstrIDs; 00242 size_t i; 00243 00244 for( pchStartGloss = sParser.m_szLine; isspace( *pchStartGloss ); ++pchStartGloss ); 00245 if( ( pchEndGloss = strstr( pchStartGloss, c_szPath ) ) || 00246 ( pchEndGloss = strstr( pchStartGloss, c_szBR ) ) ) { 00247 pchStartPath = pchEndGloss + ( strncmp( pchEndGloss, c_szBR, strlen( c_szBR ) ) ? 00248 strlen( c_szPath ) : strlen( c_szBR ) ); 00249 if( !( pchEndPath = strchr( pchStartPath, ']' ) ) ) 00250 return false; 00251 *pchEndPath = 0; 00252 CMeta::Tokenize( pchStartPath, vecstrIDs, " ", true ); 00253 for( i = 0; i < vecstrIDs.size( ); ++i ) 00254 sParser.m_vecstrIDs.push_back( vecstrIDs[ i ] ); 00255 if( pchEndGloss > ( pchStartGloss + 1 ) ) { 00256 *( pchEndGloss - 1 ) = 0; 00257 if( sParser.m_fPathing ) 00258 sParser.m_strGloss.clear( ); 00259 else if( sParser.m_strGloss.length( ) ) 00260 sParser.m_strGloss += ' '; 00261 sParser.m_strGloss += pchStartGloss; } 00262 sParser.m_fPathing = true; 00263 for( i = 0; i < vecstrIDs.size( ); ++i ) 00264 sParser.m_mapGlosses[ vecstrIDs[ i ] ] = sParser.m_strGloss; } 00265 else { 00266 if( sParser.m_fPathing ) { 00267 sParser.m_fPathing = false; 00268 sParser.m_strGloss.clear( ); } 00269 else if( sParser.m_strGloss.length( ) ) 00270 sParser.m_strGloss += ' '; 00271 sParser.m_strGloss += pchStartGloss; } 00272 00273 return sParser.GetLine( ); } 00274 00275 bool COntologyKEGGImpl::OpenDBLinks( SParserKEGG& sParser ) { 00276 00277 if( !sParser.IsStart( c_szDBLinks ) ) 00278 return true; 00279 00280 do 00281 if( !sParser.GetLine( ) ) 00282 return false; 00283 while( isspace( sParser.m_szLine[ 0 ] ) ); 00284 00285 return true; } 00286 00287 bool COntologyKEGGImpl::OpenGenes( SParserKEGG& sParser ) { 00288 size_t i; 00289 00290 if( !sParser.IsStart( c_szGenes ) ) 00291 return true; 00292 00293 i = strlen( c_szGenes ); 00294 memmove( sParser.m_szLine, sParser.m_szLine + i, strlen( sParser.m_szLine ) - i + 1 ); 00295 do 00296 if( !OpenOrganism( sParser ) ) 00297 return false; 00298 while( isspace( sParser.m_szLine[ 0 ] ) ); 00299 00300 return true; } 00301 00302 bool COntologyKEGGImpl::OpenOrganism( SParserKEGG& sParser ) { 00303 char* pch; 00304 size_t i; 00305 00306 for( pch = sParser.m_szLine; *pch && isspace( *pch ); ++pch ); 00307 if( !*pch ) 00308 return false; 00309 00310 if( sParser.m_fOrganism ) { 00311 if( ( strlen( pch ) > 3 ) && ( pch[ 3 ] == ':' ) ) 00312 sParser.m_fOrganism = false; } 00313 else if( !strncmp( pch, ( sParser.m_strOrganism + ':' ).c_str( ), 00314 i = ( sParser.m_strOrganism.length( ) + 1 ) ) ) { 00315 sParser.m_fOrganism = true; 00316 pch += i + 1; } 00317 if( sParser.m_fOrganism ) 00318 while( *pch ) 00319 pch = OpenGene( sParser, pch ); 00320 00321 return sParser.GetLine( ); } 00322 00323 char* COntologyKEGGImpl::OpenGene( SParserKEGG& sParser, char* pch ) { 00324 char* pchEnd; 00325 char* pchSyn; 00326 bool fInc, fSyn; 00327 CGene* pGene; 00328 string strName; 00329 vector<string> vecstrSynonyms; 00330 size_t i; 00331 00332 for( pchEnd = pch; *pchEnd && !isspace( *pchEnd ) && ( *pchEnd != '(' ); ++pchEnd ); 00333 if( fInc = !!*pchEnd ) 00334 fSyn = ( *pchEnd == '(' ); 00335 *pchEnd = 0; 00336 strName = pch; 00337 00338 if( fInc ) { 00339 ++pchEnd; 00340 if( fSyn ) { 00341 pchSyn = pchEnd; 00342 for( ; *pchEnd != ')'; ++pchEnd ); 00343 *(pchEnd++) = 0; 00344 vecstrSynonyms.push_back( pchSyn ); } } 00345 for( ; *pchEnd && !isspace( *pchEnd ); ++pchEnd ); 00346 if( isspace( *pchEnd ) ) 00347 ++pchEnd; 00348 00349 pGene = &sParser.m_Genome.AddGene( ( sParser.m_fSynonyms && !vecstrSynonyms.empty( ) ) ? 00350 vecstrSynonyms[ 0 ] : strName ); 00351 if( sParser.m_fSynonyms ) 00352 sParser.m_Genome.AddSynonym( *pGene, strName ); 00353 for( i = 0; i < vecstrSynonyms.size( ); ++i ) 00354 sParser.m_Genome.AddSynonym( *pGene, vecstrSynonyms[ i ] ); 00355 sParser.m_vecpGenes.push_back( pGene ); 00356 00357 return pchEnd; } 00358 00359 bool COntologyKEGGImpl::OpenEnd( SParserKEGG& sParser ) { 00360 00361 return ( sParser.IsStart( c_szEnd ) && sParser.GetLine( ) ); } 00362 00363 }