Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "annotation.h" 00024 #include "genome.h" 00025 #include "meta.h" 00026 00027 namespace Sleipnir { 00028 00029 const char COntologyOBO::c_szBiologicalProcess[] = "biological_process"; 00030 const char COntologyOBO::c_szCellularComponent[] = "cellular_component"; 00031 const char COntologyOBO::c_szMolecularFunction[] = "molecular_function"; 00032 00033 const char COntologyOBOImpl::c_szAltID[] = "alt_id: "; 00034 const char COntologyOBOImpl::c_szID[] = "id: "; 00035 const char COntologyOBOImpl::c_szIsA[] = "is_a: "; 00036 const char COntologyOBOImpl::c_szIsObsolete[] = "is_obsolete: "; 00037 const char COntologyOBOImpl::c_szOBO[] = "OBO"; 00038 const char COntologyOBOImpl::c_szHUMAN[] = "_HUMAN"; 00039 const char COntologyOBOImpl::c_szName[] = "name: "; 00040 const char COntologyOBOImpl::c_szNamespace[] = "namespace: "; 00041 const char COntologyOBOImpl::c_szNOT[] = "NOT"; 00042 const char COntologyOBOImpl::c_szPartOf[] = "part_of "; 00043 const char COntologyOBOImpl::c_szRelationship[] = "relationship: "; 00044 const char COntologyOBOImpl::c_szSGD[] = "SGD"; 00045 const char COntologyOBOImpl::c_szTerm[] = "[Term]"; 00046 00047 COntologyOBO::COntologyOBO( ) { 00048 00049 m_pOntology = this; } 00050 00051 COntologyOBOImpl::SParserOBO::SParserOBO( std::istream& istm, CGenome& Genome, bool fDBIDs, bool fSynonyms ) : 00052 m_fObsolete(false), m_szTarget(NULL), m_fDBIDs(fDBIDs), m_fSynonyms(fSynonyms), SParser( istm, Genome ) { } 00053 00054 void COntologyOBOImpl::SParserOBO::Reset( ) { 00055 00056 m_fObsolete = false; 00057 m_vecstrIDs.clear( ); } 00058 00059 COntologyOBOImpl::COntologyOBOImpl( ) : COntologyImpl( c_szOBO ) { } 00060 00094 bool COntologyOBO::Open( std::istream& istmOntology, std::istream& istmAnnotations, CGenome& Genome, 00095 COntologyOBO& OntoBP, COntologyOBO& OntoMF, COntologyOBO& OntoCC, bool fDatabaseIDs, bool fSynonyms ) { 00096 00097 if( !OntoBP.Open( istmOntology, istmAnnotations, Genome, c_szBiologicalProcess, fDatabaseIDs, fSynonyms ) ) 00098 return false; 00099 00100 istmOntology.clear( ); 00101 istmOntology.seekg( 0, ios_base::beg ); 00102 istmAnnotations.clear( ); 00103 istmAnnotations.seekg( 0, ios_base::beg ); 00104 if( !OntoCC.Open( istmOntology, istmAnnotations, Genome, c_szCellularComponent, fDatabaseIDs, fSynonyms ) ) 00105 return false; 00106 00107 istmOntology.clear( ); 00108 istmOntology.seekg( 0, ios_base::beg ); 00109 istmAnnotations.clear( ); 00110 istmAnnotations.seekg( 0, ios_base::beg ); 00111 return OntoMF.Open( istmOntology, istmAnnotations, Genome, c_szMolecularFunction, fDatabaseIDs, fSynonyms ); } 00112 00156 bool COntologyOBO::Open( std::istream& istmOntology, std::istream& istmAnnotations, CGenome& Genome, 00157 const char* szNamespace, bool fDatabaseIDs, bool fSynonyms ) { 00158 SParserOBO sParserOnto( istmOntology, Genome ); 00159 SParserOBO sParserGene( istmAnnotations, Genome, fDatabaseIDs, fSynonyms ); 00160 00161 m_strID += szNamespace; 00162 sParserOnto.m_szTarget = szNamespace; 00163 return ( OpenOntology( sParserOnto ) && OpenGenes( sParserGene ) ); } 00164 00165 bool COntologyOBOImpl::OpenOntology( SParserOBO& sParser ) { 00166 size_t i, j, iParent; 00167 vector<vector<size_t> > vecveciChildren; 00168 00169 g_CatSleipnir( ).info( "COntologyOBOImpl::OpenOntology( ) %s", sParser.m_szTarget ); 00170 Reset( ); 00171 sParser.m_vecvecstrParents.clear( ); 00172 sParser.m_vecNodes.clear( ); 00173 if( !( sParser.GetLine( ) && OpenHeader( sParser ) ) ) 00174 return false; 00175 00176 while( sParser.m_istm.peek( ) != EOF ) 00177 if( !OpenBlock( sParser ) ) 00178 return false; 00179 00180 m_aNodes = new SNode[ m_iNodes = sParser.m_vecNodes.size( ) ]; 00181 vecveciChildren.resize( m_iNodes ); 00182 for( i = 0; i < m_iNodes; ++i ) { 00183 m_aNodes[ i ] = sParser.m_vecNodes[ i ]; 00184 if( m_aNodes[ i ].m_iParents = sParser.m_vecvecstrParents[ i ].size( ) ) { 00185 m_aNodes[ i ].m_aiParents = new size_t[ m_aNodes[ i ].m_iParents ]; 00186 for( j = 0; j < m_aNodes[ i ].m_iParents; ++j ) { 00187 if( ( iParent = m_mapNodes[ sParser.m_vecvecstrParents[ i ][ j ] ] ) == i ) { 00188 g_CatSleipnir( ).error( "COntologyOBOImpl::OpenOntology( ) found a loop for node %d: %s has parent %s", 00189 i, m_aNodes[ i ].m_strID.c_str( ), sParser.m_vecvecstrParents[ i ][ j ].c_str( ) ); 00190 return false; } 00191 m_aNodes[ i ].m_aiParents[ j ] = iParent; 00192 vecveciChildren[ m_aNodes[ i ].m_aiParents[ j ] ].push_back( i ); } } } 00193 for( i = 0; i < m_iNodes; ++i ) { 00194 if( !vecveciChildren[ i ].size( ) ) 00195 continue; 00196 m_aNodes[ i ].m_aiChildren = new size_t[ m_aNodes[ i ].m_iChildren = 00197 vecveciChildren[ i ].size( ) ]; 00198 for( j = 0; j < m_aNodes[ i ].m_iChildren; ++j ) 00199 m_aNodes[ i ].m_aiChildren[ j ] = vecveciChildren[ i ][ j ]; } 00200 00201 return true; } 00202 00203 bool COntologyOBOImpl::OpenHeader( SParserOBO& sParser ) { 00204 00205 while( sParser.m_szLine[ 0 ] ) 00206 if( !sParser.GetLine( ) ) 00207 return false; 00208 00209 return sParser.GetLine( ); } 00210 00211 bool COntologyOBOImpl::OpenBlock( SParserOBO& sParser ) { 00212 00213 if( sParser.IsStart( c_szTerm ) ) 00214 return ( sParser.GetLine( ) && OpenTerm( sParser ) ); 00215 00216 while( sParser.m_szLine[ 0 ] ) 00217 if( !sParser.GetLine( ) ) 00218 return false; 00219 00220 return sParser.GetLine( ); } 00221 00222 bool COntologyOBOImpl::OpenTerm( SParserOBO& sParser ) { 00223 bool fRet, fHit; 00224 SNode sNode; 00225 size_t i; 00226 00227 sParser.Reset( ); 00228 while( sParser.m_vecvecstrParents.size( ) < ( sParser.m_vecNodes.size( ) + 1 ) ) 00229 sParser.m_vecvecstrParents.push_back( vector<string>( ) ); 00230 sParser.m_vecvecstrParents[ sParser.m_vecNodes.size( ) ].clear( ); 00231 while( sParser.m_szLine[ 0 ] ) { 00232 fRet = fHit = false; 00233 switch( sParser.m_szLine[ 0 ] ) { 00234 case 'a': 00235 if( sParser.IsStart( c_szAltID ) ) { 00236 fHit = true; 00237 if( !( fRet = OpenAltID( sParser ) ) ) 00238 g_CatSleipnir( ).error( "COntologyOBOImpl::OpenTerm( ) failed: %s", c_szAltID ); } 00239 break; 00240 00241 case 'i': 00242 if( sParser.IsStart( c_szID ) ) { 00243 fHit = true; 00244 if( !( fRet = OpenID( sParser ) ) ) 00245 g_CatSleipnir( ).error( "COntologyOBOImpl::OpenTerm( ) failed: %s", c_szID ); } 00246 else if( sParser.IsStart( c_szIsA ) ) { 00247 fHit = true; 00248 if( !( fRet = OpenParent( sParser ) ) ) 00249 g_CatSleipnir( ).error( "COntologyOBOImpl::OpenTerm( ) failed: %s", c_szIsA ); } 00250 else if( sParser.IsStart( c_szIsObsolete ) ) { 00251 fHit = true; 00252 if( !( fRet = OpenObsolete( sParser ) ) ) 00253 g_CatSleipnir( ).error( "COntologyOBOImpl::OpenTerm( ) failed: %s", 00254 c_szIsObsolete ); } 00255 break; 00256 00257 case 'n': 00258 if( sParser.IsStart( c_szName ) ) { 00259 fHit = true; 00260 if( !( fRet = OpenName( sParser ) ) ) 00261 g_CatSleipnir( ).error( "COntologyOBOImpl::OpenTerm( ) failed: %s", c_szName ); } 00262 else if( sParser.IsStart( c_szNamespace ) ) { 00263 fHit = true; 00264 if( !( fRet = OpenNamespace( sParser ) ) ) 00265 g_CatSleipnir( ).error( "COntologyOBOImpl::OpenTerm( ) failed: %s", 00266 c_szNamespace ); } 00267 break; 00268 00269 case 'r': 00270 if( sParser.IsStart( c_szRelationship ) ) { 00271 fHit = true; 00272 if( !( fRet = OpenRelationship( sParser ) ) ) 00273 g_CatSleipnir( ).error( "COntologyOBOImpl::OpenTerm( ) failed: %s", 00274 c_szRelationship ); } 00275 break; } 00276 if( !fHit ) { 00277 g_CatSleipnir( ).info( "COntologyOBOImpl::OpenTerm( ) skipping: %s", sParser.m_szLine ); 00278 fRet = sParser.GetLine( ); } 00279 if( !fRet ) { 00280 g_CatSleipnir( ).error( "COntologyOBOImpl::OpenTerm( ) failed: %s", sParser.m_szLine ); 00281 return false; } } 00282 00283 if( !sParser.m_fObsolete && ( sParser.m_strNamespace == sParser.m_szTarget ) ) { 00284 sNode.m_strGloss = sParser.m_strGloss; 00285 sNode.m_strID = sParser.m_vecstrIDs[ 0 ]; 00286 m_mapNodes[ sNode.m_strID ] = sParser.m_vecNodes.size( ); 00287 for( i = 1; i < sParser.m_vecstrIDs.size( ); ++i ) 00288 m_mapNodes[ sParser.m_vecstrIDs[ i ] ] = sParser.m_vecNodes.size( ); 00289 sParser.m_vecNodes.push_back( sNode ); } 00290 00291 return true; } 00292 00293 bool COntologyOBOImpl::OpenID( SParserOBO& sParser ) { 00294 00295 sParser.m_vecstrIDs.push_back( sParser.m_szLine + strlen( c_szID ) ); 00296 return sParser.GetLine( ); } 00297 00298 bool COntologyOBOImpl::OpenAltID( SParserOBO& sParser ) { 00299 00300 sParser.m_vecstrIDs.push_back( sParser.m_szLine + strlen( c_szAltID ) ); 00301 return sParser.GetLine( ); } 00302 00303 bool COntologyOBOImpl::OpenName( SParserOBO& sParser ) { 00304 00305 sParser.m_strGloss = sParser.m_szLine + strlen( c_szName ); 00306 return sParser.GetLine( ); } 00307 00308 bool COntologyOBOImpl::OpenParent( SParserOBO& sParser ) { 00309 const char* szStart; 00310 const char* szEnd; 00311 00312 szStart = sParser.m_szLine + strlen( c_szIsA ); 00313 for( szEnd = szStart; *szEnd && !isspace( *szEnd ); ++szEnd ); 00314 sParser.m_vecvecstrParents[ sParser.m_vecNodes.size( ) ].push_back( string( szStart, szEnd ) ); 00315 return sParser.GetLine( ); } 00316 00317 bool COntologyOBOImpl::OpenRelationship( SParserOBO& sParser ) { 00318 const char* szStart; 00319 const char* szEnd; 00320 00321 if( strncmp( sParser.m_szLine + strlen( c_szRelationship ), c_szPartOf, 00322 strlen( c_szPartOf ) ) ) { 00323 g_CatSleipnir( ).info( "COntologyOBOImpl::OpenRelationship( %s ) unknown relationship", 00324 sParser.m_szLine ); 00325 return sParser.GetLine( ); } 00326 00327 szStart = sParser.m_szLine + strlen( c_szRelationship ) + strlen( c_szPartOf ); 00328 for( szEnd = szStart; *szEnd && !isspace( *szEnd ); ++szEnd ); 00329 sParser.m_vecvecstrParents[ sParser.m_vecNodes.size( ) ].push_back( string( szStart, szEnd ) ); 00330 return sParser.GetLine( ); } 00331 00332 bool COntologyOBOImpl::OpenNamespace( SParserOBO& sParser ) { 00333 00334 sParser.m_strNamespace = sParser.m_szLine + strlen( c_szNamespace ); 00335 return sParser.GetLine( ); } 00336 00337 bool COntologyOBOImpl::OpenObsolete( SParserOBO& sParser ) { 00338 00339 sParser.m_fObsolete = true; 00340 return sParser.GetLine( ); } 00341 00342 bool COntologyOBOImpl::OpenGenes( SParserOBO& sParser ) { 00343 size_t i, j; 00344 SParserOBO::TSetPGene::const_iterator iterGene; 00345 00346 g_CatSleipnir( ).info( "COntologyOBOImpl::OpenGenes( )" ); 00347 if( !sParser.GetLine( ) ) 00348 return false; 00349 if( !sParser.m_szLine[ 0 ] ) 00350 return true; 00351 00352 sParser.m_vecsetpGenes.resize( m_iNodes ); 00353 while( sParser.m_istm.peek( ) != EOF ) 00354 if( !OpenGene( sParser ) ) 00355 return false; 00356 if( !OpenGene( sParser ) ) 00357 return false; 00358 00359 for( i = 0; i < m_iNodes; ++i ) { 00360 if( sParser.m_vecsetpGenes[ i ].empty( ) ) 00361 continue; 00362 m_aNodes[ i ].m_apGenes = new const CGene*[ m_aNodes[ i ].m_iGenes = 00363 sParser.m_vecsetpGenes[ i ].size( ) ]; 00364 for( j = 0,iterGene = sParser.m_vecsetpGenes[ i ].begin( ); 00365 iterGene != sParser.m_vecsetpGenes[ i ].end( ); ++j,++iterGene ) 00366 m_aNodes[ i ].m_apGenes[ j ] = *iterGene; } 00367 00368 return true; } 00369 00370 bool COntologyOBOImpl::OpenGene( SParserOBO& sParser ) { 00371 size_t i; 00372 string strID, strName; 00373 vector<string> vecstrLine, vecstrNames; 00374 TMapStrI::const_iterator iterNode; 00375 00376 if( sParser.m_szLine[ 0 ] == '!' ) 00377 return sParser.GetLine( ); 00378 00379 // 1 DB ID 00380 // 2 Name 00381 // 3 NOT 00382 // 4 GO ID 00383 // 6 Annotation source 00384 // 9 Gloss 00385 // 10 Syns 00386 CMeta::Tokenize( sParser.m_szLine, vecstrLine ); 00387 if( ( vecstrLine.size( ) < 11 ) || !( strID = vecstrLine[ 4 ] ).length( ) ) 00388 return false; 00389 if( vecstrLine[ 3 ].length( ) || ( ( iterNode = m_mapNodes.find( strID ) ) == 00390 m_mapNodes.end( ) ) ) 00391 return sParser.GetLine( ); 00392 CMeta::Tokenize( vecstrLine[ 10 ].c_str( ), vecstrNames, "|" ); 00393 00394 while( !vecstrNames.empty( ) && vecstrNames[ 0 ].empty( ) ) 00395 vecstrNames.erase( vecstrNames.begin( ) ); 00396 strName = ( sParser.m_fSynonyms || vecstrNames.empty( ) ) ? vecstrLine[ 2 ] : vecstrNames[ 0 ]; 00397 strName = ( sParser.m_fDBIDs ? vecstrLine[ 1 ] : strName ); 00398 00399 if( strName.empty( ) ) { 00400 g_CatSleipnir( ).error( "COntologyOBOImpl::OpenGene( ) null name: %s", 00401 sParser.m_szLine ); 00402 return false; } 00403 { 00404 CGene& Gene = sParser.m_Genome.AddGene( strName ); 00405 00406 if( sParser.m_fSynonyms ) 00407 sParser.m_Genome.AddSynonym( Gene, vecstrLine[ 2 ] ); 00408 //if( sParser.m_fDBIDs ) 00409 // sParser.m_Genome.AddSynonym( Gene, vecstrLine[ 1 ] ); 00410 if( vecstrLine[ 2 ].length( ) ) { 00411 strID = ( ( i = vecstrLine[ 2 ].find( c_szHUMAN ) ) == string::npos ) ? vecstrLine[ 2 ] : 00412 vecstrLine[ 2 ].substr( 0, i ); 00413 sParser.m_Genome.AddSynonym( Gene, strID ); } 00414 for( i = 1; i < vecstrNames.size( ); ++i ) 00415 sParser.m_Genome.AddSynonym( Gene, vecstrNames[ i ] ); 00416 Gene.AddAnnotation( m_pOntology, iterNode->second ); 00417 if( Gene.GetGloss( ).length( ) == 0 ) 00418 Gene.SetGloss( vecstrLine[ 9 ] ); 00419 sParser.m_vecsetpGenes[ iterNode->second ].insert( &Gene ); 00420 } 00421 00422 return sParser.GetLine( ); } 00423 00424 }