Sleipnir
src/annotationobo.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "annotation.h"
00024 #include "genome.h"
00025 #include "meta.h"
00026 
00027 namespace Sleipnir {
00028 
00029 const char  COntologyOBO::c_szBiologicalProcess[]   = "biological_process";
00030 const char  COntologyOBO::c_szCellularComponent[]   = "cellular_component";
00031 const char  COntologyOBO::c_szMolecularFunction[]   = "molecular_function";
00032 
00033 const char  COntologyOBOImpl::c_szAltID[]       = "alt_id: ";
00034 const char  COntologyOBOImpl::c_szID[]          = "id: ";
00035 const char  COntologyOBOImpl::c_szIsA[]         = "is_a: ";
00036 const char  COntologyOBOImpl::c_szIsObsolete[]  = "is_obsolete: ";
00037 const char  COntologyOBOImpl::c_szOBO[]         = "OBO";
00038 const char  COntologyOBOImpl::c_szHUMAN[]       = "_HUMAN";
00039 const char  COntologyOBOImpl::c_szName[]            = "name: ";
00040 const char  COntologyOBOImpl::c_szNamespace[]   = "namespace: ";
00041 const char  COntologyOBOImpl::c_szNOT[]         = "NOT";
00042 const char  COntologyOBOImpl::c_szPartOf[]      = "part_of ";
00043 const char  COntologyOBOImpl::c_szRelationship[]    = "relationship: ";
00044 const char  COntologyOBOImpl::c_szSGD[]         = "SGD";
00045 const char  COntologyOBOImpl::c_szTerm[]            = "[Term]";
00046 
00047 COntologyOBO::COntologyOBO( ) {
00048 
00049     m_pOntology = this; }
00050 
00051 COntologyOBOImpl::SParserOBO::SParserOBO( std::istream& istm, CGenome& Genome, bool fDBIDs, bool fSynonyms ) :
00052     m_fObsolete(false), m_szTarget(NULL), m_fDBIDs(fDBIDs), m_fSynonyms(fSynonyms), SParser( istm, Genome ) { }
00053 
00054 void COntologyOBOImpl::SParserOBO::Reset( ) {
00055 
00056     m_fObsolete = false;
00057     m_vecstrIDs.clear( ); }
00058 
00059 COntologyOBOImpl::COntologyOBOImpl( ) : COntologyImpl( c_szOBO ) { }
00060 
00094 bool COntologyOBO::Open( std::istream& istmOntology, std::istream& istmAnnotations, CGenome& Genome,
00095     COntologyOBO& OntoBP, COntologyOBO& OntoMF, COntologyOBO& OntoCC, bool fDatabaseIDs, bool fSynonyms ) {
00096 
00097     if( !OntoBP.Open( istmOntology, istmAnnotations, Genome, c_szBiologicalProcess, fDatabaseIDs, fSynonyms ) )
00098         return false;
00099 
00100     istmOntology.clear( );
00101     istmOntology.seekg( 0, ios_base::beg );
00102     istmAnnotations.clear( );
00103     istmAnnotations.seekg( 0, ios_base::beg );
00104     if( !OntoCC.Open( istmOntology, istmAnnotations, Genome, c_szCellularComponent, fDatabaseIDs, fSynonyms ) )
00105         return false;
00106 
00107     istmOntology.clear( );
00108     istmOntology.seekg( 0, ios_base::beg );
00109     istmAnnotations.clear( );
00110     istmAnnotations.seekg( 0, ios_base::beg );
00111     return OntoMF.Open( istmOntology, istmAnnotations, Genome, c_szMolecularFunction, fDatabaseIDs, fSynonyms ); }
00112 
00156 bool COntologyOBO::Open( std::istream& istmOntology, std::istream& istmAnnotations, CGenome& Genome,
00157     const char* szNamespace, bool fDatabaseIDs, bool fSynonyms ) {
00158     SParserOBO  sParserOnto( istmOntology, Genome );
00159     SParserOBO  sParserGene( istmAnnotations, Genome, fDatabaseIDs, fSynonyms );
00160 
00161     m_strID += szNamespace;
00162     sParserOnto.m_szTarget = szNamespace;
00163     return ( OpenOntology( sParserOnto ) && OpenGenes( sParserGene ) ); }
00164 
00165 bool COntologyOBOImpl::OpenOntology( SParserOBO& sParser ) {
00166     size_t                  i, j, iParent;
00167     vector<vector<size_t> > vecveciChildren;
00168 
00169     g_CatSleipnir( ).info( "COntologyOBOImpl::OpenOntology( ) %s", sParser.m_szTarget );
00170     Reset( );
00171     sParser.m_vecvecstrParents.clear( );
00172     sParser.m_vecNodes.clear( );
00173     if( !( sParser.GetLine( ) && OpenHeader( sParser ) ) )
00174         return false;
00175 
00176     while( sParser.m_istm.peek( ) != EOF )
00177         if( !OpenBlock( sParser ) )
00178             return false;
00179 
00180     m_aNodes = new SNode[ m_iNodes = sParser.m_vecNodes.size( ) ];
00181     vecveciChildren.resize( m_iNodes );
00182     for( i = 0; i < m_iNodes; ++i ) {
00183         m_aNodes[ i ] = sParser.m_vecNodes[ i ];
00184         if( m_aNodes[ i ].m_iParents = sParser.m_vecvecstrParents[ i ].size( ) ) {
00185             m_aNodes[ i ].m_aiParents = new size_t[ m_aNodes[ i ].m_iParents ];
00186             for( j = 0; j < m_aNodes[ i ].m_iParents; ++j ) {
00187                 if( ( iParent = m_mapNodes[ sParser.m_vecvecstrParents[ i ][ j ] ] ) == i ) {
00188                     g_CatSleipnir( ).error( "COntologyOBOImpl::OpenOntology( ) found a loop for node %d: %s has parent %s",
00189                         i, m_aNodes[ i ].m_strID.c_str( ), sParser.m_vecvecstrParents[ i ][ j ].c_str( ) );
00190                     return false; }
00191                 m_aNodes[ i ].m_aiParents[ j ] = iParent;
00192                 vecveciChildren[ m_aNodes[ i ].m_aiParents[ j ] ].push_back( i ); } } }
00193     for( i = 0; i < m_iNodes; ++i ) {
00194         if( !vecveciChildren[ i ].size( ) )
00195             continue;
00196         m_aNodes[ i ].m_aiChildren = new size_t[ m_aNodes[ i ].m_iChildren =
00197             vecveciChildren[ i ].size( ) ];
00198         for( j = 0; j < m_aNodes[ i ].m_iChildren; ++j )
00199             m_aNodes[ i ].m_aiChildren[ j ] = vecveciChildren[ i ][ j ]; }
00200 
00201     return true; }
00202 
00203 bool COntologyOBOImpl::OpenHeader( SParserOBO& sParser ) {
00204 
00205     while( sParser.m_szLine[ 0 ] )
00206         if( !sParser.GetLine( ) )
00207             return false;
00208 
00209     return sParser.GetLine( ); }
00210 
00211 bool COntologyOBOImpl::OpenBlock( SParserOBO& sParser ) {
00212 
00213     if( sParser.IsStart( c_szTerm ) )
00214         return ( sParser.GetLine( ) && OpenTerm( sParser ) );
00215 
00216     while( sParser.m_szLine[ 0 ] )
00217         if( !sParser.GetLine( ) )
00218             return false;
00219 
00220     return sParser.GetLine( ); }
00221 
00222 bool COntologyOBOImpl::OpenTerm( SParserOBO& sParser ) {
00223     bool    fRet, fHit;
00224     SNode   sNode;
00225     size_t  i;
00226 
00227     sParser.Reset( );
00228     while( sParser.m_vecvecstrParents.size( ) < ( sParser.m_vecNodes.size( ) + 1 ) )
00229         sParser.m_vecvecstrParents.push_back( vector<string>( ) );
00230     sParser.m_vecvecstrParents[ sParser.m_vecNodes.size( ) ].clear( );
00231     while( sParser.m_szLine[ 0 ] ) {
00232         fRet = fHit = false;
00233         switch( sParser.m_szLine[ 0 ] ) {
00234             case 'a':
00235                 if( sParser.IsStart( c_szAltID ) ) {
00236                     fHit = true;
00237                     if( !( fRet = OpenAltID( sParser ) ) )
00238                         g_CatSleipnir( ).error( "COntologyOBOImpl::OpenTerm( ) failed: %s", c_szAltID ); }
00239                 break;
00240 
00241             case 'i':
00242                 if( sParser.IsStart( c_szID ) ) {
00243                     fHit = true;
00244                     if( !( fRet = OpenID( sParser ) ) )
00245                         g_CatSleipnir( ).error( "COntologyOBOImpl::OpenTerm( ) failed: %s", c_szID ); }
00246                 else if( sParser.IsStart( c_szIsA ) ) {
00247                     fHit = true;
00248                     if( !( fRet = OpenParent( sParser ) ) )
00249                         g_CatSleipnir( ).error( "COntologyOBOImpl::OpenTerm( ) failed: %s", c_szIsA ); }
00250                 else if( sParser.IsStart( c_szIsObsolete ) ) {
00251                     fHit = true;
00252                     if( !( fRet = OpenObsolete( sParser ) ) )
00253                         g_CatSleipnir( ).error( "COntologyOBOImpl::OpenTerm( ) failed: %s",
00254                             c_szIsObsolete ); }
00255                 break;
00256 
00257             case 'n':
00258                 if( sParser.IsStart( c_szName ) ) {
00259                     fHit = true;
00260                     if( !( fRet = OpenName( sParser ) ) )
00261                         g_CatSleipnir( ).error( "COntologyOBOImpl::OpenTerm( ) failed: %s", c_szName ); }
00262                 else if( sParser.IsStart( c_szNamespace ) ) {
00263                     fHit = true;
00264                     if( !( fRet = OpenNamespace( sParser ) ) )
00265                         g_CatSleipnir( ).error( "COntologyOBOImpl::OpenTerm( ) failed: %s",
00266                             c_szNamespace ); }
00267                 break;
00268 
00269             case 'r':
00270                 if( sParser.IsStart( c_szRelationship ) ) {
00271                     fHit = true;
00272                     if( !( fRet = OpenRelationship( sParser ) ) )
00273                         g_CatSleipnir( ).error( "COntologyOBOImpl::OpenTerm( ) failed: %s",
00274                             c_szRelationship ); }
00275                 break; }
00276         if( !fHit ) {
00277             g_CatSleipnir( ).info( "COntologyOBOImpl::OpenTerm( ) skipping: %s", sParser.m_szLine );
00278             fRet = sParser.GetLine( ); }
00279         if( !fRet ) {
00280             g_CatSleipnir( ).error( "COntologyOBOImpl::OpenTerm( ) failed: %s", sParser.m_szLine );
00281             return false; } }
00282 
00283     if( !sParser.m_fObsolete && ( sParser.m_strNamespace == sParser.m_szTarget ) ) {
00284         sNode.m_strGloss = sParser.m_strGloss;
00285         sNode.m_strID = sParser.m_vecstrIDs[ 0 ];
00286         m_mapNodes[ sNode.m_strID ] = sParser.m_vecNodes.size( );
00287         for( i = 1; i < sParser.m_vecstrIDs.size( ); ++i )
00288             m_mapNodes[ sParser.m_vecstrIDs[ i ] ] = sParser.m_vecNodes.size( );
00289         sParser.m_vecNodes.push_back( sNode ); }
00290 
00291     return true; }
00292 
00293 bool COntologyOBOImpl::OpenID( SParserOBO& sParser ) {
00294 
00295     sParser.m_vecstrIDs.push_back( sParser.m_szLine + strlen( c_szID ) );
00296     return sParser.GetLine( ); }
00297 
00298 bool COntologyOBOImpl::OpenAltID( SParserOBO& sParser ) {
00299 
00300     sParser.m_vecstrIDs.push_back( sParser.m_szLine + strlen( c_szAltID ) );
00301     return sParser.GetLine( ); }
00302 
00303 bool COntologyOBOImpl::OpenName( SParserOBO& sParser ) {
00304 
00305     sParser.m_strGloss = sParser.m_szLine + strlen( c_szName );
00306     return sParser.GetLine( ); }
00307 
00308 bool COntologyOBOImpl::OpenParent( SParserOBO& sParser ) {
00309     const char* szStart;
00310     const char* szEnd;
00311 
00312     szStart = sParser.m_szLine + strlen( c_szIsA );
00313     for( szEnd = szStart; *szEnd && !isspace( *szEnd ); ++szEnd );
00314     sParser.m_vecvecstrParents[ sParser.m_vecNodes.size( ) ].push_back( string( szStart, szEnd ) );
00315     return sParser.GetLine( ); }
00316 
00317 bool COntologyOBOImpl::OpenRelationship( SParserOBO& sParser ) {
00318     const char* szStart;
00319     const char* szEnd;
00320 
00321     if( strncmp( sParser.m_szLine + strlen( c_szRelationship ), c_szPartOf,
00322         strlen( c_szPartOf ) ) ) {
00323         g_CatSleipnir( ).info( "COntologyOBOImpl::OpenRelationship( %s ) unknown relationship",
00324             sParser.m_szLine );
00325         return sParser.GetLine( ); }
00326 
00327     szStart = sParser.m_szLine + strlen( c_szRelationship ) + strlen( c_szPartOf );
00328     for( szEnd = szStart; *szEnd && !isspace( *szEnd ); ++szEnd );
00329     sParser.m_vecvecstrParents[ sParser.m_vecNodes.size( ) ].push_back( string( szStart, szEnd ) );
00330     return sParser.GetLine( ); }
00331 
00332 bool COntologyOBOImpl::OpenNamespace( SParserOBO& sParser ) {
00333 
00334     sParser.m_strNamespace = sParser.m_szLine + strlen( c_szNamespace );
00335     return sParser.GetLine( ); }
00336 
00337 bool COntologyOBOImpl::OpenObsolete( SParserOBO& sParser ) {
00338 
00339     sParser.m_fObsolete = true;
00340     return sParser.GetLine( ); }
00341 
00342 bool COntologyOBOImpl::OpenGenes( SParserOBO& sParser ) {
00343     size_t                                  i, j;
00344     SParserOBO::TSetPGene::const_iterator   iterGene;
00345 
00346     g_CatSleipnir( ).info( "COntologyOBOImpl::OpenGenes( )" );
00347     if( !sParser.GetLine( ) )
00348         return false;
00349     if( !sParser.m_szLine[ 0 ] )
00350         return true;
00351 
00352     sParser.m_vecsetpGenes.resize( m_iNodes );
00353     while( sParser.m_istm.peek( ) != EOF )
00354         if( !OpenGene( sParser ) )
00355             return false;
00356     if( !OpenGene( sParser ) )
00357         return false;
00358 
00359     for( i = 0; i < m_iNodes; ++i ) {
00360         if( sParser.m_vecsetpGenes[ i ].empty( ) )
00361             continue;
00362         m_aNodes[ i ].m_apGenes = new const CGene*[ m_aNodes[ i ].m_iGenes =
00363             sParser.m_vecsetpGenes[ i ].size( ) ];
00364         for( j = 0,iterGene = sParser.m_vecsetpGenes[ i ].begin( );
00365             iterGene != sParser.m_vecsetpGenes[ i ].end( ); ++j,++iterGene )
00366             m_aNodes[ i ].m_apGenes[ j ] = *iterGene; }
00367 
00368     return true; }
00369 
00370 bool COntologyOBOImpl::OpenGene( SParserOBO& sParser ) {
00371     size_t                      i;
00372     string                      strID, strName;
00373     vector<string>              vecstrLine, vecstrNames;
00374     TMapStrI::const_iterator    iterNode;
00375 
00376     if( sParser.m_szLine[ 0 ] == '!' )
00377         return sParser.GetLine( );
00378 
00379 //  1   DB ID
00380 //  2   Name
00381 //  3   NOT
00382 //  4   GO ID
00383 //  6   Annotation source
00384 //  9   Gloss
00385 //  10  Syns
00386     CMeta::Tokenize( sParser.m_szLine, vecstrLine );
00387     if( ( vecstrLine.size( ) < 11 ) || !( strID = vecstrLine[ 4 ] ).length( ) )
00388         return false;
00389     if( vecstrLine[ 3 ].length( ) || ( ( iterNode = m_mapNodes.find( strID ) ) ==
00390         m_mapNodes.end( ) ) )
00391         return sParser.GetLine( );
00392     CMeta::Tokenize( vecstrLine[ 10 ].c_str( ), vecstrNames, "|" );
00393 
00394     while( !vecstrNames.empty( ) && vecstrNames[ 0 ].empty( ) )
00395         vecstrNames.erase( vecstrNames.begin( ) );
00396     strName = ( sParser.m_fSynonyms || vecstrNames.empty( ) ) ? vecstrLine[ 2 ] : vecstrNames[ 0 ]; 
00397     strName = ( sParser.m_fDBIDs ? vecstrLine[ 1 ] : strName );
00398 
00399     if( strName.empty( ) ) {
00400         g_CatSleipnir( ).error( "COntologyOBOImpl::OpenGene( ) null name: %s",
00401             sParser.m_szLine );
00402         return false; }
00403     {
00404         CGene&  Gene    = sParser.m_Genome.AddGene( strName );
00405 
00406         if( sParser.m_fSynonyms )
00407             sParser.m_Genome.AddSynonym( Gene, vecstrLine[ 2 ] );
00408         //if( sParser.m_fDBIDs )
00409         //  sParser.m_Genome.AddSynonym( Gene, vecstrLine[ 1 ] );
00410         if( vecstrLine[ 2 ].length( ) ) {
00411             strID = ( ( i = vecstrLine[ 2 ].find( c_szHUMAN ) ) == string::npos ) ? vecstrLine[ 2 ] :
00412                 vecstrLine[ 2 ].substr( 0, i );             
00413             sParser.m_Genome.AddSynonym( Gene, strID ); }
00414         for( i = 1; i < vecstrNames.size( ); ++i )
00415             sParser.m_Genome.AddSynonym( Gene, vecstrNames[ i ] );
00416         Gene.AddAnnotation( m_pOntology, iterNode->second );
00417         if( Gene.GetGloss( ).length( ) == 0 )
00418             Gene.SetGloss( vecstrLine[ 9 ] );
00419         sParser.m_vecsetpGenes[ iterNode->second ].insert( &Gene );
00420     }
00421 
00422     return sParser.GetLine( ); }
00423 
00424 }