Sleipnir
src/annotationkegg.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "annotation.h"
00024 #include "genome.h"
00025 #include "meta.h"
00026 
00027 namespace Sleipnir {
00028 
00029 COntologyKEGG::COntologyKEGG( ) {
00030 
00031     m_pOntology = this; }
00032 
00069 bool COntologyKEGG::Open( std::istream& istm, CGenome& Genome, const std::string& strOrganism,
00070     bool fSynonyms ) {
00071     SParserKEGG                 sParser( istm, Genome, strOrganism, fSynonyms );
00072     size_t                      i, j, iNode;
00073     TMapStrI::const_iterator    iterNode;
00074     vector<set<CGene*> >        vecsetpGenes;
00075     set<CGene*>::iterator       iterGene;
00076 
00077     g_CatSleipnir( ).info( "COntologyKEGG::Open( %s )", strOrganism.c_str( ) );
00078     Reset( );
00079     sParser.GetLine( );
00080     while( istm.peek( ) != EOF ) {
00081         if( !COntologyKEGGImpl::Open( sParser ) )
00082             return false;
00083         for( i = 0; i < sParser.m_vecstrIDs.size( ); ++i ) {
00084             if( ( iterNode = m_mapNodes.find( sParser.m_vecstrIDs[ i ] ) ) ==
00085                 m_mapNodes.end( ) )
00086                 m_mapNodes[ sParser.m_vecstrIDs[ i ] ] = iNode = m_mapNodes.size( );
00087             else
00088                 iNode = iterNode->second;
00089             if( vecsetpGenes.size( ) <= iNode )
00090                 vecsetpGenes.resize( iNode + 1 );
00091             for( j = 0; j < sParser.m_vecpGenes.size( ); ++j )
00092                 vecsetpGenes[ iNode ].insert( sParser.m_vecpGenes[ j ] ); } }
00093 
00094     m_aNodes = new SNode[ m_iNodes = vecsetpGenes.size( ) ];
00095     for( iterNode = m_mapNodes.begin( ); iterNode != m_mapNodes.end( ); ++iterNode ) {
00096         i = iterNode->second;
00097         m_aNodes[ i ].m_strID = iterNode->first;
00098         m_aNodes[ i ].m_strGloss = sParser.m_mapGlosses[ iterNode->first ];
00099         m_aNodes[ i ].m_iGenes = vecsetpGenes[ i ].size( );
00100         m_aNodes[ i ].m_apGenes = new const CGene*[ m_aNodes[ i ].m_iGenes ];
00101         for( j = 0,iterGene = vecsetpGenes[ i ].begin( );
00102             iterGene != vecsetpGenes[ i ].end( ); ++j,++iterGene ) {
00103             (*iterGene)->AddAnnotation( this, i );
00104             m_aNodes[ i ].m_apGenes[ j ] = *iterGene; } }
00105 
00106     return true; }
00107 
00108 const char  COntologyKEGGImpl::c_szKEGG[]       = "KEGG";
00109 const char  COntologyKEGGImpl::c_szEntry[]      = "ENTRY";
00110 const char  COntologyKEGGImpl::c_szName[]       = "NAME";
00111 const char  COntologyKEGGImpl::c_szDefinition[] = "DEFINITION";
00112 const char  COntologyKEGGImpl::c_szClass[]      = "CLASS";
00113 const char  COntologyKEGGImpl::c_szPath[]       = "PATH:";
00114 const char  COntologyKEGGImpl::c_szReference[]  = "REFERENCE";
00115 const char  COntologyKEGGImpl::c_szDisease[]    = "DISEASE";
00116 const char  COntologyKEGGImpl::c_szPathway[]    = "PATHWAY";
00117 const char  COntologyKEGGImpl::c_szModule[]     = "MODULE";
00118 const char  COntologyKEGGImpl::c_szBR[]         = "BR:";
00119 const char  COntologyKEGGImpl::c_szDBLinks[]    = "DBLINKS";
00120 const char  COntologyKEGGImpl::c_szGenes[]      = "GENES";
00121 const char  COntologyKEGGImpl::c_szEnd[]        = "///";
00122 
00123 COntologyKEGGImpl::SParserKEGG::SParserKEGG( std::istream& istm, CGenome& Genome,
00124     const std::string& strOrganism, bool fSynonyms ) : m_fSynonyms(fSynonyms), m_fOrganism(false),
00125     m_strOrganism(strOrganism), SParser( istm, Genome ) { }
00126 
00127 void COntologyKEGGImpl::SParserKEGG::Reset( ) {
00128 
00129     m_vecpGenes.clear( );
00130     m_vecstrIDs.clear( ); }
00131 
00132 COntologyKEGGImpl::COntologyKEGGImpl( ) : COntologyImpl( c_szKEGG ) { }
00133 
00134 bool COntologyKEGGImpl::Open( SParserKEGG& sParser ) {
00135 
00136     sParser.Reset( );
00137     return ( OpenEntry( sParser ) && OpenName( sParser ) &&
00138         OpenDefinition( sParser ) && OpenPathway( sParser ) &&
00139         OpenModule( sParser ) && OpenDisease( sParser ) &&
00140         OpenClass( sParser ) && OpenDBLinks( sParser ) &&
00141         OpenGenes( sParser ) && OpenReferences( sParser ) &&
00142         OpenEnd( sParser ) ); }
00143 
00144 bool COntologyKEGGImpl::OpenEntry( SParserKEGG& sParser ) {
00145 
00146     return ( sParser.IsStart( c_szEntry ) && sParser.GetLine( ) ); }
00147 
00148 bool COntologyKEGGImpl::OpenName( SParserKEGG& sParser ) {
00149 
00150     g_CatSleipnir( ).debug( "COntologyKEGGImpl::OpenName( ) %s", sParser.m_szLine );
00151     return ( sParser.IsStart( c_szName ) ? sParser.GetLine( ) : true ); }
00152 
00153 bool COntologyKEGGImpl::OpenPathway( SParserKEGG& sParser ) {
00154 
00155     if( !sParser.IsStart( c_szPathway ) )
00156         return true;
00157 
00158     do
00159         if( !sParser.GetLine( ) )
00160             return false;
00161     while( isspace( sParser.m_szLine[ 0 ] ) );
00162 
00163     return true; }
00164 
00165 bool COntologyKEGGImpl::OpenReferences( SParserKEGG& sParser ) {
00166 
00167     while( OpenReference( sParser ) );
00168 
00169     return true; }
00170 
00171 bool COntologyKEGGImpl::OpenReference( SParserKEGG& sParser ) {
00172 
00173     if( !sParser.IsStart( c_szReference ) )
00174         return false;
00175 
00176     do
00177         if( !sParser.GetLine( ) )
00178             return false;
00179     while( isspace( sParser.m_szLine[ 0 ] ) );
00180 
00181     return true; }
00182 
00183 bool COntologyKEGGImpl::OpenDisease( SParserKEGG& sParser ) {
00184 
00185     if( !sParser.IsStart( c_szDisease ) )
00186         return true;
00187 
00188     do
00189         if( !sParser.GetLine( ) )
00190             return false;
00191     while( isspace( sParser.m_szLine[ 0 ] ) );
00192 
00193     return true; }
00194 
00195 bool COntologyKEGGImpl::OpenModule( SParserKEGG& sParser ) {
00196 
00197     if( !sParser.IsStart( c_szModule ) )
00198         return true;
00199 
00200     do
00201         if( !sParser.GetLine( ) )
00202             return false;
00203     while( isspace( sParser.m_szLine[ 0 ] ) );
00204 
00205     return true; }
00206 
00207 bool COntologyKEGGImpl::OpenDefinition( SParserKEGG& sParser ) {
00208 
00209     if( !sParser.IsStart( c_szDefinition ) )
00210         return true;
00211 
00212     do
00213         if( !sParser.GetLine( ) )
00214             return false;
00215     while( isspace( sParser.m_szLine[ 0 ] ) );
00216 
00217     return true; }
00218 
00219 bool COntologyKEGGImpl::OpenClass( SParserKEGG& sParser ) {
00220     size_t  i;
00221 
00222     if( !sParser.IsStart( c_szClass ) )
00223         return false;
00224 
00225     sParser.m_strGloss.clear( );
00226     i = strlen( c_szClass );
00227     memmove( sParser.m_szLine, sParser.m_szLine + i, strlen( sParser.m_szLine ) - i + 1 );
00228     sParser.m_fPathing = false;
00229     do
00230         if( !OpenGloss( sParser ) )
00231             return false;
00232     while( isspace( sParser.m_szLine[ 0 ] ) );
00233 
00234     return true; }
00235 
00236 bool COntologyKEGGImpl::OpenGloss( SParserKEGG& sParser ) {
00237     char*           pchStartGloss;
00238     char*           pchEndGloss;
00239     char*           pchStartPath;
00240     char*           pchEndPath;
00241     vector<string>  vecstrIDs;
00242     size_t          i;
00243 
00244     for( pchStartGloss = sParser.m_szLine; isspace( *pchStartGloss ); ++pchStartGloss );
00245     if( ( pchEndGloss = strstr( pchStartGloss, c_szPath ) ) ||
00246         ( pchEndGloss = strstr( pchStartGloss, c_szBR ) ) ) {
00247         pchStartPath = pchEndGloss + ( strncmp( pchEndGloss, c_szBR, strlen( c_szBR ) ) ?
00248             strlen( c_szPath ) : strlen( c_szBR ) );
00249         if( !( pchEndPath = strchr( pchStartPath, ']' ) ) )
00250             return false;
00251         *pchEndPath = 0;
00252         CMeta::Tokenize( pchStartPath, vecstrIDs, " ", true );
00253         for( i = 0; i < vecstrIDs.size( ); ++i )
00254             sParser.m_vecstrIDs.push_back( vecstrIDs[ i ] );
00255         if( pchEndGloss > ( pchStartGloss + 1 ) ) {
00256             *( pchEndGloss - 1 ) = 0;
00257             if( sParser.m_fPathing )
00258                 sParser.m_strGloss.clear( );
00259             else if( sParser.m_strGloss.length( ) )
00260                 sParser.m_strGloss += ' ';
00261             sParser.m_strGloss += pchStartGloss; }
00262         sParser.m_fPathing = true;
00263         for( i = 0; i < vecstrIDs.size( ); ++i )
00264             sParser.m_mapGlosses[ vecstrIDs[ i ] ] = sParser.m_strGloss; }
00265     else {
00266         if( sParser.m_fPathing ) {
00267             sParser.m_fPathing = false;
00268             sParser.m_strGloss.clear( ); }
00269         else if( sParser.m_strGloss.length( ) )
00270             sParser.m_strGloss += ' ';
00271         sParser.m_strGloss += pchStartGloss; }
00272 
00273     return sParser.GetLine( ); }
00274 
00275 bool COntologyKEGGImpl::OpenDBLinks( SParserKEGG& sParser ) {
00276 
00277     if( !sParser.IsStart( c_szDBLinks ) )
00278         return true;
00279 
00280     do
00281         if( !sParser.GetLine( ) )
00282             return false;
00283     while( isspace( sParser.m_szLine[ 0 ] ) );
00284 
00285     return true; }
00286 
00287 bool COntologyKEGGImpl::OpenGenes( SParserKEGG& sParser ) {
00288     size_t  i;
00289 
00290     if( !sParser.IsStart( c_szGenes ) )
00291         return true;
00292 
00293     i = strlen( c_szGenes );
00294     memmove( sParser.m_szLine, sParser.m_szLine + i, strlen( sParser.m_szLine ) - i + 1 );
00295     do
00296         if( !OpenOrganism( sParser ) )
00297             return false;
00298     while( isspace( sParser.m_szLine[ 0 ] ) );
00299 
00300     return true; }
00301 
00302 bool COntologyKEGGImpl::OpenOrganism( SParserKEGG& sParser ) {
00303     char*   pch;
00304     size_t  i;
00305 
00306     for( pch = sParser.m_szLine; *pch && isspace( *pch ); ++pch );
00307     if( !*pch )
00308         return false;
00309 
00310     if( sParser.m_fOrganism ) {
00311         if( ( strlen( pch ) > 3 ) && ( pch[ 3 ] == ':' ) )
00312             sParser.m_fOrganism = false; }
00313     else if( !strncmp( pch, ( sParser.m_strOrganism + ':' ).c_str( ),
00314         i = ( sParser.m_strOrganism.length( ) + 1 ) ) ) {
00315         sParser.m_fOrganism = true;
00316         pch += i + 1; }
00317     if( sParser.m_fOrganism )
00318         while( *pch )
00319             pch = OpenGene( sParser, pch );
00320 
00321     return sParser.GetLine( ); }
00322 
00323 char* COntologyKEGGImpl::OpenGene( SParserKEGG& sParser, char* pch ) {
00324     char*           pchEnd;
00325     char*           pchSyn;
00326     bool            fInc, fSyn;
00327     CGene*          pGene;
00328     string          strName;
00329     vector<string>  vecstrSynonyms;
00330     size_t          i;
00331 
00332     for( pchEnd = pch; *pchEnd && !isspace( *pchEnd ) && ( *pchEnd != '(' ); ++pchEnd );
00333     if( fInc = !!*pchEnd )
00334         fSyn = ( *pchEnd == '(' );
00335     *pchEnd = 0;
00336     strName = pch;
00337 
00338     if( fInc ) {
00339         ++pchEnd;
00340         if( fSyn ) {
00341             pchSyn = pchEnd;
00342             for( ; *pchEnd != ')'; ++pchEnd );
00343             *(pchEnd++) = 0;
00344             vecstrSynonyms.push_back( pchSyn ); } }
00345     for( ; *pchEnd && !isspace( *pchEnd ); ++pchEnd );
00346     if( isspace( *pchEnd ) )
00347         ++pchEnd;
00348 
00349     pGene = &sParser.m_Genome.AddGene( ( sParser.m_fSynonyms && !vecstrSynonyms.empty( ) ) ?
00350         vecstrSynonyms[ 0 ] : strName );
00351     if( sParser.m_fSynonyms )
00352         sParser.m_Genome.AddSynonym( *pGene, strName );
00353     for( i = 0; i < vecstrSynonyms.size( ); ++i )
00354         sParser.m_Genome.AddSynonym( *pGene, vecstrSynonyms[ i ] );
00355     sParser.m_vecpGenes.push_back( pGene );
00356 
00357     return pchEnd; }
00358 
00359 bool COntologyKEGGImpl::OpenEnd( SParserKEGG& sParser ) {
00360 
00361     return ( sParser.IsStart( c_szEnd ) && sParser.GetLine( ) ); }
00362 
00363 }