Sleipnir
src/annotationi.h
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #ifndef ANNOTATIONI_H
00023 #define ANNOTATIONI_H
00024 
00025 #include <map>
00026 #include <iostream>
00027 #include <set>
00028 #include <stack>
00029 #include <string>
00030 #include <vector>
00031 
00032 #include "file.h"
00033 
00034 namespace Sleipnir {
00035 
00036 class CGene;
00037 class CGenes;
00038 class CGenome;
00039 class IOntology;
00040 
00041 class COntologyImpl {
00042 protected:
00043     typedef std::map<std::string,size_t>    TMapStrI;
00044     typedef std::set<const CGene*>          TSetPGenes;
00045 
00046     struct SNode {
00047         SNode( );
00048 
00049         void Reset( );
00050 
00051         std::string     m_strID;
00052         std::string     m_strGloss;
00053         size_t          m_iParents;
00054         size_t*         m_aiParents;
00055         size_t          m_iChildren;
00056         size_t*         m_aiChildren;
00057         size_t          m_iGenes;
00058         const CGene**   m_apGenes;
00059         size_t          m_iCacheGenes;
00060         const CGene**   m_apCacheGenes;
00061     };
00062 
00063     struct SParser {
00064         static const size_t c_iBuffer   = 65536;
00065 
00066         SParser( std::istream&, CGenome& );
00067 
00068         bool GetLine( );
00069         bool IsStart( const char* ) const;
00070 
00071         std::istream&   m_istm;
00072         CGenome&        m_Genome;
00073         char            m_szLine[ c_iBuffer ];
00074         std::string     m_strGloss;
00075         size_t          m_iLine;
00076     };
00077 
00078     COntologyImpl( const std::string& strID ) : m_strID(strID), m_iNodes(0), m_aNodes(NULL) { }
00079 
00080     ~COntologyImpl( ) {
00081 
00082         Reset( ); }
00083 
00084     size_t GetNode( const std::string& ) const;
00085     bool IsAnnotated( size_t, const CGene&, bool ) const;
00086     const CGene& GetGene( size_t, size_t ) const;
00087     void GetGeneNames( std::vector<std::string>& ) const;
00088     void Reset( );
00089     void CollectGenes( size_t, TSetPGenes& );
00090     void TermFinder( const CGenes&, std::vector<STermFound>&, bool, bool, bool, float, const CGenes* ) const;
00091 
00092     size_t GetNodes( ) const {
00093 
00094         return m_iNodes; }
00095 
00096     size_t GetParents( size_t iNode ) const {
00097 
00098         return m_aNodes[ iNode ].m_iParents; }
00099 
00100     size_t GetParent( size_t iNode, size_t iParent ) const {
00101 
00102         return m_aNodes[ iNode ].m_aiParents[ iParent ]; }
00103 
00104     size_t GetChildren( size_t iNode ) const {
00105 
00106         return m_aNodes[ iNode ].m_iChildren; }
00107 
00108     size_t GetChild( size_t iNode, size_t iChild ) const {
00109 
00110         return m_aNodes[ iNode ].m_aiChildren[ iChild ]; }
00111 
00112     size_t GetGenes( size_t iNode, bool fKids ) const {
00113         size_t  iRet;
00114 
00115         iRet = m_aNodes[ iNode ].m_iGenes;
00116         if( fKids ) {
00117             CollectGenes( iNode );
00118             iRet += m_aNodes[ iNode ].m_iCacheGenes; }
00119 
00120         return iRet; }
00121 
00122     const std::string& GetID( ) const {
00123 
00124         return m_strID; }
00125 
00126     const std::string& GetID( size_t iNode ) const {
00127 
00128         return m_aNodes[ iNode ].m_strID; }
00129 
00130     const std::string& GetGloss( size_t iNode ) const {
00131 
00132         return m_aNodes[ iNode ].m_strGloss; }
00133 
00134     void CollectGenes( size_t iNode ) const {
00135         TSetPGenes  setpGenes;
00136 
00137         if( m_aNodes[ iNode ].m_iCacheGenes == -1 )
00138             ((COntologyImpl*)this)->CollectGenes( iNode, setpGenes ); }
00139 
00140     bool GetChildren( size_t iNode, std::set<size_t>& setiChildren ) const {
00141         size_t  i, iChild;
00142 
00143         if( setiChildren.find( iNode ) != setiChildren.end( ) )
00144             return true;
00145 
00146         for( i = 0; i < GetChildren( iNode ); ++i ) {
00147             if( !GetChildren( iChild = GetChild( iNode, i ), setiChildren ) )
00148                 return false;
00149             setiChildren.insert( iChild ); }
00150 
00151         return true; }
00152 
00153     bool GetParents( size_t iNode, std::set<size_t>& setiParents ) const {
00154         size_t  i, iParent;
00155 
00156         if( setiParents.find( iNode ) != setiParents.end( ) )
00157             return true;
00158 
00159         for( i = 0; i < GetParents( iNode ); ++i ) {
00160             if( !GetParents( iParent = GetParent( iNode, i ), setiParents ) )
00161                 return false;
00162             setiParents.insert( iParent ); }
00163 
00164         return true; }
00165 
00166     const IOntology*    m_pOntology;
00167     std::string         m_strID;
00168     size_t              m_iNodes;
00169     TMapStrI            m_mapNodes;
00170     SNode*              m_aNodes;
00171 };
00172 
00173 class COntologyKEGGImpl : protected COntologyImpl {
00174 protected:
00175     static const char   c_szKEGG[];
00176     static const char   c_szEntry[];
00177     static const char   c_szName[];
00178     static const char   c_szDefinition[];
00179     static const char   c_szClass[];
00180     static const char   c_szPath[];
00181     static const char   c_szReference[];
00182     static const char   c_szDisease[];
00183     static const char   c_szPathway[];
00184     static const char   c_szModule[];
00185     static const char   c_szBR[];
00186     static const char   c_szDBLinks[];
00187     static const char   c_szGenes[];
00188     static const char   c_szEnd[];
00189     static const size_t c_iKEGG     = 10000;
00190 
00191     struct SParserKEGG : SParser {
00192         SParserKEGG( std::istream&, CGenome&, const std::string&, bool fSynonyms );
00193 
00194         void Reset( );
00195 
00196         const std::string&                  m_strOrganism;
00197         bool                                m_fOrganism;
00198         bool                                m_fPathing;
00199         bool                                m_fSynonyms;
00200         std::vector<CGene*>                 m_vecpGenes;
00201         std::vector<std::string>            m_vecstrIDs;
00202         std::map<std::string,std::string>   m_mapGlosses;
00203     };
00204 
00205     COntologyKEGGImpl( );
00206 
00207     bool Open( SParserKEGG& );
00208     bool OpenEntry( SParserKEGG& );
00209     bool OpenReferences( SParserKEGG& );
00210     bool OpenReference( SParserKEGG& );
00211     bool OpenName( SParserKEGG& );
00212     bool OpenDisease( SParserKEGG& );
00213     bool OpenPathway( SParserKEGG& );
00214     bool OpenModule( SParserKEGG& );
00215     bool OpenDefinition( SParserKEGG& );
00216     bool OpenClass( SParserKEGG& );
00217     bool OpenDBLinks( SParserKEGG& );
00218     bool OpenGenes( SParserKEGG& );
00219     bool OpenOrganism( SParserKEGG& );
00220     char* OpenGene( SParserKEGG&, char* );
00221     bool OpenEnd( SParserKEGG& );
00222     bool OpenGloss( SParserKEGG& );
00223 };
00224 
00225 class COntologyOBOImpl : protected COntologyImpl {
00226 protected:
00227     static const char   c_szAltID[];
00228     static const char   c_szOBO[];
00229     static const char   c_szHUMAN[];
00230     static const char   c_szID[];
00231     static const char   c_szIsA[];
00232     static const char   c_szIsObsolete[];
00233     static const char   c_szName[];
00234     static const char   c_szNamespace[];
00235     static const char   c_szNOT[];
00236     static const char   c_szPartOf[];
00237     static const char   c_szRelationship[];
00238     static const char   c_szSGD[];
00239     static const char   c_szTerm[];
00240     
00241     struct SParserOBO : SParser {
00242         typedef std::set<const CGene*>  TSetPGene;
00243 
00244         SParserOBO( std::istream&, CGenome&, bool = false, bool = false );
00245 
00246         void Reset( );
00247 
00248         const char*                 m_szTarget;
00249         std::vector<std::vector<std::string> >  m_vecvecstrParents;
00250         bool                        m_fObsolete;
00251         bool                        m_fDBIDs;
00252         bool                        m_fSynonyms;
00253         std::string                 m_strNamespace;
00254         std::vector<std::string>    m_vecstrIDs;
00255         std::vector<SNode>          m_vecNodes;
00256         std::vector<TSetPGene>      m_vecsetpGenes;
00257     };
00258 
00259     COntologyOBOImpl( );
00260     
00261     bool OpenOntology( SParserOBO& );
00262     bool OpenHeader( SParserOBO& );
00263     bool OpenBlock( SParserOBO& );
00264     bool OpenTerm( SParserOBO& );
00265     bool OpenID( SParserOBO& );
00266     bool OpenName( SParserOBO& );
00267     bool OpenNamespace( SParserOBO& );
00268     bool OpenRelationship( SParserOBO& );
00269     bool OpenParent( SParserOBO& );
00270     bool OpenAltID( SParserOBO& );
00271     bool OpenObsolete( SParserOBO& );
00272     bool OpenGenes( SParserOBO& );
00273     bool OpenGene( SParserOBO& );
00274 };
00275 
00276 class COntologyMIPSImpl : protected COntologyImpl {
00277 protected:
00278     static const char   c_szMIPS[];
00279 
00280     struct SParserMIPS : SParser {
00281         SParserMIPS( std::istream&, CGenome& );
00282 
00283         std::vector<size_t>                     m_veciParents;
00284         std::vector<std::string>                m_vecstrIDs;
00285         std::vector<std::string>                m_vecstrGlosses;
00286         std::stack<size_t>                      m_stakiHier;
00287         std::vector<std::vector<const CGene*> > m_vecpGenes;
00288     };
00289 
00290     COntologyMIPSImpl( );
00291 
00292     bool OpenOntology( SParserMIPS& );
00293     bool OpenCategory( SParserMIPS& );
00294     size_t OpenID( SParserMIPS& );
00295     bool OpenGenes( SParserMIPS& );
00296     bool OpenGene( SParserMIPS& );
00297 };
00298 
00299 class CSlimImpl : protected CFile {
00300 protected:
00301     void Reset( const IOntology* );
00302 
00303     std::vector<std::string>                m_vecstrSlims;
00304     std::vector<std::vector<size_t> >       m_vecveciTerms;
00305     std::vector<std::vector<const CGene*> > m_vecvecpGenes;
00306     const IOntology*                        m_pOntology;
00307 };
00308 
00309 }
00310 
00311 #endif // ANNOTATIONI_H