Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #ifndef ANNOTATIONI_H 00023 #define ANNOTATIONI_H 00024 00025 #include <map> 00026 #include <iostream> 00027 #include <set> 00028 #include <stack> 00029 #include <string> 00030 #include <vector> 00031 00032 #include "file.h" 00033 00034 namespace Sleipnir { 00035 00036 class CGene; 00037 class CGenes; 00038 class CGenome; 00039 class IOntology; 00040 00041 class COntologyImpl { 00042 protected: 00043 typedef std::map<std::string,size_t> TMapStrI; 00044 typedef std::set<const CGene*> TSetPGenes; 00045 00046 struct SNode { 00047 SNode( ); 00048 00049 void Reset( ); 00050 00051 std::string m_strID; 00052 std::string m_strGloss; 00053 size_t m_iParents; 00054 size_t* m_aiParents; 00055 size_t m_iChildren; 00056 size_t* m_aiChildren; 00057 size_t m_iGenes; 00058 const CGene** m_apGenes; 00059 size_t m_iCacheGenes; 00060 const CGene** m_apCacheGenes; 00061 }; 00062 00063 struct SParser { 00064 static const size_t c_iBuffer = 65536; 00065 00066 SParser( std::istream&, CGenome& ); 00067 00068 bool GetLine( ); 00069 bool IsStart( const char* ) const; 00070 00071 std::istream& m_istm; 00072 CGenome& m_Genome; 00073 char m_szLine[ c_iBuffer ]; 00074 std::string m_strGloss; 00075 size_t m_iLine; 00076 }; 00077 00078 COntologyImpl( const std::string& strID ) : m_strID(strID), m_iNodes(0), m_aNodes(NULL) { } 00079 00080 ~COntologyImpl( ) { 00081 00082 Reset( ); } 00083 00084 size_t GetNode( const std::string& ) const; 00085 bool IsAnnotated( size_t, const CGene&, bool ) const; 00086 const CGene& GetGene( size_t, size_t ) const; 00087 void GetGeneNames( std::vector<std::string>& ) const; 00088 void Reset( ); 00089 void CollectGenes( size_t, TSetPGenes& ); 00090 void TermFinder( const CGenes&, std::vector<STermFound>&, bool, bool, bool, float, const CGenes* ) const; 00091 00092 size_t GetNodes( ) const { 00093 00094 return m_iNodes; } 00095 00096 size_t GetParents( size_t iNode ) const { 00097 00098 return m_aNodes[ iNode ].m_iParents; } 00099 00100 size_t GetParent( size_t iNode, size_t iParent ) const { 00101 00102 return m_aNodes[ iNode ].m_aiParents[ iParent ]; } 00103 00104 size_t GetChildren( size_t iNode ) const { 00105 00106 return m_aNodes[ iNode ].m_iChildren; } 00107 00108 size_t GetChild( size_t iNode, size_t iChild ) const { 00109 00110 return m_aNodes[ iNode ].m_aiChildren[ iChild ]; } 00111 00112 size_t GetGenes( size_t iNode, bool fKids ) const { 00113 size_t iRet; 00114 00115 iRet = m_aNodes[ iNode ].m_iGenes; 00116 if( fKids ) { 00117 CollectGenes( iNode ); 00118 iRet += m_aNodes[ iNode ].m_iCacheGenes; } 00119 00120 return iRet; } 00121 00122 const std::string& GetID( ) const { 00123 00124 return m_strID; } 00125 00126 const std::string& GetID( size_t iNode ) const { 00127 00128 return m_aNodes[ iNode ].m_strID; } 00129 00130 const std::string& GetGloss( size_t iNode ) const { 00131 00132 return m_aNodes[ iNode ].m_strGloss; } 00133 00134 void CollectGenes( size_t iNode ) const { 00135 TSetPGenes setpGenes; 00136 00137 if( m_aNodes[ iNode ].m_iCacheGenes == -1 ) 00138 ((COntologyImpl*)this)->CollectGenes( iNode, setpGenes ); } 00139 00140 bool GetChildren( size_t iNode, std::set<size_t>& setiChildren ) const { 00141 size_t i, iChild; 00142 00143 if( setiChildren.find( iNode ) != setiChildren.end( ) ) 00144 return true; 00145 00146 for( i = 0; i < GetChildren( iNode ); ++i ) { 00147 if( !GetChildren( iChild = GetChild( iNode, i ), setiChildren ) ) 00148 return false; 00149 setiChildren.insert( iChild ); } 00150 00151 return true; } 00152 00153 bool GetParents( size_t iNode, std::set<size_t>& setiParents ) const { 00154 size_t i, iParent; 00155 00156 if( setiParents.find( iNode ) != setiParents.end( ) ) 00157 return true; 00158 00159 for( i = 0; i < GetParents( iNode ); ++i ) { 00160 if( !GetParents( iParent = GetParent( iNode, i ), setiParents ) ) 00161 return false; 00162 setiParents.insert( iParent ); } 00163 00164 return true; } 00165 00166 const IOntology* m_pOntology; 00167 std::string m_strID; 00168 size_t m_iNodes; 00169 TMapStrI m_mapNodes; 00170 SNode* m_aNodes; 00171 }; 00172 00173 class COntologyKEGGImpl : protected COntologyImpl { 00174 protected: 00175 static const char c_szKEGG[]; 00176 static const char c_szEntry[]; 00177 static const char c_szName[]; 00178 static const char c_szDefinition[]; 00179 static const char c_szClass[]; 00180 static const char c_szPath[]; 00181 static const char c_szReference[]; 00182 static const char c_szDisease[]; 00183 static const char c_szPathway[]; 00184 static const char c_szModule[]; 00185 static const char c_szBR[]; 00186 static const char c_szDBLinks[]; 00187 static const char c_szGenes[]; 00188 static const char c_szEnd[]; 00189 static const size_t c_iKEGG = 10000; 00190 00191 struct SParserKEGG : SParser { 00192 SParserKEGG( std::istream&, CGenome&, const std::string&, bool fSynonyms ); 00193 00194 void Reset( ); 00195 00196 const std::string& m_strOrganism; 00197 bool m_fOrganism; 00198 bool m_fPathing; 00199 bool m_fSynonyms; 00200 std::vector<CGene*> m_vecpGenes; 00201 std::vector<std::string> m_vecstrIDs; 00202 std::map<std::string,std::string> m_mapGlosses; 00203 }; 00204 00205 COntologyKEGGImpl( ); 00206 00207 bool Open( SParserKEGG& ); 00208 bool OpenEntry( SParserKEGG& ); 00209 bool OpenReferences( SParserKEGG& ); 00210 bool OpenReference( SParserKEGG& ); 00211 bool OpenName( SParserKEGG& ); 00212 bool OpenDisease( SParserKEGG& ); 00213 bool OpenPathway( SParserKEGG& ); 00214 bool OpenModule( SParserKEGG& ); 00215 bool OpenDefinition( SParserKEGG& ); 00216 bool OpenClass( SParserKEGG& ); 00217 bool OpenDBLinks( SParserKEGG& ); 00218 bool OpenGenes( SParserKEGG& ); 00219 bool OpenOrganism( SParserKEGG& ); 00220 char* OpenGene( SParserKEGG&, char* ); 00221 bool OpenEnd( SParserKEGG& ); 00222 bool OpenGloss( SParserKEGG& ); 00223 }; 00224 00225 class COntologyOBOImpl : protected COntologyImpl { 00226 protected: 00227 static const char c_szAltID[]; 00228 static const char c_szOBO[]; 00229 static const char c_szHUMAN[]; 00230 static const char c_szID[]; 00231 static const char c_szIsA[]; 00232 static const char c_szIsObsolete[]; 00233 static const char c_szName[]; 00234 static const char c_szNamespace[]; 00235 static const char c_szNOT[]; 00236 static const char c_szPartOf[]; 00237 static const char c_szRelationship[]; 00238 static const char c_szSGD[]; 00239 static const char c_szTerm[]; 00240 00241 struct SParserOBO : SParser { 00242 typedef std::set<const CGene*> TSetPGene; 00243 00244 SParserOBO( std::istream&, CGenome&, bool = false, bool = false ); 00245 00246 void Reset( ); 00247 00248 const char* m_szTarget; 00249 std::vector<std::vector<std::string> > m_vecvecstrParents; 00250 bool m_fObsolete; 00251 bool m_fDBIDs; 00252 bool m_fSynonyms; 00253 std::string m_strNamespace; 00254 std::vector<std::string> m_vecstrIDs; 00255 std::vector<SNode> m_vecNodes; 00256 std::vector<TSetPGene> m_vecsetpGenes; 00257 }; 00258 00259 COntologyOBOImpl( ); 00260 00261 bool OpenOntology( SParserOBO& ); 00262 bool OpenHeader( SParserOBO& ); 00263 bool OpenBlock( SParserOBO& ); 00264 bool OpenTerm( SParserOBO& ); 00265 bool OpenID( SParserOBO& ); 00266 bool OpenName( SParserOBO& ); 00267 bool OpenNamespace( SParserOBO& ); 00268 bool OpenRelationship( SParserOBO& ); 00269 bool OpenParent( SParserOBO& ); 00270 bool OpenAltID( SParserOBO& ); 00271 bool OpenObsolete( SParserOBO& ); 00272 bool OpenGenes( SParserOBO& ); 00273 bool OpenGene( SParserOBO& ); 00274 }; 00275 00276 class COntologyMIPSImpl : protected COntologyImpl { 00277 protected: 00278 static const char c_szMIPS[]; 00279 00280 struct SParserMIPS : SParser { 00281 SParserMIPS( std::istream&, CGenome& ); 00282 00283 std::vector<size_t> m_veciParents; 00284 std::vector<std::string> m_vecstrIDs; 00285 std::vector<std::string> m_vecstrGlosses; 00286 std::stack<size_t> m_stakiHier; 00287 std::vector<std::vector<const CGene*> > m_vecpGenes; 00288 }; 00289 00290 COntologyMIPSImpl( ); 00291 00292 bool OpenOntology( SParserMIPS& ); 00293 bool OpenCategory( SParserMIPS& ); 00294 size_t OpenID( SParserMIPS& ); 00295 bool OpenGenes( SParserMIPS& ); 00296 bool OpenGene( SParserMIPS& ); 00297 }; 00298 00299 class CSlimImpl : protected CFile { 00300 protected: 00301 void Reset( const IOntology* ); 00302 00303 std::vector<std::string> m_vecstrSlims; 00304 std::vector<std::vector<size_t> > m_vecveciTerms; 00305 std::vector<std::vector<const CGene*> > m_vecvecpGenes; 00306 const IOntology* m_pOntology; 00307 }; 00308 00309 } 00310 00311 #endif // ANNOTATIONI_H