Sleipnir
src/annotation.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "annotation.h"
00024 #include "genome.h"
00025 #include "statistics.h"
00026 
00027 namespace Sleipnir {
00028 
00029 COntologyImpl::SNode::SNode( ) : m_iParents(0), m_aiParents(NULL), m_iChildren(0),
00030     m_aiChildren(NULL), m_iGenes(0), m_apGenes(NULL), m_iCacheGenes(-1),
00031     m_apCacheGenes(NULL) { }
00032 
00033 void COntologyImpl::SNode::Reset( ) {
00034 
00035     m_strID.clear( );
00036     m_iParents = m_iChildren = m_iGenes = 0;
00037     if( m_aiParents )
00038         delete[] m_aiParents;
00039     if( m_aiChildren )
00040         delete[] m_aiChildren;
00041     if( m_apGenes )
00042         delete[] m_apGenes;
00043     if( m_apCacheGenes )
00044         delete[] m_apCacheGenes; }
00045 
00046 COntologyImpl::SParser::SParser( std::istream& istm, CGenome& Genome ) : m_istm(istm),
00047     m_Genome(Genome), m_iLine(0) {
00048 
00049     m_szLine[ 0 ] = 0; }
00050 
00051 bool COntologyImpl::SParser::GetLine( ) {
00052 
00053     m_iLine++;
00054     m_istm.getline( m_szLine, c_iBuffer - 1 );
00055     g_CatSleipnir( ).debug( "COntologyImpl::SParser::GetLine( ) %s", m_szLine );
00056     return true; }
00057 
00058 bool COntologyImpl::SParser::IsStart( const char* szStart ) const {
00059 
00060     return !strncmp( m_szLine, szStart, strlen( szStart ) ); }
00061 
00062 const CGene& COntologyImpl::GetGene( size_t iNode, size_t iGene ) const {
00063     size_t  i;
00064 
00065     if( iGene < ( i = m_aNodes[ iNode ].m_iGenes ) )
00066         return *m_aNodes[ iNode ].m_apGenes[ iGene ];
00067 
00068     CollectGenes( iNode );
00069     return *m_aNodes[ iNode ].m_apCacheGenes[ iGene - i ]; }
00070 
00071 void COntologyImpl::Reset( ) {
00072     size_t  i;
00073 
00074     m_mapNodes.clear( );
00075     if( !m_aNodes )
00076         return;
00077 
00078     for( i = 0; i < m_iNodes; ++i )
00079         m_aNodes[ i ].Reset( );
00080     m_iNodes = 0;
00081     delete[] m_aNodes;
00082     m_aNodes = NULL; }
00083 
00084 bool COntologyImpl::IsAnnotated( size_t iNode, const CGene& Gene, bool fKids ) const {
00085     size_t  i;
00086 
00087     for( i = 0; i < m_aNodes[ iNode ].m_iGenes; ++i )
00088         if( m_aNodes[ iNode ].m_apGenes[ i ] == &Gene )
00089             return true;
00090     if( fKids ) {
00091         CollectGenes( iNode );
00092         for( i = 0; i < m_aNodes[ iNode ].m_iCacheGenes; ++i )
00093             if( m_aNodes[ iNode ].m_apCacheGenes[ i ] == &Gene )
00094                 return true; }
00095 
00096     return false; }
00097 
00098 void COntologyImpl::CollectGenes( size_t iNode, TSetPGenes& setpGenes ) {
00099     SNode&                      sNode   = m_aNodes[ iNode ];
00100     size_t                      i;
00101     TSetPGenes                  setpSelf, setpKids;
00102     TSetPGenes::const_iterator  iterGenes;
00103 
00104     if( sNode.m_iCacheGenes != -1 ) {
00105         for( i = 0; i < sNode.m_iGenes; ++i )
00106             setpGenes.insert( sNode.m_apGenes[ i ] );
00107         for( i = 0; i < sNode.m_iCacheGenes; ++i )
00108             setpGenes.insert( sNode.m_apCacheGenes[ i ] );
00109         return; }
00110 
00111     for( i = 0; i < sNode.m_iGenes; ++i ) {
00112         setpSelf.insert( sNode.m_apGenes[ i ] );
00113         setpGenes.insert( sNode.m_apGenes[ i ] ); }
00114     for( i = 0; i < sNode.m_iChildren; ++i )
00115         CollectGenes( sNode.m_aiChildren[ i ], setpKids );
00116     sNode.m_iCacheGenes = 0;
00117     for( iterGenes = setpKids.begin( ); iterGenes != setpKids.end( ); ++iterGenes )
00118         if( setpSelf.find( *iterGenes ) == setpSelf.end( ) )
00119             sNode.m_iCacheGenes++;
00120     if( sNode.m_iCacheGenes ) {
00121         sNode.m_apCacheGenes = new const CGene*[ sNode.m_iCacheGenes ];
00122         for( i = 0,iterGenes = setpKids.begin( ); iterGenes != setpKids.end( ); ++iterGenes )
00123             if( setpSelf.find( *iterGenes ) == setpSelf.end( ) ) {
00124                 sNode.m_apCacheGenes[ i++ ] = *iterGenes;
00125                 setpGenes.insert( *iterGenes ); } } }
00126 
00127 size_t COntologyImpl::GetNode( const std::string& strID ) const {
00128     TMapStrI::const_iterator    iterNode;
00129 
00130     iterNode = m_mapNodes.find( strID );
00131     return ( ( iterNode == m_mapNodes.end( ) ) ? -1 : iterNode->second ); }
00132 
00133 void COntologyImpl::GetGeneNames( std::vector<std::string>& vecstrGenes ) const {
00134     set<string>                 setstrGenes;
00135     set<string>::const_iterator iterGene;
00136     size_t                      i, j;
00137 
00138     for( i = 0; i < m_iNodes; ++i )
00139         for( j = 0; j < m_aNodes[ i ].m_iGenes; ++j )
00140             setstrGenes.insert( m_aNodes[ i ].m_apGenes[ j ]->GetName( ) );
00141 
00142     for( iterGene = setstrGenes.begin( ); iterGene != setstrGenes.end( ); ++iterGene )
00143         vecstrGenes.push_back( *iterGene ); }
00144 
00145 void COntologyImpl::TermFinder( const CGenes& Genes, vector<STermFound>& vecsTerms, bool fBon,
00146     bool fKids, bool fBack, float dPValue, const CGenes* pBkg ) const {
00147     size_t          i, j, iMult, iBkg, iGenes, iGiven;
00148     double          d;
00149     vector<size_t>  veciAnno;
00150 
00151     for( iGiven = i = 0; i < Genes.GetGenes( ); ++i )
00152         if( Genes.GetGene( i ).IsAnnotated( m_pOntology ) )
00153             iGiven++;
00154     iBkg = pBkg ? pBkg->GetGenes( ) :
00155         ( fBack ? Genes.GetGenome( ).GetGenes( ) : Genes.GetGenome( ).CountGenes( m_pOntology ) );
00156     veciAnno.resize( m_iNodes );
00157     for( iMult = i = 0; i < veciAnno.size( ); ++i )
00158         if( veciAnno[ i ] = Genes.CountAnnotations( m_pOntology, i, fKids, pBkg ) )
00159             iMult++;
00160     for( i = 0; i < m_iNodes; ++i ) {
00161         if( !veciAnno[ i ] )
00162             continue;
00163         if( pBkg ) {
00164             for( iGenes = j = 0; j < pBkg->GetGenes( ); ++j )
00165                 if( IsAnnotated( i, pBkg->GetGene( j ), fKids ) )
00166                     iGenes++; }
00167         else
00168             iGenes = GetGenes( i, fKids );
00169         d = CStatistics::HypergeometricCDF( veciAnno[ i ], iGiven, iGenes, iBkg );
00170         if( fBon ) {
00171             if( ( d *= iMult ) > 1 )
00172                 d = 1; }
00173         if( d <= dPValue )
00174             vecsTerms.push_back( STermFound( i, d, veciAnno[ i ], iGiven, iGenes, iBkg ) ); } }
00175 
00176 void CSlimImpl::Reset( const IOntology* pOntology ) {
00177 
00178     m_pOntology = pOntology;
00179     m_vecstrSlims.clear( );
00180     m_vecveciTerms.clear( );
00181     m_vecvecpGenes.clear( ); }
00182 
00208 bool CSlim::Open( std::istream& istmSlim, const IOntology* pOntology ) {
00209     static const size_t c_iBuffer   = 1024;
00210     char                                szBuf[ c_iBuffer ];
00211     size_t                              i, j, k, iNode;
00212     string                              str;
00213     set<const CGene*>                   setiGenes;
00214     set<const CGene*>::const_iterator   iterGene;
00215 
00216     g_CatSleipnir( ).info( "CSlim::Open( %s )", pOntology->GetID( ).c_str( ) );
00217 
00218     Reset( pOntology );
00219     while( istmSlim.peek( ) != EOF ) {
00220         i = m_vecveciTerms.size( );
00221         m_vecveciTerms.resize( i + 1 );
00222         istmSlim.getline( szBuf, c_iBuffer - 1 );
00223         {
00224             istringstream   issm( szBuf );
00225 
00226             while( issm.peek( ) != EOF ) {
00227                 str = OpenToken( issm );
00228                 cout << str << endl;
00229                 if( !str.length( ) )
00230                     break;
00231                 if( m_vecstrSlims.size( ) <= i )
00232                     m_vecstrSlims.push_back( str );
00233                 else {
00234                     if( ( j = m_pOntology->GetNode( str ) ) == -1 ) {
00235                         g_CatSleipnir( ).error( "CSlim::Open( %s ) unknown node: %s",
00236                             m_pOntology->GetID( ).c_str( ), str.c_str( ) );
00237                         return false; }
00238                     m_vecveciTerms[ i ].push_back( j ); } }
00239         } }
00240 
00241     m_vecvecpGenes.resize( m_vecveciTerms.size( ) );
00242     for( i = 0; i < m_vecveciTerms.size( ); ++i ) {
00243         setiGenes.clear( );
00244         for( j = 0; j < m_vecveciTerms[ i ].size( ); ++j ) {
00245             iNode = m_vecveciTerms[ i ][ j ];
00246             for( k = 0; k < m_pOntology->GetGenes( iNode, true ); ++k )
00247                 setiGenes.insert( &m_pOntology->GetGene( iNode, k ) ); }
00248         for( iterGene = setiGenes.begin( ); iterGene != setiGenes.end( ); ++iterGene )
00249             m_vecvecpGenes[ i ].push_back( *iterGene ); }
00250 
00251     return true; }
00252 
00260 void CSlim::GetGeneNames( std::vector<std::string>& vecstrGenes ) const {
00261     set<const CGene*>                   setpGenes;
00262     set<const CGene*>::const_iterator   iterGene;
00263     size_t                              i, j;
00264 
00265     for( i = 0; i < m_vecvecpGenes.size( ); ++i )
00266         for( j = 0; j < m_vecvecpGenes[ i ].size( ); ++j )
00267             setpGenes.insert( m_vecvecpGenes[ i ][ j ] );
00268 
00269     for( iterGene = setpGenes.begin( ); iterGene != setpGenes.end( ); ++iterGene )
00270         vecstrGenes.push_back( (*iterGene)->GetName( ) ); }
00271 
00272 }