Sleipnir
src/genome.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "genome.h"
00024 #include "meta.h"
00025 #include "annotation.h"
00026 
00027 namespace Sleipnir {
00028 
00039 CGene::CGene( const std::string& strID ) : CGeneImpl(strID) { }
00040 
00041 CGeneImpl::CGeneImpl( const std::string& strName ) : m_strName(strName),
00042     m_iOntologies(0), m_apOntologies(NULL), m_apveciAnnotations(NULL),
00043     m_iSynonyms(0), m_astrSynonyms(NULL), m_fRNA(false), m_fDubious(false) { }
00044 
00045 CGeneImpl::~CGeneImpl( ) {
00046     size_t  i;
00047 
00048     if( m_iOntologies ) {
00049         delete[] m_apOntologies;
00050         for( i = 0; i < m_iOntologies; ++i )
00051             delete m_apveciAnnotations[ i ];
00052         delete[] m_apveciAnnotations; }
00053     if( m_iSynonyms )
00054         delete[] m_astrSynonyms; }
00055 
00056 CGeneImpl& CGeneImpl::operator=( const CGeneImpl& Gene ) {
00057     size_t  i, j;
00058 
00059     m_strName = Gene.m_strName;
00060     if( m_iSynonyms = Gene.m_iSynonyms ) {
00061         m_astrSynonyms = new string[ m_iSynonyms ];
00062         for( i = 0; i < m_iSynonyms; ++i )
00063             m_astrSynonyms[ i ] = Gene.m_astrSynonyms[ i ]; }
00064     if( m_iOntologies = Gene.m_iOntologies ) {
00065         m_apOntologies = new IOntology const*[ m_iOntologies ];
00066         m_apveciAnnotations = new vector<size_t>*[ m_iOntologies ];
00067         for( i = 0; i < m_iOntologies; ++i ) {
00068             m_apOntologies[ i ] = Gene.m_apOntologies[ i ];
00069             m_apveciAnnotations[ i ] = new vector<size_t>( );
00070             m_apveciAnnotations[ i ]->resize( Gene.m_apveciAnnotations[ i ]->size( ) );
00071             for( j = 0; j < m_apveciAnnotations[ i ]->size( ); ++j )
00072                 (*m_apveciAnnotations[ i ])[ j ] = (*Gene.m_apveciAnnotations[ i ])[ j ]; } }
00073 
00074     return *this; }
00075 
00100 bool CGene::AddAnnotation( const IOntology* pOntology, size_t iTerm ) {
00101     size_t  i, iOnto;
00102 
00103     for( iOnto = 0; iOnto < m_iOntologies; ++iOnto )
00104         if( m_apOntologies[ iOnto ] == pOntology )
00105             break;
00106     if( iOnto >= m_iOntologies )
00107         IncrementOntologies( pOntology );
00108 
00109     for( i = 0; i < m_apveciAnnotations[ iOnto ]->size( ); ++i )
00110         if( (*m_apveciAnnotations[ iOnto ])[ i ] == iTerm )
00111             return false;
00112     m_apveciAnnotations[ iOnto ]->push_back( iTerm );
00113     return true; }
00114 
00115 void CGeneImpl::IncrementOntologies( const IOntology* pOntology ) {
00116     const IOntology**   apOntologies;
00117     vector<size_t>**    apveciAnnotations;
00118 
00119     apOntologies = new IOntology const*[ m_iOntologies + 1 ];
00120     if( m_apOntologies ) {
00121         memcpy( apOntologies, m_apOntologies, m_iOntologies * sizeof(*m_apOntologies) );
00122         delete[] m_apOntologies; }
00123     apOntologies[ m_iOntologies ] = pOntology;
00124     m_apOntologies = apOntologies;
00125 
00126     apveciAnnotations = new vector<size_t>*[ m_iOntologies + 1 ];
00127     if( m_apveciAnnotations ) {
00128         memcpy( apveciAnnotations, m_apveciAnnotations, m_iOntologies *
00129             sizeof(*m_apveciAnnotations) );
00130         delete[] m_apveciAnnotations; }
00131     apveciAnnotations[ m_iOntologies++ ] = new vector<size_t>;
00132     m_apveciAnnotations = apveciAnnotations; }
00133 
00144 bool CGene::IsAnnotated( const IOntology* pOntology ) const {
00145     size_t  i;
00146 
00147     for( i = 0; i < m_iOntologies; ++i )
00148         if( pOntology == m_apOntologies[ i ] )
00149             return true;
00150 
00151     return false; }
00152 
00170 bool CGene::IsAnnotated( const IOntology* pOntology, size_t iTerm ) const {
00171     size_t  i, j;
00172 
00173     for( i = 0; i < m_iOntologies; ++i )
00174         if( pOntology == m_apOntologies[ i ] ) {
00175             for( j = 0; j < m_apveciAnnotations[ i ]->size( ); ++j )
00176                 if( iTerm == (*m_apveciAnnotations[ i ])[ j ] )
00177                     return true;
00178             break; }
00179 
00180     return false; }
00181 
00198 bool CGene::AddSynonym( const std::string& strName ) {
00199     size_t  i;
00200     string* astrSynonyms;
00201 
00202     if( !strName.length( ) )
00203         g_CatSleipnir( ).warn( "CGene::AddSynonym( %s ) adding null synonym to %s",
00204             strName.c_str( ), m_strName.c_str( ) );
00205     if( strName == m_strName )
00206         return false;
00207     for( i = 0; i < m_iSynonyms; ++i )
00208         if( strName == m_astrSynonyms[ i ] )
00209             return false;
00210 
00211     astrSynonyms = new string[ ++m_iSynonyms ];
00212     for( i = 0; ( i + 1 ) < m_iSynonyms; ++i )
00213         astrSynonyms[ i ] = m_astrSynonyms[ i ];
00214     astrSynonyms[ i ] = strName;
00215 
00216     if( m_astrSynonyms )
00217         delete[] m_astrSynonyms;
00218     m_astrSynonyms = astrSynonyms;
00219     return true; }
00220 
00221 bool CGene::SetWeight(float weight){
00222     m_weight = weight;
00223     return true;
00224 };
00225 
00226 
00227 
00228 const char  CGenomeImpl::c_szDubious[]  = "Dubious";
00229 const char  CGenomeImpl::c_szORF[]      = "ORF";
00230 const char* CGenomeImpl::c_aszRNA[]     = { "ncRNA", "rRNA", "snRNA", "snoRNA", "tRNA",
00231     NULL };
00232 
00233 CGenomeImpl::~CGenomeImpl( ) {
00234     size_t  i;
00235 
00236     for( i = 0; i < m_vecpGenes.size( ); ++i )
00237         delete m_vecpGenes[ i ]; }
00238 
00252 bool CGenome::Open( std::istream& istmFeatures ) {
00253     static const size_t c_iBuf  = 4096;
00254     char            szBuf[ c_iBuf ];
00255     vector<string>  vecstrLine, vecstrNames;
00256     size_t          i;
00257 
00258     while( istmFeatures.peek( ) != EOF ) {
00259         istmFeatures.getline( szBuf, c_iBuf - 1 );
00260         vecstrLine.clear( );
00261         CMeta::Tokenize( szBuf, vecstrLine );
00262         if( vecstrLine.size( ) < 16 )
00263             return false;
00264 
00265         if( vecstrLine[ 1 ] != c_szORF ) {
00266             for( i = 0; c_aszRNA[ i ]; ++i )
00267                 if( vecstrLine[ 1 ] == c_aszRNA[ i ] )
00268                     break;
00269             if( !c_aszRNA[ i ] )
00270                 continue; }
00271 
00272         {
00273             CGene&  Gene    = AddGene( vecstrLine[ 3 ] );
00274 
00275 //  1   Type        ORF, CDS, RNA, etc.
00276 //  2   Qualifier   Dubious, Verified, etc.
00277 //  3   ORF
00278 //  4   Name
00279 //  5   Aliases
00280 //  15  DESC
00281             Gene.SetRNA( vecstrLine[ 1 ] != c_szORF );
00282             Gene.SetDubious( vecstrLine[ 2 ] == c_szDubious );
00283             if( vecstrLine[ 4 ].length( ) )
00284                 AddSynonym( Gene, vecstrLine[ 4 ] );
00285             if( vecstrLine[ 5 ].length( ) ) {
00286                 vecstrNames.clear( );
00287                 CMeta::Tokenize( vecstrLine[ 5 ].c_str( ), vecstrNames, "|" );
00288                 for( i = 0; i < vecstrNames.size( ); ++i )
00289                     AddSynonym( Gene, vecstrNames[ i ] ); }
00290             Gene.SetGloss( vecstrLine[ 15 ] );
00291         } }
00292 
00293     return true; }
00294 
00309 bool CGenome::Open( const std::vector<std::string>& vecstrGenes ) {
00310     size_t  i;
00311 
00312     for( i = 0; i < vecstrGenes.size( ); ++i )
00313         AddGene( vecstrGenes[ i ] );
00314 
00315     return true; }
00316 
00317 bool CGenome::Open( const char* szFile, std::vector<CGenes*>& vecpGenes ) {
00318     ifstream    ifsm;
00319 
00320     if( !szFile )
00321         return Open( cin, vecpGenes );
00322 
00323     ifsm.open( szFile );
00324     return ( ifsm.is_open( ) ? Open( ifsm, vecpGenes ) : false ); }
00325 
00326 bool CGenome::Open( std::istream& istmGenes, std::vector<CGenes*>& vecpGenes ) {
00327     char            szLine[ c_iBufferSize ];
00328     string          strLine;
00329     vector<string>  vecstrLine;
00330     CGenes*         pGenes;
00331 
00332     while( istmGenes.peek( ) != EOF ) {
00333         istmGenes.getline( szLine, c_iBufferSize - 1 );
00334         szLine[ c_iBufferSize - 1 ] = 0;
00335         if( !( strLine = CMeta::Trim( szLine ) ).length( ) )
00336             continue;
00337         vecstrLine.clear( );
00338         CMeta::Tokenize( strLine.c_str( ), vecstrLine );
00339         pGenes = new CGenes( *this );
00340         if( !pGenes->Open( vecstrLine ) ) {
00341             delete pGenes;
00342             g_CatSleipnir( ).error( "CGenome::Open( ) could not open line: %s", szLine );
00343             return false; }
00344         vecpGenes.push_back( pGenes ); }
00345 
00346     return true; }
00347 
00368 CGene& CGenome::AddGene( const std::string& strID ) {
00369     TMapStrI::const_iterator    iterGene;
00370     CGene*                      pGene;
00371 
00372     if( ( iterGene = m_mapGenes.find( strID ) ) != m_mapGenes.end( ) )
00373         return GetGene( iterGene->second );
00374 
00375     pGene = new CGene( strID );
00376     m_vecpGenes.push_back( pGene );
00377     m_mapGenes[ strID ] = m_vecpGenes.size( ) - 1;
00378     return *pGene; }
00379 
00401 size_t CGenome::FindGene( const std::string& strGene ) const {
00402     size_t  i, j, iRet;
00403 
00404     if( ( iRet = GetGene( strGene ) ) != -1 )
00405         return iRet;
00406 
00407     for( i = 0; i < GetGenes( ); ++i )
00408         for( j = 0; j < GetGene( i ).GetSynonyms( ); ++j )
00409             if( strGene == GetGene( i ).GetSynonym( j ) )
00410                 return i;
00411 
00412     return -1; }
00413 
00421 vector<string> CGenome::GetGeneNames( ) const {
00422     vector<string>  vecstrRet;
00423     size_t          i;
00424 
00425     vecstrRet.resize( m_vecpGenes.size( ) );
00426     for( i = 0; i < vecstrRet.size( ); ++i )
00427         vecstrRet[ i ] = m_vecpGenes[ i ]->GetName( );
00428 
00429     return vecstrRet; }
00430 
00444 size_t CGenome::CountGenes( const IOntology* pOntology ) const {
00445     size_t  i, j, iRet;
00446 
00447     for( iRet = i = 0; i < m_vecpGenes.size( ); ++i )
00448         for( j = 0; j < m_vecpGenes[ i ]->GetOntologies( ); ++j )
00449             if( pOntology == m_vecpGenes[ i ]->GetOntology( j ) ) {
00450                 iRet++;
00451                 break; }
00452 
00453     return iRet; }
00454 
00474 bool CGenome::AddSynonym( CGene& Gene, const std::string& strName ) {
00475 
00476     if( ( strName != Gene.GetName( ) ) && ( Gene.AddSynonym( strName ) ) ) {
00477         m_mapGenes[ strName ] = m_mapGenes[ Gene.GetName( ) ];
00478         return true; }
00479 
00480     return false; }
00481 
00507 bool CGenes::Open( const char* szFile, CGenome& Genome, std::vector<std::string>& vecstrNames, std::vector<CGenes*>& vecpGenes ) {
00508     ifstream        ifsm;
00509     vector<char>    veccBuffer;
00510     istream*        pistm;
00511     bool            fRet;
00512 
00513     if( szFile ) {
00514         ifsm.open( szFile );
00515         pistm = &ifsm; }
00516     else
00517         pistm = &cin;
00518     if( !ifsm.is_open( ) ) {
00519         g_CatSleipnir( ).error( "CGenes::Open( %s ) could not open file", szFile ? szFile : "stdin" );
00520         return false; }
00521 
00522     veccBuffer.resize( CFile::GetBufferSize( ) );
00523     fRet = false;
00524     while( !pistm->eof( ) ) {
00525         vector<string>  vecstrLine;
00526 
00527         pistm->getline( &veccBuffer[0], veccBuffer.size( ) - 1 );
00528         CMeta::Tokenize( &veccBuffer[0], vecstrLine );
00529         if( vecstrLine.empty( ) )
00530             continue;
00531         vecstrNames.push_back( vecstrLine[0] );
00532         vecstrLine.erase( vecstrLine.begin( ) );
00533         vecpGenes.push_back( new CGenes( Genome ) );
00534         if( !( fRet = vecpGenes.back( )->Open( vecstrLine ) ) )
00535             break; }
00536     if( szFile )
00537         ifsm.close( );
00538 
00539     return fRet; }
00540 
00548 CGenes::CGenes( CGenome& Genome ) : CGenesImpl( Genome ) { }
00549 
00550 CGenesImpl::CGenesImpl( CGenome& Genome ) : m_Genome(Genome),isWeighted(false) { }
00551 
00578 bool CGenes::Open( std::istream& istm, bool fCreate ) {
00579     static const size_t c_iBuffer   = 1024;
00580     char    szBuf[ c_iBuffer ];
00581     CGene*  pGene;
00582     size_t  i, iGene;
00583     char*   pc;
00584 
00585     if( istm.rdstate( ) != ios_base::goodbit )
00586         return false;
00587 
00588     m_mapGenes.clear( );
00589     m_vecpGenes.clear( );
00590     while( istm.peek( ) != EOF ) {
00591         istm.getline( szBuf, c_iBuffer - 1 );
00592         if( pc = strchr( szBuf, '\t' ) )
00593             *pc = 0;
00594         if( !szBuf[ 0 ] || ( szBuf[ 0 ] == c_cComment ) )
00595             continue;
00596         if( fCreate )
00597             pGene = &m_Genome.AddGene( szBuf );
00598         else {
00599             if( ( iGene = m_Genome.FindGene( szBuf ) ) == -1 ) {
00600                 g_CatSleipnir( ).warn( "CGenes::Open( %d ) unknown gene: %s", fCreate, szBuf );
00601                 continue; }
00602             pGene = &m_Genome.GetGene( iGene ); }
00603         for( i = 0; i < m_vecpGenes.size( ); ++i )
00604             if( m_vecpGenes[ i ] == pGene )
00605                 break;
00606         if( i != m_vecpGenes.size( ) )
00607             continue;
00608         m_mapGenes[ pGene->GetName( ) ] = m_vecpGenes.size( );
00609         m_vecpGenes.push_back( pGene ); }
00610     isWeighted = false;
00611     return true; }
00612 
00639 bool CGenes::OpenWeighted( std::istream& istm, bool fCreate ) {
00640     static const size_t c_iBuffer   = 1024;
00641     char    szBuf[ c_iBuffer ];
00642     CGene*  pGene;
00643     size_t  i, iGene;
00644     char*   pc;
00645     vector<string> vecstrTokens;
00646 
00647     if( istm.rdstate( ) != ios_base::goodbit )
00648         return false;
00649 
00650     m_mapGenes.clear( );
00651     m_vecpGenes.clear( );
00652     while( istm.peek( ) != EOF ) {
00653         istm.getline( szBuf, c_iBuffer - 1 );
00654         //if( pc = strchr( szBuf, '\t' ) )
00655         //  *pc = 0;
00656         if( !szBuf[ 0 ] || ( szBuf[ 0 ] == c_cComment ) )
00657             continue;
00658         szBuf[c_iBuffer - 1] = 0;
00659         vecstrTokens.clear();
00660         CMeta::Tokenize(szBuf, vecstrTokens);
00661         if (vecstrTokens.empty())
00662             continue;
00663         if (vecstrTokens.size() != 2) {
00664             //cerr << "Illegal label line (" << vecstrTokens.size() << "): "
00665                 //  << szBuf << endl;
00666             return false;
00667         }
00668 
00669         if( fCreate )
00670             pGene = &m_Genome.AddGene( vecstrTokens[0] );
00671         else {
00672             if( ( iGene = m_Genome.FindGene( vecstrTokens[0] ) ) == -1 ) {
00673                 g_CatSleipnir( ).warn( "CGenes::Open( %d ) unknown gene: %s", fCreate, vecstrTokens[0].c_str() );
00674                 continue; }
00675             pGene = &m_Genome.GetGene( iGene ); }
00676         pGene->SetWeight(atof(vecstrTokens[1].c_str()));
00677         for( i = 0; i < m_vecpGenes.size( ); ++i )
00678             if( m_vecpGenes[ i ] == pGene )
00679                 break;
00680         if( i != m_vecpGenes.size( ) )
00681             continue;
00682         m_mapGenes[ pGene->GetName( ) ] = m_vecpGenes.size( );
00683         m_vecpGenes.push_back( pGene ); }
00684     isWeighted = true;
00685     return true; }
00686 
00687 
00711 size_t CGenes::CountAnnotations( const IOntology* pOntology, size_t iTerm, bool fRecursive,
00712     const CGenes* pBackground ) const {
00713     size_t  i, iRet;
00714 
00715     for( iRet = i = 0; i < m_vecpGenes.size( ); ++i )
00716         if( ( !pBackground || pBackground->IsGene( m_vecpGenes[ i ]->GetName( ) ) ) &&
00717             pOntology->IsAnnotated( iTerm, *m_vecpGenes[ i ], fRecursive ) )
00718             iRet++;
00719 
00720     return iRet; }
00721 
00742 bool CGenes::Open( const std::vector<std::string>& vecstrGenes, bool fCreate ) {
00743     size_t  i, iGene;
00744     CGene*  pGene;
00745 
00746     m_mapGenes.clear( );
00747     m_vecpGenes.clear( );
00748     for( i = 0; i < vecstrGenes.size( ); ++i ) {
00749         if( !fCreate && ( ( iGene = m_Genome.FindGene( vecstrGenes[ i ] ) ) == -1 ) )
00750             continue;
00751 
00752         pGene = fCreate ? &m_Genome.AddGene( vecstrGenes[ i ] ) : &m_Genome.GetGene( iGene );
00753         m_mapGenes[ vecstrGenes[ i ] ] = m_vecpGenes.size( );
00754         m_vecpGenes.push_back( pGene ); }
00755     isWeighted = false;
00756     return true; }
00757 
00769 void CGenes::Filter( const CGenes& GenesExclude ) {
00770     size_t  i, j, iSize;
00771 
00772     iSize = m_vecpGenes.size( );
00773     for( i = 0; i < GenesExclude.GetGenes( ); ++i )
00774         for( j = 0; j < iSize; ++j )
00775             if( m_vecpGenes[ j ] == &GenesExclude.GetGene( i ) ) {
00776                 m_mapGenes.erase( m_vecpGenes[ j ]->GetName( ) );
00777                 m_vecpGenes[ j ] = m_vecpGenes[ m_vecpGenes.size( ) - 1 ];
00778                 iSize--;
00779                 break; }
00780     m_vecpGenes.resize( iSize ); }
00781 
00789 vector<string> CGenes::GetGeneNames( ) const {
00790     vector<string>  vecstrRet;
00791     size_t          i;
00792 
00793     vecstrRet.resize( m_vecpGenes.size( ) );
00794     for( i = 0; i < vecstrRet.size( ); ++i )
00795         vecstrRet[ i ] = m_vecpGenes[ i ]->GetName( );
00796 
00797     return vecstrRet; }
00798 
00799 }