Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "genome.h" 00024 #include "meta.h" 00025 #include "annotation.h" 00026 00027 namespace Sleipnir { 00028 00039 CGene::CGene( const std::string& strID ) : CGeneImpl(strID) { } 00040 00041 CGeneImpl::CGeneImpl( const std::string& strName ) : m_strName(strName), 00042 m_iOntologies(0), m_apOntologies(NULL), m_apveciAnnotations(NULL), 00043 m_iSynonyms(0), m_astrSynonyms(NULL), m_fRNA(false), m_fDubious(false) { } 00044 00045 CGeneImpl::~CGeneImpl( ) { 00046 size_t i; 00047 00048 if( m_iOntologies ) { 00049 delete[] m_apOntologies; 00050 for( i = 0; i < m_iOntologies; ++i ) 00051 delete m_apveciAnnotations[ i ]; 00052 delete[] m_apveciAnnotations; } 00053 if( m_iSynonyms ) 00054 delete[] m_astrSynonyms; } 00055 00056 CGeneImpl& CGeneImpl::operator=( const CGeneImpl& Gene ) { 00057 size_t i, j; 00058 00059 m_strName = Gene.m_strName; 00060 if( m_iSynonyms = Gene.m_iSynonyms ) { 00061 m_astrSynonyms = new string[ m_iSynonyms ]; 00062 for( i = 0; i < m_iSynonyms; ++i ) 00063 m_astrSynonyms[ i ] = Gene.m_astrSynonyms[ i ]; } 00064 if( m_iOntologies = Gene.m_iOntologies ) { 00065 m_apOntologies = new IOntology const*[ m_iOntologies ]; 00066 m_apveciAnnotations = new vector<size_t>*[ m_iOntologies ]; 00067 for( i = 0; i < m_iOntologies; ++i ) { 00068 m_apOntologies[ i ] = Gene.m_apOntologies[ i ]; 00069 m_apveciAnnotations[ i ] = new vector<size_t>( ); 00070 m_apveciAnnotations[ i ]->resize( Gene.m_apveciAnnotations[ i ]->size( ) ); 00071 for( j = 0; j < m_apveciAnnotations[ i ]->size( ); ++j ) 00072 (*m_apveciAnnotations[ i ])[ j ] = (*Gene.m_apveciAnnotations[ i ])[ j ]; } } 00073 00074 return *this; } 00075 00100 bool CGene::AddAnnotation( const IOntology* pOntology, size_t iTerm ) { 00101 size_t i, iOnto; 00102 00103 for( iOnto = 0; iOnto < m_iOntologies; ++iOnto ) 00104 if( m_apOntologies[ iOnto ] == pOntology ) 00105 break; 00106 if( iOnto >= m_iOntologies ) 00107 IncrementOntologies( pOntology ); 00108 00109 for( i = 0; i < m_apveciAnnotations[ iOnto ]->size( ); ++i ) 00110 if( (*m_apveciAnnotations[ iOnto ])[ i ] == iTerm ) 00111 return false; 00112 m_apveciAnnotations[ iOnto ]->push_back( iTerm ); 00113 return true; } 00114 00115 void CGeneImpl::IncrementOntologies( const IOntology* pOntology ) { 00116 const IOntology** apOntologies; 00117 vector<size_t>** apveciAnnotations; 00118 00119 apOntologies = new IOntology const*[ m_iOntologies + 1 ]; 00120 if( m_apOntologies ) { 00121 memcpy( apOntologies, m_apOntologies, m_iOntologies * sizeof(*m_apOntologies) ); 00122 delete[] m_apOntologies; } 00123 apOntologies[ m_iOntologies ] = pOntology; 00124 m_apOntologies = apOntologies; 00125 00126 apveciAnnotations = new vector<size_t>*[ m_iOntologies + 1 ]; 00127 if( m_apveciAnnotations ) { 00128 memcpy( apveciAnnotations, m_apveciAnnotations, m_iOntologies * 00129 sizeof(*m_apveciAnnotations) ); 00130 delete[] m_apveciAnnotations; } 00131 apveciAnnotations[ m_iOntologies++ ] = new vector<size_t>; 00132 m_apveciAnnotations = apveciAnnotations; } 00133 00144 bool CGene::IsAnnotated( const IOntology* pOntology ) const { 00145 size_t i; 00146 00147 for( i = 0; i < m_iOntologies; ++i ) 00148 if( pOntology == m_apOntologies[ i ] ) 00149 return true; 00150 00151 return false; } 00152 00170 bool CGene::IsAnnotated( const IOntology* pOntology, size_t iTerm ) const { 00171 size_t i, j; 00172 00173 for( i = 0; i < m_iOntologies; ++i ) 00174 if( pOntology == m_apOntologies[ i ] ) { 00175 for( j = 0; j < m_apveciAnnotations[ i ]->size( ); ++j ) 00176 if( iTerm == (*m_apveciAnnotations[ i ])[ j ] ) 00177 return true; 00178 break; } 00179 00180 return false; } 00181 00198 bool CGene::AddSynonym( const std::string& strName ) { 00199 size_t i; 00200 string* astrSynonyms; 00201 00202 if( !strName.length( ) ) 00203 g_CatSleipnir( ).warn( "CGene::AddSynonym( %s ) adding null synonym to %s", 00204 strName.c_str( ), m_strName.c_str( ) ); 00205 if( strName == m_strName ) 00206 return false; 00207 for( i = 0; i < m_iSynonyms; ++i ) 00208 if( strName == m_astrSynonyms[ i ] ) 00209 return false; 00210 00211 astrSynonyms = new string[ ++m_iSynonyms ]; 00212 for( i = 0; ( i + 1 ) < m_iSynonyms; ++i ) 00213 astrSynonyms[ i ] = m_astrSynonyms[ i ]; 00214 astrSynonyms[ i ] = strName; 00215 00216 if( m_astrSynonyms ) 00217 delete[] m_astrSynonyms; 00218 m_astrSynonyms = astrSynonyms; 00219 return true; } 00220 00221 bool CGene::SetWeight(float weight){ 00222 m_weight = weight; 00223 return true; 00224 }; 00225 00226 00227 00228 const char CGenomeImpl::c_szDubious[] = "Dubious"; 00229 const char CGenomeImpl::c_szORF[] = "ORF"; 00230 const char* CGenomeImpl::c_aszRNA[] = { "ncRNA", "rRNA", "snRNA", "snoRNA", "tRNA", 00231 NULL }; 00232 00233 CGenomeImpl::~CGenomeImpl( ) { 00234 size_t i; 00235 00236 for( i = 0; i < m_vecpGenes.size( ); ++i ) 00237 delete m_vecpGenes[ i ]; } 00238 00252 bool CGenome::Open( std::istream& istmFeatures ) { 00253 static const size_t c_iBuf = 4096; 00254 char szBuf[ c_iBuf ]; 00255 vector<string> vecstrLine, vecstrNames; 00256 size_t i; 00257 00258 while( istmFeatures.peek( ) != EOF ) { 00259 istmFeatures.getline( szBuf, c_iBuf - 1 ); 00260 vecstrLine.clear( ); 00261 CMeta::Tokenize( szBuf, vecstrLine ); 00262 if( vecstrLine.size( ) < 16 ) 00263 return false; 00264 00265 if( vecstrLine[ 1 ] != c_szORF ) { 00266 for( i = 0; c_aszRNA[ i ]; ++i ) 00267 if( vecstrLine[ 1 ] == c_aszRNA[ i ] ) 00268 break; 00269 if( !c_aszRNA[ i ] ) 00270 continue; } 00271 00272 { 00273 CGene& Gene = AddGene( vecstrLine[ 3 ] ); 00274 00275 // 1 Type ORF, CDS, RNA, etc. 00276 // 2 Qualifier Dubious, Verified, etc. 00277 // 3 ORF 00278 // 4 Name 00279 // 5 Aliases 00280 // 15 DESC 00281 Gene.SetRNA( vecstrLine[ 1 ] != c_szORF ); 00282 Gene.SetDubious( vecstrLine[ 2 ] == c_szDubious ); 00283 if( vecstrLine[ 4 ].length( ) ) 00284 AddSynonym( Gene, vecstrLine[ 4 ] ); 00285 if( vecstrLine[ 5 ].length( ) ) { 00286 vecstrNames.clear( ); 00287 CMeta::Tokenize( vecstrLine[ 5 ].c_str( ), vecstrNames, "|" ); 00288 for( i = 0; i < vecstrNames.size( ); ++i ) 00289 AddSynonym( Gene, vecstrNames[ i ] ); } 00290 Gene.SetGloss( vecstrLine[ 15 ] ); 00291 } } 00292 00293 return true; } 00294 00309 bool CGenome::Open( const std::vector<std::string>& vecstrGenes ) { 00310 size_t i; 00311 00312 for( i = 0; i < vecstrGenes.size( ); ++i ) 00313 AddGene( vecstrGenes[ i ] ); 00314 00315 return true; } 00316 00317 bool CGenome::Open( const char* szFile, std::vector<CGenes*>& vecpGenes ) { 00318 ifstream ifsm; 00319 00320 if( !szFile ) 00321 return Open( cin, vecpGenes ); 00322 00323 ifsm.open( szFile ); 00324 return ( ifsm.is_open( ) ? Open( ifsm, vecpGenes ) : false ); } 00325 00326 bool CGenome::Open( std::istream& istmGenes, std::vector<CGenes*>& vecpGenes ) { 00327 char szLine[ c_iBufferSize ]; 00328 string strLine; 00329 vector<string> vecstrLine; 00330 CGenes* pGenes; 00331 00332 while( istmGenes.peek( ) != EOF ) { 00333 istmGenes.getline( szLine, c_iBufferSize - 1 ); 00334 szLine[ c_iBufferSize - 1 ] = 0; 00335 if( !( strLine = CMeta::Trim( szLine ) ).length( ) ) 00336 continue; 00337 vecstrLine.clear( ); 00338 CMeta::Tokenize( strLine.c_str( ), vecstrLine ); 00339 pGenes = new CGenes( *this ); 00340 if( !pGenes->Open( vecstrLine ) ) { 00341 delete pGenes; 00342 g_CatSleipnir( ).error( "CGenome::Open( ) could not open line: %s", szLine ); 00343 return false; } 00344 vecpGenes.push_back( pGenes ); } 00345 00346 return true; } 00347 00368 CGene& CGenome::AddGene( const std::string& strID ) { 00369 TMapStrI::const_iterator iterGene; 00370 CGene* pGene; 00371 00372 if( ( iterGene = m_mapGenes.find( strID ) ) != m_mapGenes.end( ) ) 00373 return GetGene( iterGene->second ); 00374 00375 pGene = new CGene( strID ); 00376 m_vecpGenes.push_back( pGene ); 00377 m_mapGenes[ strID ] = m_vecpGenes.size( ) - 1; 00378 return *pGene; } 00379 00401 size_t CGenome::FindGene( const std::string& strGene ) const { 00402 size_t i, j, iRet; 00403 00404 if( ( iRet = GetGene( strGene ) ) != -1 ) 00405 return iRet; 00406 00407 for( i = 0; i < GetGenes( ); ++i ) 00408 for( j = 0; j < GetGene( i ).GetSynonyms( ); ++j ) 00409 if( strGene == GetGene( i ).GetSynonym( j ) ) 00410 return i; 00411 00412 return -1; } 00413 00421 vector<string> CGenome::GetGeneNames( ) const { 00422 vector<string> vecstrRet; 00423 size_t i; 00424 00425 vecstrRet.resize( m_vecpGenes.size( ) ); 00426 for( i = 0; i < vecstrRet.size( ); ++i ) 00427 vecstrRet[ i ] = m_vecpGenes[ i ]->GetName( ); 00428 00429 return vecstrRet; } 00430 00444 size_t CGenome::CountGenes( const IOntology* pOntology ) const { 00445 size_t i, j, iRet; 00446 00447 for( iRet = i = 0; i < m_vecpGenes.size( ); ++i ) 00448 for( j = 0; j < m_vecpGenes[ i ]->GetOntologies( ); ++j ) 00449 if( pOntology == m_vecpGenes[ i ]->GetOntology( j ) ) { 00450 iRet++; 00451 break; } 00452 00453 return iRet; } 00454 00474 bool CGenome::AddSynonym( CGene& Gene, const std::string& strName ) { 00475 00476 if( ( strName != Gene.GetName( ) ) && ( Gene.AddSynonym( strName ) ) ) { 00477 m_mapGenes[ strName ] = m_mapGenes[ Gene.GetName( ) ]; 00478 return true; } 00479 00480 return false; } 00481 00507 bool CGenes::Open( const char* szFile, CGenome& Genome, std::vector<std::string>& vecstrNames, std::vector<CGenes*>& vecpGenes ) { 00508 ifstream ifsm; 00509 vector<char> veccBuffer; 00510 istream* pistm; 00511 bool fRet; 00512 00513 if( szFile ) { 00514 ifsm.open( szFile ); 00515 pistm = &ifsm; } 00516 else 00517 pistm = &cin; 00518 if( !ifsm.is_open( ) ) { 00519 g_CatSleipnir( ).error( "CGenes::Open( %s ) could not open file", szFile ? szFile : "stdin" ); 00520 return false; } 00521 00522 veccBuffer.resize( CFile::GetBufferSize( ) ); 00523 fRet = false; 00524 while( !pistm->eof( ) ) { 00525 vector<string> vecstrLine; 00526 00527 pistm->getline( &veccBuffer[0], veccBuffer.size( ) - 1 ); 00528 CMeta::Tokenize( &veccBuffer[0], vecstrLine ); 00529 if( vecstrLine.empty( ) ) 00530 continue; 00531 vecstrNames.push_back( vecstrLine[0] ); 00532 vecstrLine.erase( vecstrLine.begin( ) ); 00533 vecpGenes.push_back( new CGenes( Genome ) ); 00534 if( !( fRet = vecpGenes.back( )->Open( vecstrLine ) ) ) 00535 break; } 00536 if( szFile ) 00537 ifsm.close( ); 00538 00539 return fRet; } 00540 00548 CGenes::CGenes( CGenome& Genome ) : CGenesImpl( Genome ) { } 00549 00550 CGenesImpl::CGenesImpl( CGenome& Genome ) : m_Genome(Genome),isWeighted(false) { } 00551 00578 bool CGenes::Open( std::istream& istm, bool fCreate ) { 00579 static const size_t c_iBuffer = 1024; 00580 char szBuf[ c_iBuffer ]; 00581 CGene* pGene; 00582 size_t i, iGene; 00583 char* pc; 00584 00585 if( istm.rdstate( ) != ios_base::goodbit ) 00586 return false; 00587 00588 m_mapGenes.clear( ); 00589 m_vecpGenes.clear( ); 00590 while( istm.peek( ) != EOF ) { 00591 istm.getline( szBuf, c_iBuffer - 1 ); 00592 if( pc = strchr( szBuf, '\t' ) ) 00593 *pc = 0; 00594 if( !szBuf[ 0 ] || ( szBuf[ 0 ] == c_cComment ) ) 00595 continue; 00596 if( fCreate ) 00597 pGene = &m_Genome.AddGene( szBuf ); 00598 else { 00599 if( ( iGene = m_Genome.FindGene( szBuf ) ) == -1 ) { 00600 g_CatSleipnir( ).warn( "CGenes::Open( %d ) unknown gene: %s", fCreate, szBuf ); 00601 continue; } 00602 pGene = &m_Genome.GetGene( iGene ); } 00603 for( i = 0; i < m_vecpGenes.size( ); ++i ) 00604 if( m_vecpGenes[ i ] == pGene ) 00605 break; 00606 if( i != m_vecpGenes.size( ) ) 00607 continue; 00608 m_mapGenes[ pGene->GetName( ) ] = m_vecpGenes.size( ); 00609 m_vecpGenes.push_back( pGene ); } 00610 isWeighted = false; 00611 return true; } 00612 00639 bool CGenes::OpenWeighted( std::istream& istm, bool fCreate ) { 00640 static const size_t c_iBuffer = 1024; 00641 char szBuf[ c_iBuffer ]; 00642 CGene* pGene; 00643 size_t i, iGene; 00644 char* pc; 00645 vector<string> vecstrTokens; 00646 00647 if( istm.rdstate( ) != ios_base::goodbit ) 00648 return false; 00649 00650 m_mapGenes.clear( ); 00651 m_vecpGenes.clear( ); 00652 while( istm.peek( ) != EOF ) { 00653 istm.getline( szBuf, c_iBuffer - 1 ); 00654 //if( pc = strchr( szBuf, '\t' ) ) 00655 // *pc = 0; 00656 if( !szBuf[ 0 ] || ( szBuf[ 0 ] == c_cComment ) ) 00657 continue; 00658 szBuf[c_iBuffer - 1] = 0; 00659 vecstrTokens.clear(); 00660 CMeta::Tokenize(szBuf, vecstrTokens); 00661 if (vecstrTokens.empty()) 00662 continue; 00663 if (vecstrTokens.size() != 2) { 00664 //cerr << "Illegal label line (" << vecstrTokens.size() << "): " 00665 // << szBuf << endl; 00666 return false; 00667 } 00668 00669 if( fCreate ) 00670 pGene = &m_Genome.AddGene( vecstrTokens[0] ); 00671 else { 00672 if( ( iGene = m_Genome.FindGene( vecstrTokens[0] ) ) == -1 ) { 00673 g_CatSleipnir( ).warn( "CGenes::Open( %d ) unknown gene: %s", fCreate, vecstrTokens[0].c_str() ); 00674 continue; } 00675 pGene = &m_Genome.GetGene( iGene ); } 00676 pGene->SetWeight(atof(vecstrTokens[1].c_str())); 00677 for( i = 0; i < m_vecpGenes.size( ); ++i ) 00678 if( m_vecpGenes[ i ] == pGene ) 00679 break; 00680 if( i != m_vecpGenes.size( ) ) 00681 continue; 00682 m_mapGenes[ pGene->GetName( ) ] = m_vecpGenes.size( ); 00683 m_vecpGenes.push_back( pGene ); } 00684 isWeighted = true; 00685 return true; } 00686 00687 00711 size_t CGenes::CountAnnotations( const IOntology* pOntology, size_t iTerm, bool fRecursive, 00712 const CGenes* pBackground ) const { 00713 size_t i, iRet; 00714 00715 for( iRet = i = 0; i < m_vecpGenes.size( ); ++i ) 00716 if( ( !pBackground || pBackground->IsGene( m_vecpGenes[ i ]->GetName( ) ) ) && 00717 pOntology->IsAnnotated( iTerm, *m_vecpGenes[ i ], fRecursive ) ) 00718 iRet++; 00719 00720 return iRet; } 00721 00742 bool CGenes::Open( const std::vector<std::string>& vecstrGenes, bool fCreate ) { 00743 size_t i, iGene; 00744 CGene* pGene; 00745 00746 m_mapGenes.clear( ); 00747 m_vecpGenes.clear( ); 00748 for( i = 0; i < vecstrGenes.size( ); ++i ) { 00749 if( !fCreate && ( ( iGene = m_Genome.FindGene( vecstrGenes[ i ] ) ) == -1 ) ) 00750 continue; 00751 00752 pGene = fCreate ? &m_Genome.AddGene( vecstrGenes[ i ] ) : &m_Genome.GetGene( iGene ); 00753 m_mapGenes[ vecstrGenes[ i ] ] = m_vecpGenes.size( ); 00754 m_vecpGenes.push_back( pGene ); } 00755 isWeighted = false; 00756 return true; } 00757 00769 void CGenes::Filter( const CGenes& GenesExclude ) { 00770 size_t i, j, iSize; 00771 00772 iSize = m_vecpGenes.size( ); 00773 for( i = 0; i < GenesExclude.GetGenes( ); ++i ) 00774 for( j = 0; j < iSize; ++j ) 00775 if( m_vecpGenes[ j ] == &GenesExclude.GetGene( i ) ) { 00776 m_mapGenes.erase( m_vecpGenes[ j ]->GetName( ) ); 00777 m_vecpGenes[ j ] = m_vecpGenes[ m_vecpGenes.size( ) - 1 ]; 00778 iSize--; 00779 break; } 00780 m_vecpGenes.resize( iSize ); } 00781 00789 vector<string> CGenes::GetGeneNames( ) const { 00790 vector<string> vecstrRet; 00791 size_t i; 00792 00793 vecstrRet.resize( m_vecpGenes.size( ) ); 00794 for( i = 0; i < vecstrRet.size( ); ++i ) 00795 vecstrRet[ i ] = m_vecpGenes[ i ]->GetName( ); 00796 00797 return vecstrRet; } 00798 00799 }