Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "dataset.h" 00024 #include "bayesnetint.h" 00025 #include "genome.h" 00026 #include "compactmatrix.h" 00027 00028 namespace Sleipnir { 00029 00030 CDatasetCompactImpl::CDatasetCompactImpl( ) : m_iData(0), m_aData(NULL) { 00031 00032 m_fContinuous = false; } 00033 00034 CDatasetCompactImpl::~CDatasetCompactImpl( ) { 00035 00036 if( m_aData ) 00037 delete[] m_aData; } 00038 00054 bool CDatasetCompact::Open( const std::vector<std::string>& vecstrDataFiles, bool fMemmap ) { 00055 size_t i; 00056 00057 if( !OpenGenes( vecstrDataFiles ) ) 00058 return false; 00059 if( m_aData ) 00060 delete[] m_aData; 00061 m_aData = new CCompactMatrix[ m_iData = (uint32_t)vecstrDataFiles.size( ) ]; 00062 00063 for( i = 0; i < vecstrDataFiles.size( ); ++i ) { 00064 CDataPair Datum; 00065 00066 if( !( Datum.Open( vecstrDataFiles[ i ].c_str( ), false, fMemmap ) && 00067 CDatasetCompactImpl::Open( Datum, i ) ) ) 00068 return false; } 00069 00070 return true; } 00071 00072 struct SIsGene { 00073 const CGenes& m_Genes; 00074 bool m_fIn; 00075 00076 SIsGene( const CGenes& Genes, bool fIn ) : m_Genes(Genes), m_fIn(fIn) { } 00077 00078 bool operator()( const string& strGene ) { 00079 00080 return ( m_fIn == m_Genes.IsGene( strGene ) ); } 00081 }; 00082 00112 bool CDatasetCompact::Open( const CDataPair& Answers, const char* szDataDirectory, const IBayesNet* pBayesNet, 00113 bool fEverything ) { 00114 CGenome Genome; 00115 CGenes GenesIn( Genome ), GenesEx( Genome ); 00116 00117 return Open( Answers, szDataDirectory, pBayesNet, GenesIn, GenesEx, fEverything ); } 00118 00154 bool CDatasetCompact::Open( const CDataPair& Answers, const char* szDataDirectory, const IBayesNet* pBayesNet, 00155 const CGenes& GenesInclude, const CGenes& GenesExclude, bool fEverything ) { 00156 size_t i; 00157 vector<string> vecstrData, vecstrNodes; 00158 set<string> setstrGenes; 00159 00160 if( pBayesNet->IsContinuous( ) ) 00161 return false; 00162 00163 pBayesNet->GetNodes( vecstrNodes ); 00164 m_iData = 1 + (uint32_t)OpenMax( szDataDirectory, vecstrNodes, true, vecstrData, fEverything ? 00165 &setstrGenes : NULL ); 00166 m_veccQuants.resize( m_iData ); 00167 if( m_aData ) 00168 delete[] m_aData; 00169 m_aData = new CCompactMatrix[ m_iData ]; 00170 00171 if( fEverything ) { 00172 m_vecstrGenes.resize( setstrGenes.size( ) ); 00173 copy( setstrGenes.begin( ), setstrGenes.end( ), m_vecstrGenes.begin( ) ); } 00174 else { 00175 m_vecstrGenes.resize( Answers.GetGenes( ) ); 00176 for( i = 0; i < m_vecstrGenes.size( ); ++i ) 00177 m_vecstrGenes[ i ] = Answers.GetGene( i ); } 00178 if( GenesInclude.GetGenes( ) ) 00179 remove_if( m_vecstrGenes.begin( ), m_vecstrGenes.end( ), SIsGene( GenesInclude, false ) ); 00180 if( GenesExclude.GetGenes( ) ) 00181 remove_if( m_vecstrGenes.begin( ), m_vecstrGenes.end( ), SIsGene( GenesExclude, true ) ); 00182 00183 if( !CDatasetCompactImpl::Open( Answers, 0 ) ) 00184 return false; 00185 for( i = 0; i < vecstrData.size( ); ++i ) { 00186 CDataPair Datum; 00187 00188 if( !( Datum.Open( vecstrData[ i ].c_str( ), false ) && 00189 CDatasetCompactImpl::Open( Datum, i + 1 ) ) ) 00190 return false; } 00191 00192 /* 00193 for( i = 0; i < m_vecstrGenes.size( ); ++i ) 00194 for( j = ( i + 1 ); j < m_vecstrGenes.size( ); ++j ) { 00195 for( k = 1; k < m_iData; ++k ) 00196 if( m_aData[ k ].Get( i, j ) ) 00197 break; 00198 if( k >= m_iData ) 00199 m_aData[ 0 ].Set( i, j, 0 ); } 00200 */ 00201 00202 return true; } 00203 00235 bool CDatasetCompact::Open( const CDataPair& Answers, const std::vector<std::string>& vecstrDataFiles, 00236 bool fEverything, bool fMemmap, size_t iSkip, bool fZScore ) { 00237 size_t i, j, k; 00238 00239 if( Answers.GetGenes( ) && Answers.IsContinuous( ) ) 00240 return false; 00241 00242 m_veciMapping.resize( m_iData = 1 + vecstrDataFiles.size( ) ); 00243 for( i = 0; i < m_veciMapping.size( ); ++i ) 00244 m_veciMapping[ i ] = i; 00245 m_veccQuants.resize( m_iData ); 00246 if( m_aData ) 00247 delete[] m_aData; 00248 m_aData = new CCompactMatrix[ m_iData ]; 00249 00250 if( fEverything ) { 00251 set<string> setstrGenes; 00252 00253 for( i = 0; i < Answers.GetGenes( ); ++i ) 00254 setstrGenes.insert( Answers.GetGene( i ) ); 00255 for( i = 0; i < vecstrDataFiles.size( ); ++i ) { 00256 CDat Dat; 00257 00258 if( !Dat.OpenGenes( vecstrDataFiles[ i ].c_str( ), iSkip ) ) 00259 return false; 00260 for( j = 0; j < Dat.GetGenes( ); ++j ) 00261 setstrGenes.insert( Dat.GetGene( j ) ); } 00262 m_vecstrGenes.resize( setstrGenes.size( ) ); 00263 copy( setstrGenes.begin( ), setstrGenes.end( ), m_vecstrGenes.begin( ) ); } 00264 else { 00265 m_vecstrGenes.resize( Answers.GetGenes( ) ); 00266 for( i = 0; i < m_vecstrGenes.size( ); ++i ) 00267 m_vecstrGenes[ i ] = Answers.GetGene( i ); } 00268 00269 if( !CDatasetCompactImpl::Open( Answers, 0 ) ) 00270 return false; 00271 for( i = 0; i < vecstrDataFiles.size( ); ++i ) { 00272 CDataPair Datum; 00273 00274 if( !( Datum.Open( vecstrDataFiles[ i ].c_str( ), false, fMemmap, iSkip, fZScore ) && 00275 CDatasetCompactImpl::Open( Datum, i + 1 ) ) ) 00276 return false; } 00277 00278 if( !fEverything && ( m_iData > 1 ) ) 00279 for( i = 0; i < m_vecstrGenes.size( ); ++i ) 00280 for( j = ( i + 1 ); j < m_vecstrGenes.size( ); ++j ) { 00281 for( k = 1; k < m_iData; ++k ) 00282 if( m_aData[ k ].Get( i, j ) ) 00283 break; 00284 if( k >= m_iData ) 00285 m_aData[ 0 ].Set( i, j, 0 ); } 00286 00287 return true; } 00288 00289 bool CDatasetCompactImpl::Open( const CDataPair& Datum, size_t iExp ) { 00290 vector<size_t> veciGenes; 00291 size_t i, j, iOne, iTwo; 00292 float d; 00293 CCompactMatrix& Target = m_aData[ iExp ]; 00294 00295 m_veccQuants[ iExp ] = Datum.IsContinuous( ) ? -1 : Datum.GetValues( ); 00296 Target.Initialize( m_vecstrGenes.size( ), (unsigned char)( Datum.GetValues( ) + 1 ), 00297 true ); 00298 veciGenes.resize( m_vecstrGenes.size( ) ); 00299 for( i = 0; i < veciGenes.size( ); ++i ) 00300 veciGenes[ i ] = Datum.GetGene( m_vecstrGenes[ i ] ); 00301 00302 for( i = 0; i < veciGenes.size( ); ++i ) 00303 if( ( iOne = veciGenes[ i ] ) != -1 ) 00304 for( j = ( i + 1 ); j < veciGenes.size( ); ++j ) 00305 if( ( ( iTwo = veciGenes[ j ] ) != -1 ) && 00306 !CMeta::IsNaN( d = Datum.Get( iOne, iTwo ) ) ) 00307 Target.Set( i, j, (unsigned char)( Datum.Quantize( d ) + 1 ) ); 00308 00309 return true; } 00310 00311 bool CDatasetCompactImpl::Open( const char* szDataDir, const IBayesNet* pBayesNet, 00312 const CGenes* pGenesIn, const CGenes* pGenesEx ) { 00313 size_t i; 00314 vector<string> vecstrData, vecstrNodes; 00315 set<string> setstrGenes; 00316 set<string>::const_iterator iterGenes; 00317 00318 if( pBayesNet->IsContinuous( ) ) 00319 return false; 00320 00321 pBayesNet->GetNodes( vecstrNodes ); 00322 m_iData = (uint32_t)OpenMax( szDataDir, vecstrNodes, false, vecstrData, &setstrGenes ); 00323 m_veccQuants.resize( m_iData ); 00324 if( pGenesIn ) 00325 for( i = 0; i < pGenesIn->GetGenes( ); ++i ) 00326 setstrGenes.insert( pGenesIn->GetGene( i ).GetName( ) ); 00327 if( pGenesEx ) 00328 for( i = 0; i < pGenesEx->GetGenes( ); ++i ) 00329 setstrGenes.erase( pGenesEx->GetGene( i ).GetName( ) ); 00330 m_vecstrGenes.resize( setstrGenes.size( ) ); 00331 for( i = 0,iterGenes = setstrGenes.begin( ); iterGenes != setstrGenes.end( ); 00332 ++iterGenes ) 00333 m_vecstrGenes[ i++ ] = *iterGenes; 00334 00335 if( m_aData ) 00336 delete[] m_aData; 00337 m_aData = new CCompactMatrix[ m_iData ]; 00338 00339 for( i = 0; i < vecstrData.size( ); ++i ) { 00340 CDataPair Datum; 00341 00342 if( !( Datum.Open( vecstrData[ i ].c_str( ), false ) && 00343 CDatasetCompactImpl::Open( Datum, i ) ) ) 00344 return false; } 00345 00346 return true; } 00347 00367 bool CDatasetCompact::FilterGenes( const char* szGenes, CDat::EFilter eFilter ) { 00368 ifstream ifsm; 00369 CGenome Genome; 00370 CGenes Genes( Genome ); 00371 00372 ifsm.open( szGenes ); 00373 if( !( ifsm.is_open( ) && Genes.Open( ifsm ) ) ) 00374 return false; 00375 FilterGenes( Genes, eFilter ); 00376 00377 return true; } 00378 00386 void CDatasetCompact::FilterAnswers( ) { 00387 size_t i, j; 00388 00389 for( i = 0; i < GetGenes( ); ++i ) 00390 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 00391 if( IsExample( i, j ) && ( GetDiscrete( i, j, 0 ) == -1 ) ) 00392 Remove( i, j ); } 00393 00394 size_t CDatasetCompactImpl::GetDiscrete( size_t iX, size_t iY, size_t iNode ) const { 00395 size_t iMap; 00396 00397 if( ( iMap = m_veciMapping[ iNode ] ) == -1 ) 00398 return -1; 00399 00400 return ( m_aData[ iMap ].Get( iX, iY ) - 1 ); } 00401 00402 bool CDatasetCompactImpl::IsExample( size_t iX, size_t iY ) const { 00403 size_t i; 00404 00405 for( i = 0; i < m_iData; ++i ) 00406 if( m_aData[ i ].Get( iX, iY ) ) 00407 return true; 00408 00409 return false; } 00410 00411 void CDatasetCompactImpl::Remove( size_t iX, size_t iY ) { 00412 size_t i; 00413 00414 for( i = 0; i < m_iData; ++i ) 00415 m_aData[ i ].Set( iX, iY, 0 ); } 00416 00417 bool CDatasetCompactImpl::Open( const unsigned char* pbData ) { 00418 size_t i; 00419 00420 if( m_aData ) 00421 delete[] m_aData; 00422 00423 if( !( pbData = CDataImpl::OpenBinary( pbData ) ) ) 00424 return false; 00425 m_iData = *(uint32_t*)pbData; 00426 pbData += sizeof(m_iData); 00427 m_aData = new CCompactMatrix[ m_iData ]; 00428 for( i = 0; i < m_iData; ++i ) 00429 if( !( pbData = m_aData[ i ].Open( pbData ) ) ) 00430 return false; 00431 00432 return true; } 00433 00447 bool CDatasetCompact::Open( std::istream& istm ) { 00448 size_t i; 00449 00450 if( m_aData ) 00451 delete[] m_aData; 00452 00453 if( !CDataImpl::OpenBinary( istm ) ) 00454 return false; 00455 istm.read( (char*)&m_iData, sizeof(m_iData) ); 00456 m_aData = new CCompactMatrix[ m_iData ]; 00457 for( i = 0; i < m_iData; ++i ) 00458 if( !m_aData[ i ].Open( istm ) ) 00459 return false; 00460 00461 return true; } 00462 00463 void CDatasetCompactImpl::SaveBinary( std::ostream& ostm ) const { 00464 size_t i; 00465 00466 CDataImpl::SaveBinary( ostm ); 00467 ostm.write( (char*)&m_iData, sizeof(m_iData) ); 00468 for( i = 0; i < m_iData; ++i ) 00469 m_aData[ i ].Save( ostm ); } 00470 00471 void CDatasetCompactImpl::SaveText( std::ostream& ostm ) const { 00472 size_t i, j, k, iVal; 00473 00474 for( i = 0; i < GetGenes( ); ++i ) 00475 for( j = ( i + 1 ); j < GetGenes( ); ++j ) 00476 if( IsExample( i, j ) ) { 00477 ostm << GetGene( i ) << '\t' << GetGene( j ); 00478 for( k = 0; k < GetExperiments( ); ++k ) { 00479 ostm << '\t'; 00480 if( ( iVal = GetDiscrete( i, j, k ) ) == -1 ) 00481 ostm << "-1"; 00482 else 00483 ostm << iVal; } 00484 ostm << endl; } } 00485 00525 bool CDatasetCompact::Open( const CGenes& GenesInclude, const CGenes& GenesExclude, const CDataPair& Answers, 00526 const std::vector<std::string>& vecstrPCLs, size_t iSkip, const IMeasure* pMeasure, 00527 const std::vector<float>& vecdBinEdges ) { 00528 size_t i, j, iPCL; 00529 set<string> setstrGenes; 00530 set<string>::iterator iterGene; 00531 00532 g_CatSleipnir( ).notice( "CDatasetCompact::Open( %d ) opening PCL files", 00533 iSkip ); 00534 00535 m_veciMapping.resize( m_iData = 1 + (uint32_t)vecstrPCLs.size( ) ); 00536 for( i = 0; i < m_veciMapping.size( ); ++i ) 00537 m_veciMapping[ i ] = i; 00538 m_veccQuants.resize( m_iData ); 00539 m_veccQuants[ 0 ] = Answers.GetValues( ); 00540 for( i = 1; i < m_veccQuants.size( ); ++i ) 00541 m_veccQuants[ i ] = (unsigned char)vecdBinEdges.size( ); 00542 00543 for( i = 0; i < Answers.GetGenes( ); ++i ) 00544 setstrGenes.insert( Answers.GetGene( i ) ); 00545 for( iPCL = 0; iPCL < vecstrPCLs.size( ); ++iPCL ) { 00546 ifstream ifsm; 00547 00548 ifsm.open( vecstrPCLs[ iPCL ].c_str( ) ); 00549 if( !CDataImpl::OpenGenes( ifsm, false, true, setstrGenes ) ) { 00550 g_CatSleipnir( ).error( "CDatasetCompact::Open( %d ) could not open: %s", iSkip, 00551 vecstrPCLs[ iPCL ].c_str( ) ); 00552 return false; } } 00553 if( GenesInclude.GetGenes( ) ) { 00554 for( iterGene = setstrGenes.begin( ); iterGene != setstrGenes.end( ); ++iterGene ) 00555 if( !GenesInclude.IsGene( *iterGene ) ) 00556 setstrGenes.erase( iterGene ); 00557 for( i = 0; i < GenesInclude.GetGenes( ); ++i ) 00558 setstrGenes.insert( GenesInclude.GetGene( i ).GetName( ) ); } 00559 if( GenesExclude.GetGenes( ) ) 00560 for( i = 0; i < GenesExclude.GetGenes( ); ++i ) 00561 setstrGenes.erase( GenesExclude.GetGene( i ).GetName( ) ); 00562 m_vecstrGenes.resize( setstrGenes.size( ) ); 00563 copy( setstrGenes.begin( ), setstrGenes.end( ), m_vecstrGenes.begin( ) ); 00564 00565 if( m_aData ) 00566 delete[] m_aData; 00567 m_aData = new CCompactMatrix[ m_iData ]; 00568 if( !CDatasetCompactImpl::Open( Answers, 0 ) ) 00569 return false; 00570 00571 for( iPCL = 0; iPCL < vecstrPCLs.size( ); ++iPCL ) { 00572 CPCL PCL; 00573 ifstream ifsm; 00574 CDistanceMatrix Dist; 00575 CDataPair Datum; 00576 vector<size_t> veciGenes; 00577 vector<string> vecstrGenes; 00578 size_t iGenes, iOne, iTwo; 00579 const float* adOne; 00580 00581 g_CatSleipnir( ).notice( "CDatasetCompact::Open( %d ) opening: %s", iSkip, vecstrPCLs[ iPCL ].c_str( ) ); 00582 ifsm.open( vecstrPCLs[ iPCL ].c_str( ) ); 00583 if( !PCL.Open( ifsm, iSkip ) ) { 00584 g_CatSleipnir( ).error( "CDatasetCompact::Open( %d ) could not open: %s", iSkip, vecstrPCLs[ iPCL ].c_str( ) ); 00585 return 1; } 00586 if( pMeasure->IsRank( ) ) 00587 PCL.RankTransform( ); 00588 00589 veciGenes.resize( PCL.GetGenes( ) ); 00590 if( GenesInclude.GetGenes( ) || GenesExclude.GetGenes( ) ) 00591 for( i = 0; i < PCL.GetGenes( ); ++i ) { 00592 const string& strGene = PCL.GetGene( i ); 00593 00594 if( GenesExclude.GetGenes( ) && GenesExclude.IsGene( strGene ) ) 00595 veciGenes[ i ] = -1; 00596 else if( GenesInclude.GetGenes( ) ) 00597 veciGenes[ i ] = (unsigned int)( GenesInclude.IsGene( strGene ) ? iGenes++ : -1 ); 00598 else 00599 veciGenes[ i ] = (unsigned int)iGenes++; 00600 if( veciGenes[ i ] != -1 ) 00601 vecstrGenes.push_back( strGene ); } 00602 else { 00603 vecstrGenes.resize( PCL.GetGenes( ) ); 00604 copy( PCL.GetGeneNames( ).begin( ), PCL.GetGeneNames( ).end( ), vecstrGenes.begin( ) ); 00605 for( i = 0; i < veciGenes.size( ); ++i ) 00606 veciGenes[ i ] = i; } 00607 Dist.Initialize( vecstrGenes.size( ) ); 00608 for( i = 0; i < Dist.GetSize( ); ++i ) 00609 for( j = ( i + 1 ); j < Dist.GetSize( ); ++j ) 00610 Dist.Set( i, j, CMeta::GetNaN( ) ); 00611 for( i = 0; i < PCL.GetGenes( ); ++i ) { 00612 if( ( iOne = veciGenes[ i ] ) == -1 ) 00613 continue; 00614 adOne = PCL.Get( i ); 00615 for( j = ( i + 1 ); j < PCL.GetGenes( ); ++j ) 00616 if( ( iTwo = veciGenes[ j ] ) != -1 ) 00617 Dist.Set( iOne, iTwo, (float)pMeasure->Measure( adOne, PCL.GetExperiments( ), PCL.Get( j ), 00618 PCL.GetExperiments( ) ) ); } 00619 00620 Datum.Open( vecstrGenes, Dist ); 00621 Datum.Normalize( CDat::ENormalizeZScore ); 00622 Datum.SetQuants( vecdBinEdges ); 00623 if( !CDatasetCompactImpl::Open( Datum, iPCL + 1 ) ) 00624 return false; } 00625 00626 return true; } 00627 00638 void CDatasetCompact::Randomize( ) { 00639 size_t i; 00640 00641 if( !m_aData ) 00642 return; 00643 00644 for( i = 1; i < m_iData; ++i ) 00645 m_aData[ i ].Randomize( ); } 00646 00647 CDatasetCompactMap::CDatasetCompactMap( ) : m_pbData(NULL), m_hndlMap(0) { } 00648 00649 CDatasetCompactMap::~CDatasetCompactMap( ) { 00650 00651 CMeta::Unmap( m_pbData, m_hndlMap, m_iData ); } 00652 00663 bool CDatasetCompactMap::Open( const char* szFile ) { 00664 size_t i, j; 00665 00666 CMeta::MapRead( m_pbData, m_hndlMap, m_iData, szFile ); 00667 if( !CDatasetCompactImpl::Open( m_pbData ) ) { 00668 CMeta::Unmap( m_pbData, m_hndlMap, m_iData ); 00669 return false; } 00670 00671 m_Mask.Initialize( GetGenes( ) ); 00672 for( i = 0; i < m_Mask.GetSize( ); ++i ) 00673 for( j = ( i + 1 ); j < m_Mask.GetSize( ); ++j ) 00674 m_Mask.Set( i, j, CDatasetCompact::IsExample( i, j ) ); 00675 return true; } 00676 00677 }