Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #ifndef DATASET_H 00023 #define DATASET_H 00024 00025 #include <assert.h> 00026 00027 #include "dataseti.h" 00028 00029 namespace Sleipnir { 00030 00031 class IBayesNet; 00032 00064 class IDataset { 00065 public: 00084 virtual bool IsHidden( size_t iNode ) const = 0; 00107 virtual size_t GetDiscrete( size_t iY, size_t iX, size_t iNode ) const = 0; 00131 virtual float GetContinuous( size_t iY, size_t iX, size_t iNode ) const = 0; 00148 virtual const std::string& GetGene( size_t iGene ) const = 0; 00162 virtual size_t GetGenes( ) const = 0; 00180 virtual bool IsExample( size_t iY, size_t iX ) const = 0; 00191 virtual const std::vector<std::string>& GetGeneNames( ) const = 0; 00203 virtual size_t GetExperiments( ) const = 0; 00217 virtual size_t GetGene( const std::string& strGene ) const = 0; 00232 virtual size_t GetBins( size_t iNode ) const = 0; 00249 virtual void Remove( size_t iY, size_t iX ) = 0; 00270 virtual void FilterGenes( const CGenes& Genes, CDat::EFilter eFilter ) = 0; 00271 00285 virtual void Save( std::ostream& ostm, bool fBinary ) const = 0; 00286 }; 00287 00296 class CDataset : CDatasetImpl, public IDataset { 00297 public: 00298 bool Open( const char* szAnswerFile, const std::vector<std::string>& vecstrDataFiles ); 00299 bool Open( const std::vector<std::string>& vecstrDataFiles ); 00300 bool Open( const char* szAnswerFile, const char* szDataDirectory, const IBayesNet* pBayesNet ); 00301 00329 bool Open( const CDataPair& Answers, const char* szDataDirectory, const IBayesNet* pBayesNet ) { 00330 00331 return CDatasetImpl::Open( &Answers, szDataDirectory, pBayesNet ); } 00332 00355 bool Open( const char* szDataDirectory, const IBayesNet* pBayesNet ) { 00356 00357 return CDatasetImpl::Open( NULL, szDataDirectory, pBayesNet ); } 00358 00379 bool OpenGenes( const std::vector<std::string>& vecstrDataFiles ) { 00380 00381 return CDataImpl::OpenGenes( vecstrDataFiles ); } 00382 00383 size_t GetDiscrete( size_t iY, size_t iX, size_t iNode ) const; 00384 bool IsExample( size_t iY, size_t iX ) const; 00385 void Remove( size_t iY, size_t iX ); 00386 00387 float GetContinuous( size_t iY, size_t iX, size_t iNode ) const { 00388 00389 return CDatasetImpl::GetContinuous( iY, iX, iNode ); } 00390 00391 void FilterGenes( const CGenes& Genes, CDat::EFilter eFilter ) { 00392 00393 CDataImpl::FilterGenes( this, Genes, eFilter ); } 00394 00395 const std::vector<std::string>& GetGeneNames( ) const { 00396 00397 return CDataImpl::GetGeneNames( ); } 00398 00399 bool IsHidden( size_t iNode ) const { 00400 00401 return CDataImpl::IsHidden( iNode ); } 00402 00403 const std::string& GetGene( size_t iGene ) const { 00404 00405 return CDataImpl::GetGene( iGene ); } 00406 00407 size_t GetGenes( ) const { 00408 00409 return CDataImpl::GetGenes( ); } 00410 00411 size_t GetExperiments( ) const { 00412 00413 return CDataImpl::GetExperiments( ); } 00414 00415 size_t GetGene( const std::string& strGene ) const { 00416 00417 return CDataImpl::GetGene( strGene ); } 00418 00419 size_t GetBins( size_t iNode ) const { 00420 00421 return CDataImpl::GetBins( iNode ); } 00422 00423 void Save( std::ostream& ostm, bool fBinary ) const { 00424 00425 fBinary ? SaveBinary( ostm ) : SaveText( ostm ); } 00426 00427 }; 00428 00454 class CDatasetCompact : protected CDatasetCompactImpl, public IDataset { 00455 public: 00456 bool Open( const CDataPair& Answers, const char* szDataDirectory, const IBayesNet* pBayesNet, 00457 bool fEverything = false ); 00458 bool Open( const CDataPair& Answers, const char* szDataDirectory, const IBayesNet* pBayesNet, 00459 const CGenes& GenesInclude, const CGenes& GenesExclude, bool fEverything = false ); 00460 bool Open( const std::vector<std::string>& vecstrDataFiles, bool fMemmap = false ); 00461 bool Open( std::istream& istm ); 00462 bool Open( const CGenes& GenesInclude, const CGenes& GenesExclude, const CDataPair& Answers, 00463 const std::vector<std::string>& vecstrPCLs, size_t iSkip, const IMeasure* pMeasure, 00464 const std::vector<float>& vecdBinEdges ); 00465 bool Open( const CDataPair& Answers, const std::vector<std::string>& vecstrDataFiles, 00466 bool fEverything = false, bool fMemmap = false, size_t iSkip = 2, bool fZScore = false ); 00467 bool FilterGenes( const char* szGenes, CDat::EFilter eFilter ); 00468 void FilterAnswers( ); 00469 void Randomize( ); 00470 00491 bool Open( const char* szDataDirectory, const IBayesNet* pBayesNet ) { 00492 00493 return CDatasetCompactImpl::Open( szDataDirectory, pBayesNet ); } 00494 00521 bool Open( const char* szDataDirectory, const IBayesNet* pBayesNet, const CGenes& GenesInclude, 00522 const CGenes& GenesExclude ) { 00523 00524 if( !CDatasetCompactImpl::Open( szDataDirectory, pBayesNet, &GenesInclude, &GenesExclude ) ) 00525 return false; 00526 CDataImpl::FilterGenes( this, GenesInclude, CDat::EFilterInclude ); 00527 CDataImpl::FilterGenes( this, GenesExclude, CDat::EFilterExclude ); 00528 00529 return true; } 00530 00551 bool OpenGenes( const std::vector<std::string>& vecstrDataFiles ) { 00552 00553 return CDataImpl::OpenGenes( vecstrDataFiles ); } 00554 00555 void Save( std::ostream& ostm, bool fBinary ) const { 00556 00557 fBinary ? SaveBinary( ostm ) : SaveText( ostm ); } 00558 00559 float GetContinuous( size_t iY, size_t iX, size_t iNode ) const { 00560 UNUSED_PARAMETER(iY); 00561 UNUSED_PARAMETER(iX); 00562 UNUSED_PARAMETER(iNode); 00563 00564 return CMeta::GetNaN( ); } 00565 00566 const std::string& GetGene( size_t iGene ) const { 00567 00568 return CDataImpl::GetGene( iGene ); } 00569 00570 size_t GetGenes( ) const { 00571 00572 return CDataImpl::GetGenes( ); } 00573 00574 bool IsExample( size_t iY, size_t iX ) const { 00575 00576 return CDatasetCompactImpl::IsExample( iY, iX ); } 00577 00578 void FilterGenes( const CGenes& Genes, CDat::EFilter eFilter ) { 00579 00580 CDataImpl::FilterGenes( this, Genes, eFilter ); } 00581 00582 bool IsHidden( size_t iNode ) const { 00583 00584 return CDataImpl::IsHidden( iNode ); } 00585 00586 size_t GetDiscrete( size_t iY, size_t iX, size_t iNode ) const { 00587 00588 return CDatasetCompactImpl::GetDiscrete( iY, iX, iNode ); } 00589 00590 const std::vector<std::string>& GetGeneNames( ) const { 00591 00592 return CDataImpl::GetGeneNames( ); } 00593 00594 size_t GetExperiments( ) const { 00595 00596 return CDataImpl::GetExperiments( ); } 00597 00598 size_t GetGene( const std::string& strGene ) const { 00599 00600 return CDataImpl::GetGene( strGene ); } 00601 00602 size_t GetBins( size_t iNode ) const { 00603 00604 return CDataImpl::GetBins( iNode ); } 00605 00606 void Remove( size_t iY, size_t iX ) { 00607 00608 CDatasetCompactImpl::Remove( iY, iX ); } 00609 }; 00610 00624 class CDatasetCompactMap : public CDatasetCompact { 00625 public: 00626 CDatasetCompactMap( ); 00627 ~CDatasetCompactMap( ); 00628 00629 bool Open( const char* szFile ); 00630 00631 void Remove( size_t iY, size_t iX ) { 00632 00633 m_Mask.Set( iY, iX, false ); } 00634 00635 bool IsExample( size_t iY, size_t iX ) const { 00636 00637 return m_Mask.Get( iY, iX ); } 00638 00639 private: 00640 unsigned char* m_pbData; 00641 CBinaryMatrix m_Mask; 00642 size_t m_iData; 00643 HANDLE m_hndlMap; 00644 }; 00645 00658 class CDataMask : CDataMaskImpl, public IDataset { 00659 public: 00660 void Attach( const IDataset* pDataset ); 00661 void AttachRandom( const IDataset* pDataset, float dFraction ); 00662 void AttachComplement( const CDataMask& DataMask ); 00663 00664 bool IsExample( size_t iY, size_t iX ) const { 00665 00666 return m_Mask.Get( iY, iX ); } 00667 00668 void Remove( size_t iY, size_t iX ) { 00669 00670 m_Mask.Set( iY, iX, false ); } 00671 00672 const std::vector<std::string>& GetGeneNames( ) const { 00673 00674 return CDataOverlayImpl::GetGeneNames( ); } 00675 00676 size_t GetExperiments( ) const { 00677 00678 return CDataOverlayImpl::GetExperiments( ); } 00679 00680 size_t GetGene( const std::string& strGene ) const { 00681 00682 return CDataOverlayImpl::GetGene( strGene ); } 00683 00684 size_t GetBins( size_t iNode ) const { 00685 00686 return CDataOverlayImpl::GetBins( iNode ); } 00687 00688 size_t GetGenes( ) const { 00689 00690 return CDataOverlayImpl::GetGenes( ); } 00691 00692 bool IsHidden( size_t iNode ) const { 00693 00694 return CDataOverlayImpl::IsHidden( iNode ); } 00695 00696 size_t GetDiscrete( size_t iY, size_t iX, size_t iNode ) const { 00697 00698 return CDataOverlayImpl::GetDiscrete( iY, iX, iNode ); } 00699 00700 float GetContinuous( size_t iY, size_t iX, size_t iNode ) const { 00701 00702 return CDataOverlayImpl::GetContinuous( iY, iX, iNode ); } 00703 00704 const std::string& GetGene( size_t iGene ) const { 00705 00706 return CDataOverlayImpl::GetGene( iGene ); } 00707 00708 void FilterGenes( const CGenes& Genes, CDat::EFilter eFilter ) { 00709 00710 CDataImpl::FilterGenes( this, Genes, eFilter ); } 00711 00712 void Save( std::ostream& ostm, bool fBinary ) const { 00713 00714 CDataOverlayImpl::Save( ostm, fBinary ); } 00715 }; 00716 00734 class CDataFilter : CDataFilterImpl, public IDataset { 00735 public: 00736 void Attach( const IDataset* pDataset, const CGenes& Genes, CDat::EFilter eFilter, 00737 const CDat* pAnswers = NULL ); 00738 00739 bool IsExample( size_t iY, size_t iX ) const; 00740 00741 void Remove( size_t iY, size_t iX ) { 00742 00743 assert( !"Unimplemented" ); } 00744 00745 const std::vector<std::string>& GetGeneNames( ) const { 00746 00747 return CDataOverlayImpl::GetGeneNames( ); } 00748 00749 size_t GetExperiments( ) const { 00750 00751 return CDataOverlayImpl::GetExperiments( ); } 00752 00753 size_t GetGene( const std::string& strGene ) const { 00754 00755 return CDataOverlayImpl::GetGene( strGene ); } 00756 00757 size_t GetBins( size_t iNode ) const { 00758 00759 return CDataOverlayImpl::GetBins( iNode ); } 00760 00761 size_t GetGenes( ) const { 00762 00763 return CDataOverlayImpl::GetGenes( ); } 00764 00765 bool IsHidden( size_t iNode ) const { 00766 00767 return CDataOverlayImpl::IsHidden( iNode ); } 00768 00769 size_t GetDiscrete( size_t iY, size_t iX, size_t iNode ) const { 00770 00771 return ( IsExample( iY, iX ) ? CDataOverlayImpl::GetDiscrete( iY, iX, iNode ) : -1 ); } 00772 00773 float GetContinuous( size_t iY, size_t iX, size_t iNode ) const { 00774 00775 return ( IsExample( iY, iX ) ? CDataOverlayImpl::GetContinuous( iY, iX, iNode ) : 00776 CMeta::GetNaN( ) ); } 00777 00778 const std::string& GetGene( size_t iGene ) const { 00779 00780 return CDataOverlayImpl::GetGene( iGene ); } 00781 00782 void FilterGenes( const CGenes& Genes, CDat::EFilter eFilter ) { 00783 00784 CDataImpl::FilterGenes( this, Genes, eFilter ); } 00785 00786 void Save( std::ostream& ostm, bool fBinary ) const { 00787 00788 CDataOverlayImpl::Save( ostm, fBinary ); } 00789 }; 00790 00811 class CDataSubset : CDataSubsetImpl, public IDataset { 00812 public: 00813 bool Initialize( const char* szDataDirectory, const IBayesNet* pBayesNet, size_t iGeneSize ); 00814 bool Initialize( const std::vector<std::string>& vecstrDataFiles, size_t iGeneSize ); 00815 bool Open( size_t iGeneOffset ); 00816 00817 bool IsHidden( size_t iNode ) const { 00818 00819 return CDataImpl::IsHidden( iNode ); } 00820 00821 size_t GetDiscrete( size_t iY, size_t iX, size_t iNode ) const { 00822 size_t iMap; 00823 00824 return ( ( ( iMap = m_veciMapping[ iNode ] ) == -1 ) ? -1 : 00825 m_Examples.Get( iY - m_iOffset, iX ).GetDiscrete( iMap ) ); } 00826 00827 float GetContinuous( size_t iY, size_t iX, size_t iNode ) const { 00828 size_t iMap; 00829 00830 return ( ( ( iMap = m_veciMapping[ iNode ] ) == -1 ) ? CMeta::GetNaN( ) : 00831 m_Examples.Get( iY - m_iOffset, iX ).GetContinuous( iMap ) ); } 00832 00833 const std::string& GetGene( size_t iGene ) const { 00834 00835 return CDataImpl::GetGene( iGene ); } 00836 00837 size_t GetGenes( ) const { 00838 00839 return CDataImpl::GetGenes( ); } 00840 00841 bool IsExample( size_t iY, size_t iX ) const { 00842 00843 return ( ( ( iY < m_iOffset ) || ( ( iY - m_iOffset ) >= m_iSize ) ) ? false : 00844 m_Examples.Get( iY - m_iOffset, iX ).IsSet( ) ); } 00845 00846 const std::vector<std::string>& GetGeneNames( ) const { 00847 00848 return CDataImpl::GetGeneNames( ); } 00849 00850 size_t GetExperiments( ) const { 00851 00852 return CDataImpl::GetExperiments( ); } 00853 00854 size_t GetGene( const std::string& strGene ) const { 00855 00856 return CDataImpl::GetGene( strGene ); } 00857 00858 size_t GetBins( size_t iNode ) const { 00859 00860 return CDataImpl::GetBins( iNode ); } 00861 00862 void Remove( size_t iY, size_t iX ) { 00863 00864 m_Examples.Get( iY - m_iOffset, iX ).Reset( ); } 00865 00866 void FilterGenes( const CGenes& Genes, CDat::EFilter eFilter ) { 00867 00868 CDataImpl::FilterGenes( this, Genes, eFilter ); } 00869 }; 00870 00871 } 00872 00873 #endif // DATASET_H