Sleipnir
src/dataset.h
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #ifndef DATASET_H
00023 #define DATASET_H
00024 
00025 #include <assert.h>
00026 
00027 #include "dataseti.h"
00028 
00029 namespace Sleipnir {
00030 
00031 class IBayesNet;
00032 
/**
 * \brief
 * Abstract interface implemented by the dataset classes declared in this
 * header (CDataset, CDatasetCompact, CDataMask, CDataFilter, CDataSubset).
 *
 * Values are addressed by an (iY, iX) index pair plus a node index iNode.
 * NOTE(review): iY/iX are presumably gene indices and iNode an experiment
 * (Bayes net node) index -- confirm against the implementation classes.
 */
00064 class IDataset {
00065 public:
          /**
           * \brief
           * Virtual destructor so that implementations can be destroyed safely
           * through an IDataset pointer or reference.
           *
           * Without it, deleting a derived object via IDataset* is undefined
           * behavior, and derived destructors (e.g. ~CDatasetCompactMap, which
           * is declared in this header) would not be guaranteed to run.
           */
          virtual ~IDataset( ) { }

          /**
           * \brief Reports whether node iNode is hidden.
           * \param iNode Index of the node to query.
           * \returns true if the node is hidden.
           * NOTE(review): the precise meaning of "hidden" is defined by the
           * implementations and is not visible in this header.
           */
00084     virtual bool IsHidden( size_t iNode ) const = 0;
          /**
           * \brief Returns the discrete value stored for pair (iY, iX) at node iNode.
           * \returns The discrete value.  Implementations in this header
           * (CDataFilter, CDataSubset) return -1 converted to size_t when no
           * value is available.
           */
00107     virtual size_t GetDiscrete( size_t iY, size_t iX, size_t iNode ) const = 0;
          /**
           * \brief Returns the continuous value stored for pair (iY, iX) at node iNode.
           * \returns The continuous value.  Implementations in this header
           * return CMeta::GetNaN( ) when no value is available.
           */
00131     virtual float GetContinuous( size_t iY, size_t iX, size_t iNode ) const = 0;
          /**
           * \brief Returns the name of the gene at index iGene.
           * \param iGene Index of the gene.
           */
00148     virtual const std::string& GetGene( size_t iGene ) const = 0;
          /** \brief Returns the number of genes in the dataset. */
00162     virtual size_t GetGenes( ) const = 0;
          /**
           * \brief Reports whether pair (iY, iX) is currently an example.
           * \returns true if the pair is an example; pairs passed to Remove
           * are reported as false by the mask-based implementations in this
           * header.
           */
00180     virtual bool IsExample( size_t iY, size_t iX ) const = 0;
          /** \brief Returns the vector of all gene names. */
00191     virtual const std::vector<std::string>& GetGeneNames( ) const = 0;
          /**
           * \brief Returns the number of experiments.
           * NOTE(review): presumably the valid range of iNode -- confirm.
           */
00203     virtual size_t GetExperiments( ) const = 0;
          /**
           * \brief Returns the index of the gene named strGene.
           * \param strGene Gene name to look up.
           * NOTE(review): the value returned for an unknown name is defined
           * by the implementations and is not visible in this header.
           */
00217     virtual size_t GetGene( const std::string& strGene ) const = 0;
          /**
           * \brief Returns the number of bins for node iNode.
           * \param iNode Index of the node to query.
           */
00232     virtual size_t GetBins( size_t iNode ) const = 0;
          /**
           * \brief Removes pair (iY, iX) from the dataset.
           * \param iY First index of the pair.
           * \param iX Second index of the pair.
           */
00249     virtual void Remove( size_t iY, size_t iX ) = 0;
          /**
           * \brief Filters the dataset using the given gene set.
           * \param Genes Gene set to filter with.
           * \param eFilter Filter mode (e.g. CDat::EFilterInclude or
           * CDat::EFilterExclude, both of which are used in this header).
           */
00270     virtual void FilterGenes( const CGenes& Genes, CDat::EFilter eFilter ) = 0;
00271 
          /**
           * \brief Writes the dataset to a stream.
           * \param ostm Stream to write to.
           * \param fBinary If true, write the binary form; otherwise write text.
           */
00285     virtual void Save( std::ostream& ostm, bool fBinary ) const = 0;
00286 };
00287 
/**
 * \brief
 * IDataset implementation whose behavior lives in CDatasetImpl / CDataImpl.
 *
 * CDatasetImpl is inherited privately (no access specifier on the base), so
 * only the members re-exposed below are visible to users.  Every inline
 * method here is a one-statement forward to the implementation class.
 */
00296 class CDataset : CDatasetImpl, public IDataset {
00297 public:
          // The three Open overloads below are defined out of line.
          // NOTE(review): presumably szAnswerFile names the answer (gold
          // standard) file and vecstrDataFiles / szDataDirectory locate the
          // data files -- confirm against the implementation file.
00298     bool Open( const char* szAnswerFile, const std::vector<std::string>& vecstrDataFiles );
00299     bool Open( const std::vector<std::string>& vecstrDataFiles );
00300     bool Open( const char* szAnswerFile, const char* szDataDirectory, const IBayesNet* pBayesNet );
00301 
          /**
           * \brief Opens the dataset with already-loaded answers.
           * \param Answers Answer data; its address is passed to CDatasetImpl::Open.
           * \param szDataDirectory Data directory forwarded to CDatasetImpl::Open.
           * \param pBayesNet Bayes net forwarded to CDatasetImpl::Open.
           * \returns The result of CDatasetImpl::Open.
           */
00329     bool Open( const CDataPair& Answers, const char* szDataDirectory, const IBayesNet* pBayesNet ) {
00330 
00331         return CDatasetImpl::Open( &Answers, szDataDirectory, pBayesNet ); }
00332 
          /**
           * \brief Opens the dataset without answers.
           *
           * Forwards to CDatasetImpl::Open with a NULL answer pointer.
           * \returns The result of CDatasetImpl::Open.
           */
00355     bool Open( const char* szDataDirectory, const IBayesNet* pBayesNet ) {
00356 
00357         return CDatasetImpl::Open( NULL, szDataDirectory, pBayesNet ); }
00358 
          /**
           * \brief Forwards to CDataImpl::OpenGenes with the given data files.
           * \returns The result of CDataImpl::OpenGenes.
           */
00379     bool OpenGenes( const std::vector<std::string>& vecstrDataFiles ) {
00380 
00381         return CDataImpl::OpenGenes( vecstrDataFiles ); }
00382 
          // IDataset overrides defined out of line.
00383     size_t GetDiscrete( size_t iY, size_t iX, size_t iNode ) const;
00384     bool IsExample( size_t iY, size_t iX ) const;
00385     void Remove( size_t iY, size_t iX );
00386 
          /** \brief Forwards to CDatasetImpl::GetContinuous. */
00387     float GetContinuous( size_t iY, size_t iX, size_t iNode ) const {
00388 
00389         return CDatasetImpl::GetContinuous( iY, iX, iNode ); }
00390 
          /** \brief Forwards to CDataImpl::FilterGenes, passing this dataset as the target. */
00391     void FilterGenes( const CGenes& Genes, CDat::EFilter eFilter ) {
00392 
00393         CDataImpl::FilterGenes( this, Genes, eFilter ); }
00394 
          /** \brief Forwards to CDataImpl::GetGeneNames. */
00395     const std::vector<std::string>& GetGeneNames( ) const {
00396 
00397         return CDataImpl::GetGeneNames( ); }
00398 
          /** \brief Forwards to CDataImpl::IsHidden. */
00399     bool IsHidden( size_t iNode ) const {
00400 
00401         return CDataImpl::IsHidden( iNode ); }
00402 
          /** \brief Index-to-name lookup; forwards to CDataImpl::GetGene. */
00403     const std::string& GetGene( size_t iGene ) const {
00404 
00405         return CDataImpl::GetGene( iGene ); }
00406 
          /** \brief Forwards to CDataImpl::GetGenes. */
00407     size_t GetGenes( ) const {
00408 
00409         return CDataImpl::GetGenes( ); }
00410 
          /** \brief Forwards to CDataImpl::GetExperiments. */
00411     size_t GetExperiments( ) const {
00412 
00413         return CDataImpl::GetExperiments( ); }
00414 
          /** \brief Name-to-index lookup; forwards to CDataImpl::GetGene. */
00415     size_t GetGene( const std::string& strGene ) const {
00416 
00417         return CDataImpl::GetGene( strGene ); }
00418 
          /** \brief Forwards to CDataImpl::GetBins. */
00419     size_t GetBins( size_t iNode ) const {
00420 
00421         return CDataImpl::GetBins( iNode ); }
00422 
          /**
           * \brief Writes the dataset to ostm.
           *
           * Calls SaveBinary when fBinary is true and SaveText otherwise.
           */
00423     void Save( std::ostream& ostm, bool fBinary ) const {
00424 
00425         fBinary ? SaveBinary( ostm ) : SaveText( ostm ); }
00426     
00427 };
00428 
/**
 * \brief
 * IDataset implementation backed by CDatasetCompactImpl.
 *
 * This class exposes discrete values only: GetContinuous always returns
 * CMeta::GetNaN( ).  The implementation base is inherited as protected, so
 * derived classes (e.g. CDatasetCompactMap) can reach its members.
 */
00454 class CDatasetCompact : protected CDatasetCompactImpl, public IDataset {
00455 public:
          // The Open overloads, FilterGenes( const char*, ... ), FilterAnswers
          // and Randomize below are defined out of line; their behavior is
          // not visible in this header.
00456     bool Open( const CDataPair& Answers, const char* szDataDirectory, const IBayesNet* pBayesNet,
00457         bool fEverything = false );
00458     bool Open( const CDataPair& Answers, const char* szDataDirectory, const IBayesNet* pBayesNet,
00459         const CGenes& GenesInclude, const CGenes& GenesExclude, bool fEverything = false );
00460     bool Open( const std::vector<std::string>& vecstrDataFiles, bool fMemmap = false );
00461     bool Open( std::istream& istm );
00462     bool Open( const CGenes& GenesInclude, const CGenes& GenesExclude, const CDataPair& Answers,
00463         const std::vector<std::string>& vecstrPCLs, size_t iSkip, const IMeasure* pMeasure,
00464         const std::vector<float>& vecdBinEdges );
00465     bool Open( const CDataPair& Answers, const std::vector<std::string>& vecstrDataFiles,
00466         bool fEverything = false, bool fMemmap = false, size_t iSkip = 2, bool fZScore = false );
00467     bool FilterGenes( const char* szGenes, CDat::EFilter eFilter );
00468     void FilterAnswers( );
00469     void Randomize( );
00470 
          /**
           * \brief Forwards to CDatasetCompactImpl::Open( szDataDirectory, pBayesNet ).
           * \returns The result of CDatasetCompactImpl::Open.
           */
00491     bool Open( const char* szDataDirectory, const IBayesNet* pBayesNet ) {
00492 
00493         return CDatasetCompactImpl::Open( szDataDirectory, pBayesNet ); }
00494 
          /**
           * \brief Opens the dataset and then applies include/exclude gene filters.
           *
           * The gene sets are first handed to CDatasetCompactImpl::Open; if
           * that fails, false is returned and no filtering is attempted.
           * Otherwise the include filter is applied before the exclude filter.
           * \returns false if CDatasetCompactImpl::Open fails, true otherwise.
           */
00521     bool Open( const char* szDataDirectory, const IBayesNet* pBayesNet, const CGenes& GenesInclude,
00522         const CGenes& GenesExclude ) {
00523 
00524         if( !CDatasetCompactImpl::Open( szDataDirectory, pBayesNet, &GenesInclude, &GenesExclude ) )
00525             return false;
00526         CDataImpl::FilterGenes( this, GenesInclude, CDat::EFilterInclude );
00527         CDataImpl::FilterGenes( this, GenesExclude, CDat::EFilterExclude );
00528 
00529         return true; }
00530 
          /**
           * \brief Forwards to CDataImpl::OpenGenes with the given data files.
           * \returns The result of CDataImpl::OpenGenes.
           */
00551     bool OpenGenes( const std::vector<std::string>& vecstrDataFiles ) {
00552 
00553         return CDataImpl::OpenGenes( vecstrDataFiles ); }
00554 
          /**
           * \brief Writes the dataset to ostm.
           *
           * Calls SaveBinary when fBinary is true and SaveText otherwise.
           */
00555     void Save( std::ostream& ostm, bool fBinary ) const {
00556 
00557         fBinary ? SaveBinary( ostm ) : SaveText( ostm ); }
00558 
          /**
           * \brief Continuous values are not provided by this class.
           * \returns Always CMeta::GetNaN( ); all three arguments are ignored.
           */
00559     float GetContinuous( size_t iY, size_t iX, size_t iNode ) const {
00560         UNUSED_PARAMETER(iY);
00561         UNUSED_PARAMETER(iX);
00562         UNUSED_PARAMETER(iNode);
00563 
00564         return CMeta::GetNaN( ); }
00565 
          /** \brief Index-to-name lookup; forwards to CDataImpl::GetGene. */
00566     const std::string& GetGene( size_t iGene ) const {
00567 
00568         return CDataImpl::GetGene( iGene ); }
00569 
          /** \brief Forwards to CDataImpl::GetGenes. */
00570     size_t GetGenes( ) const {
00571 
00572         return CDataImpl::GetGenes( ); }
00573 
          /** \brief Forwards to CDatasetCompactImpl::IsExample. */
00574     bool IsExample( size_t iY, size_t iX ) const {
00575 
00576         return CDatasetCompactImpl::IsExample( iY, iX ); }
00577 
          /** \brief Forwards to CDataImpl::FilterGenes, passing this dataset as the target. */
00578     void FilterGenes( const CGenes& Genes, CDat::EFilter eFilter ) {
00579 
00580         CDataImpl::FilterGenes( this, Genes, eFilter ); }
00581 
          /** \brief Forwards to CDataImpl::IsHidden. */
00582     bool IsHidden( size_t iNode ) const {
00583 
00584         return CDataImpl::IsHidden( iNode ); }
00585 
          /** \brief Forwards to CDatasetCompactImpl::GetDiscrete. */
00586     size_t GetDiscrete( size_t iY, size_t iX, size_t iNode ) const {
00587 
00588         return CDatasetCompactImpl::GetDiscrete( iY, iX, iNode ); }
00589 
          /** \brief Forwards to CDataImpl::GetGeneNames. */
00590     const std::vector<std::string>& GetGeneNames( ) const {
00591 
00592         return CDataImpl::GetGeneNames( ); }
00593 
          /** \brief Forwards to CDataImpl::GetExperiments. */
00594     size_t GetExperiments( ) const {
00595 
00596         return CDataImpl::GetExperiments( ); }
00597 
          /** \brief Name-to-index lookup; forwards to CDataImpl::GetGene. */
00598     size_t GetGene( const std::string& strGene ) const {
00599 
00600         return CDataImpl::GetGene( strGene ); }
00601 
          /** \brief Forwards to CDataImpl::GetBins. */
00602     size_t GetBins( size_t iNode ) const {
00603 
00604         return CDataImpl::GetBins( iNode ); }
00605 
          /** \brief Forwards to CDatasetCompactImpl::Remove. */
00606     void Remove( size_t iY, size_t iX ) {
00607 
00608         CDatasetCompactImpl::Remove( iY, iX ); }
00609 };
00610 
/**
 * \brief
 * CDatasetCompact variant that tracks removed pairs in a separate binary
 * mask (m_Mask) instead of delegating Remove/IsExample to the base class.
 *
 * NOTE(review): the member names (m_pbData, m_iData, m_hndlMap) and the
 * Open( szFile ) overload suggest the data are backed by a memory-mapped
 * file, which would explain keeping removals in a side mask -- confirm
 * against the implementation file.
 * NOTE(review): a destructor is declared but no copy constructor or
 * assignment operator is visible here; if the destructor releases
 * m_pbData / m_hndlMap, copying an instance would release them twice.
 */
00624 class CDatasetCompactMap : public CDatasetCompact {
00625 public:
          // Constructor and destructor are defined out of line.
00626     CDatasetCompactMap( );
00627     ~CDatasetCompactMap( );
00628 
          // Defined out of line; hides the CDatasetCompact::Open overloads
          // for calls made through this class.
00629     bool Open( const char* szFile );
00630 
          /** \brief Marks pair (iY, iX) as removed by clearing its bit in m_Mask. */
00631     void Remove( size_t iY, size_t iX ) {
00632 
00633         m_Mask.Set( iY, iX, false ); }
00634 
          /** \brief Returns the m_Mask bit for pair (iY, iX); false once Remove was called on it. */
00635     bool IsExample( size_t iY, size_t iX ) const {
00636 
00637         return m_Mask.Get( iY, iX ); }
00638 
00639 private:
          // NOTE(review): presumably the base address of the mapped data.
00640     unsigned char*  m_pbData;
          // Per-pair flags read by IsExample and cleared by Remove.
00641     CBinaryMatrix   m_Mask;
          // NOTE(review): presumably the size in bytes of m_pbData.
00642     size_t          m_iData;
          // NOTE(review): presumably the OS handle for the mapping.
00643     HANDLE          m_hndlMap;
00644 };
00645 
/**
 * \brief
 * IDataset wrapper that applies a per-pair binary mask (m_Mask).
 *
 * IsExample and Remove operate on the mask only; every other query is
 * forwarded to CDataOverlayImpl.  CDataMaskImpl is inherited privately.
 * NOTE(review): CDataOverlayImpl presumably delegates to the dataset given
 * to Attach -- confirm against the implementation class.
 */
00658 class CDataMask : CDataMaskImpl, public IDataset {
00659 public:
          // Defined out of line.  NOTE(review): presumably Attach wraps
          // pDataset, AttachRandom additionally keeps a random dFraction of
          // its examples, and AttachComplement keeps the examples masked out
          // of DataMask -- confirm against the implementation file.
00660     void Attach( const IDataset* pDataset );
00661     void AttachRandom( const IDataset* pDataset, float dFraction );
00662     void AttachComplement( const CDataMask& DataMask );
00663 
          /** \brief Returns the m_Mask bit for pair (iY, iX). */
00664     bool IsExample( size_t iY, size_t iX ) const {
00665 
00666         return m_Mask.Get( iY, iX ); }
00667 
          /** \brief Clears the m_Mask bit for pair (iY, iX), so IsExample returns false for it. */
00668     void Remove( size_t iY, size_t iX ) {
00669 
00670         m_Mask.Set( iY, iX, false ); }
00671 
          /** \brief Forwards to CDataOverlayImpl::GetGeneNames. */
00672     const std::vector<std::string>& GetGeneNames( ) const {
00673 
00674         return CDataOverlayImpl::GetGeneNames( ); }
00675 
          /** \brief Forwards to CDataOverlayImpl::GetExperiments. */
00676     size_t GetExperiments( ) const {
00677 
00678         return CDataOverlayImpl::GetExperiments( ); }
00679 
          /** \brief Name-to-index lookup; forwards to CDataOverlayImpl::GetGene. */
00680     size_t GetGene( const std::string& strGene ) const {
00681 
00682         return CDataOverlayImpl::GetGene( strGene ); }
00683 
          /** \brief Forwards to CDataOverlayImpl::GetBins. */
00684     size_t GetBins( size_t iNode ) const {
00685 
00686         return CDataOverlayImpl::GetBins( iNode ); }
00687 
          /** \brief Forwards to CDataOverlayImpl::GetGenes. */
00688     size_t GetGenes( ) const {
00689 
00690         return CDataOverlayImpl::GetGenes( ); }
00691 
          /** \brief Forwards to CDataOverlayImpl::IsHidden. */
00692     bool IsHidden( size_t iNode ) const {
00693 
00694         return CDataOverlayImpl::IsHidden( iNode ); }
00695 
          /**
           * \brief Forwards to CDataOverlayImpl::GetDiscrete.
           *
           * The mask is not consulted here; callers that need masked
           * behavior must check IsExample themselves.
           */
00696     size_t GetDiscrete( size_t iY, size_t iX, size_t iNode ) const {
00697 
00698         return CDataOverlayImpl::GetDiscrete( iY, iX, iNode ); }
00699 
          /**
           * \brief Forwards to CDataOverlayImpl::GetContinuous.
           *
           * The mask is not consulted here; callers that need masked
           * behavior must check IsExample themselves.
           */
00700     float GetContinuous( size_t iY, size_t iX, size_t iNode ) const {
00701 
00702         return CDataOverlayImpl::GetContinuous( iY, iX, iNode ); }
00703 
          /** \brief Index-to-name lookup; forwards to CDataOverlayImpl::GetGene. */
00704     const std::string& GetGene( size_t iGene ) const {
00705 
00706         return CDataOverlayImpl::GetGene( iGene ); }
00707 
          /** \brief Forwards to CDataImpl::FilterGenes, passing this dataset as the target. */
00708     void FilterGenes( const CGenes& Genes, CDat::EFilter eFilter ) {
00709 
00710         CDataImpl::FilterGenes( this, Genes, eFilter ); }
00711 
          /** \brief Forwards to CDataOverlayImpl::Save. */
00712     void Save( std::ostream& ostm, bool fBinary ) const {
00713 
00714         CDataOverlayImpl::Save( ostm, fBinary ); }
00715 };
00716 
/**
 * \brief
 * IDataset wrapper that hides values for pairs rejected by a gene filter.
 *
 * GetDiscrete and GetContinuous consult IsExample first and report a
 * missing value (-1 or NaN) for pairs that are not examples; every other
 * query is forwarded to CDataOverlayImpl.  CDataFilterImpl is inherited
 * privately.
 * NOTE(review): CDataOverlayImpl presumably delegates to the dataset given
 * to Attach -- confirm against the implementation class.
 */
00734 class CDataFilter : CDataFilterImpl, public IDataset {
00735 public:
          // Defined out of line.  NOTE(review): presumably wraps pDataset and
          // records the gene set, filter mode and optional answers that
          // IsExample later uses -- confirm against the implementation file.
00736     void Attach( const IDataset* pDataset, const CGenes& Genes, CDat::EFilter eFilter,
00737         const CDat* pAnswers = NULL );
00738 
          // Defined out of line; decides which pairs pass the filter.
00739     bool IsExample( size_t iY, size_t iX ) const;
00740 
          /**
           * \brief Not implemented for filtered datasets.
           *
           * Asserts in debug builds.  When NDEBUG is defined the assert
           * expands to nothing, so the call is a silent no-op and neither
           * parameter is read; they are marked unused to match the
           * convention used by CDatasetCompact::GetContinuous in this header.
           */
00741     void Remove( size_t iY, size_t iX ) {
              UNUSED_PARAMETER(iY);
              UNUSED_PARAMETER(iX);
00742 
00743         assert( !"Unimplemented" ); }
00744 
          /** \brief Forwards to CDataOverlayImpl::GetGeneNames. */
00745     const std::vector<std::string>& GetGeneNames( ) const {
00746 
00747         return CDataOverlayImpl::GetGeneNames( ); }
00748 
          /** \brief Forwards to CDataOverlayImpl::GetExperiments. */
00749     size_t GetExperiments( ) const {
00750 
00751         return CDataOverlayImpl::GetExperiments( ); }
00752 
          /** \brief Name-to-index lookup; forwards to CDataOverlayImpl::GetGene. */
00753     size_t GetGene( const std::string& strGene ) const {
00754 
00755         return CDataOverlayImpl::GetGene( strGene ); }
00756 
          /** \brief Forwards to CDataOverlayImpl::GetBins. */
00757     size_t GetBins( size_t iNode ) const {
00758 
00759         return CDataOverlayImpl::GetBins( iNode ); }
00760 
          /** \brief Forwards to CDataOverlayImpl::GetGenes. */
00761     size_t GetGenes( ) const {
00762 
00763         return CDataOverlayImpl::GetGenes( ); }
00764 
          /** \brief Forwards to CDataOverlayImpl::IsHidden. */
00765     bool IsHidden( size_t iNode ) const {
00766 
00767         return CDataOverlayImpl::IsHidden( iNode ); }
00768 
          /**
           * \brief Returns the discrete value for pair (iY, iX) at node iNode.
           * \returns CDataOverlayImpl::GetDiscrete when IsExample( iY, iX )
           * is true; otherwise -1 converted to size_t.
           */
00769     size_t GetDiscrete( size_t iY, size_t iX, size_t iNode ) const {
00770 
00771         return ( IsExample( iY, iX ) ? CDataOverlayImpl::GetDiscrete( iY, iX, iNode ) : -1 ); }
00772 
          /**
           * \brief Returns the continuous value for pair (iY, iX) at node iNode.
           * \returns CDataOverlayImpl::GetContinuous when IsExample( iY, iX )
           * is true; otherwise CMeta::GetNaN( ).
           */
00773     float GetContinuous( size_t iY, size_t iX, size_t iNode ) const {
00774 
00775         return ( IsExample( iY, iX ) ? CDataOverlayImpl::GetContinuous( iY, iX, iNode ) :
00776             CMeta::GetNaN( ) ); }
00777 
          /** \brief Index-to-name lookup; forwards to CDataOverlayImpl::GetGene. */
00778     const std::string& GetGene( size_t iGene ) const {
00779 
00780         return CDataOverlayImpl::GetGene( iGene ); }
00781 
          /** \brief Forwards to CDataImpl::FilterGenes, passing this dataset as the target. */
00782     void FilterGenes( const CGenes& Genes, CDat::EFilter eFilter ) {
00783 
00784         CDataImpl::FilterGenes( this, Genes, eFilter ); }
00785 
          /** \brief Forwards to CDataOverlayImpl::Save. */
00786     void Save( std::ostream& ostm, bool fBinary ) const {
00787 
00788         CDataOverlayImpl::Save( ostm, fBinary ); }
00789 };
00790 
/**
 * \brief
 * IDataset implementation that holds a window of rows at a time.
 *
 * Example rows are stored in m_Examples and addressed as iY - m_iOffset;
 * IsExample treats rows outside [m_iOffset, m_iOffset + m_iSize) as absent.
 * CDataSubsetImpl is inherited privately.
 *
 * NOTE(review): this class does not define Save, which IDataset declares
 * pure virtual; it is presumably provided elsewhere or the class is not
 * instantiable as written -- confirm.
 * NOTE(review): GetDiscrete, GetContinuous and Remove compute
 * iY - m_iOffset without the range check IsExample performs, so callers
 * presumably must check IsExample first -- confirm against the callers.
 */
00811 class CDataSubset : CDataSubsetImpl, public IDataset {
00812 public:
          // Defined out of line.  NOTE(review): presumably Initialize sets up
          // the data sources and window size (iGeneSize) and Open loads the
          // window starting at iGeneOffset -- confirm against the
          // implementation file.
00813     bool Initialize( const char* szDataDirectory, const IBayesNet* pBayesNet, size_t iGeneSize );
00814     bool Initialize( const std::vector<std::string>& vecstrDataFiles, size_t iGeneSize );
00815     bool Open( size_t iGeneOffset );
00816 
          /** \brief Forwards to CDataImpl::IsHidden. */
00817     bool IsHidden( size_t iNode ) const {
00818 
00819         return CDataImpl::IsHidden( iNode ); }
00820 
          /**
           * \brief Returns the discrete value for pair (iY, iX) at node iNode.
           *
           * iNode is translated through m_veciMapping; a mapping of -1
           * (compared after conversion to size_t) means no value is stored
           * for that node.
           * \returns -1 converted to size_t when the node is unmapped;
           * otherwise the stored example's GetDiscrete for the mapped index.
           */
00821     size_t GetDiscrete( size_t iY, size_t iX, size_t iNode ) const {
00822         size_t  iMap;
00823 
00824         return ( ( ( iMap = m_veciMapping[ iNode ] ) == -1 ) ? -1 :
00825             m_Examples.Get( iY - m_iOffset, iX ).GetDiscrete( iMap ) ); }
00826 
          /**
           * \brief Returns the continuous value for pair (iY, iX) at node iNode.
           *
           * iNode is translated through m_veciMapping exactly as in GetDiscrete.
           * \returns CMeta::GetNaN( ) when the node is unmapped; otherwise the
           * stored example's GetContinuous for the mapped index.
           */
00827     float GetContinuous( size_t iY, size_t iX, size_t iNode ) const {
00828         size_t  iMap;
00829 
00830         return ( ( ( iMap = m_veciMapping[ iNode ] ) == -1 ) ? CMeta::GetNaN( ) :
00831             m_Examples.Get( iY - m_iOffset, iX ).GetContinuous( iMap ) ); }
00832 
          /** \brief Index-to-name lookup; forwards to CDataImpl::GetGene. */
00833     const std::string& GetGene( size_t iGene ) const {
00834 
00835         return CDataImpl::GetGene( iGene ); }
00836 
          /** \brief Forwards to CDataImpl::GetGenes. */
00837     size_t GetGenes( ) const {
00838 
00839         return CDataImpl::GetGenes( ); }
00840 
          /**
           * \brief Reports whether pair (iY, iX) is a stored example.
           * \returns false when iY lies outside the current window
           * [m_iOffset, m_iOffset + m_iSize); otherwise the stored example's
           * IsSet( ) result.
           */
00841     bool IsExample( size_t iY, size_t iX ) const {
00842 
              // The iY < m_iOffset test comes first so that the unsigned
              // subtraction below cannot wrap around.
00843         return ( ( ( iY < m_iOffset ) || ( ( iY - m_iOffset ) >= m_iSize ) ) ? false :
00844             m_Examples.Get( iY - m_iOffset, iX ).IsSet( ) ); }
00845 
          /** \brief Forwards to CDataImpl::GetGeneNames. */
00846     const std::vector<std::string>& GetGeneNames( ) const {
00847 
00848         return CDataImpl::GetGeneNames( ); }
00849 
          /** \brief Forwards to CDataImpl::GetExperiments. */
00850     size_t GetExperiments( ) const {
00851 
00852         return CDataImpl::GetExperiments( ); }
00853 
          /** \brief Name-to-index lookup; forwards to CDataImpl::GetGene. */
00854     size_t GetGene( const std::string& strGene ) const {
00855 
00856         return CDataImpl::GetGene( strGene ); }
00857 
          /** \brief Forwards to CDataImpl::GetBins. */
00858     size_t GetBins( size_t iNode ) const {
00859 
00860         return CDataImpl::GetBins( iNode ); }
00861 
          /**
           * \brief Resets the stored example for pair (iY, iX).
           *
           * No window check is made here, unlike IsExample.
           */
00862     void Remove( size_t iY, size_t iX ) {
00863 
00864         m_Examples.Get( iY - m_iOffset, iX ).Reset( ); }
00865 
          /** \brief Forwards to CDataImpl::FilterGenes, passing this dataset as the target. */
00866     void FilterGenes( const CGenes& Genes, CDat::EFilter eFilter ) {
00867 
00868         CDataImpl::FilterGenes( this, Genes, eFilter ); }
00869 };
00870 
00871 }
00872 
00873 #endif // DATASET_H