Sleipnir
src/dataseti.h
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #ifndef DATASETI_H
00023 #define DATASETI_H
00024 
00025 #include <set>
00026 
00027 #include "examplei.h"
00028 #include "fullmatrix.h"
00029 #include "halfmatrix.h"
00030 
00031 namespace Sleipnir {
00032 
00033 class CCompactMatrix;
00034 class CDataPair;
00035 class IBayesNet;
00036 class IDataset;
00037 
00038 class CDataImpl {
00039     friend class CDataFilter;
00040     friend class CDataMask;
00041 protected:
00042     static const char   c_cSeparator    = '/';
00043     static const char   c_szDat[];
00044     static const char   c_szDab[];
00045 
00046     static void FilterGenes( IDataset*, const CGenes&, CDat::EFilter );
00047 
00048     size_t OpenMax( const char*, const std::vector<std::string>&, bool,
00049         std::vector<std::string>&, std::set<std::string>* = NULL );
00050     bool OpenGenes( std::istream&, bool, bool, std::set<std::string>& ) const;
00051     bool OpenGenes( const std::vector<std::string>& );
00052 
00053     size_t GetGene( const std::string& ) const;
00054     bool OpenBinary( std::istream& );
00055     const unsigned char* OpenBinary( const unsigned char* );
00056     void SaveBinary( std::ostream& ) const;
00057 
00058     bool IsHidden( size_t iNode ) const {
00059 
00060         return ( m_veciMapping[ iNode ] == -1 ); }
00061 
00062     const std::string& GetGene( size_t iGene ) const {
00063 
00064         return m_vecstrGenes[ iGene ]; }
00065 
00066     size_t GetGenes( ) const {
00067 
00068         return m_vecstrGenes.size( ); }
00069 
00070     const std::vector<std::string>& GetGeneNames( ) const {
00071 
00072         return m_vecstrGenes; }
00073 
00074     size_t GetExperiments( ) const {
00075 
00076         return m_veciMapping.size( ); }
00077 
00078     unsigned char GetBins( size_t iExp ) const {
00079 
00080         return m_veccQuants[ iExp ]; }
00081 
00082     bool                        m_fContinuous;
00083     std::vector<size_t>         m_veciMapping;
00084     std::vector<std::string>    m_vecstrGenes;
00085     std::vector<unsigned char>  m_veccQuants;
00086 };
00087 
00088 class CDatasetImpl : protected CDataImpl {
00089 protected:
00090     CDatasetImpl( );
00091     ~CDatasetImpl( );
00092 
00093     void Reset( );
00094     bool Open( const CDataPair&, size_t );
00095     bool Open( const CDataPair*, const char*, const IBayesNet* );
00096     void SaveText( std::ostream& ) const;
00097     void SaveBinary( std::ostream& ) const;
00098     float GetContinuous( size_t, size_t, size_t ) const;
00099 
00100     void**  m_apData;
00101 };
00102 
00103 class CDataOverlayImpl {
00104 protected:
00105     CDataOverlayImpl( ) : m_pDataset(NULL) { }
00106 
00107     const std::vector<std::string>& GetGeneNames( ) const;
00108     size_t GetExperiments( ) const;
00109     size_t GetGene( const std::string& ) const;
00110     size_t GetBins( size_t ) const;
00111     size_t GetGenes( ) const;
00112     bool IsHidden( size_t ) const;
00113     size_t GetDiscrete( size_t, size_t, size_t ) const;
00114     float GetContinuous( size_t, size_t, size_t ) const;
00115     const std::string& GetGene( size_t ) const;
00116     void Save( std::ostream&, bool ) const;
00117 
00118     const IDataset* m_pDataset;
00119 };
00120 
00121 class CDataMaskImpl : protected CDataOverlayImpl {
00122 protected:
00123     CBinaryMatrix   m_Mask;
00124 };
00125 
00126 class CDataFilterImpl : protected CDataOverlayImpl {
00127 protected:
00128     CDataFilterImpl( ) : m_pGenes(NULL), m_pAnswers(NULL) { }
00129 
00130     const CGenes*       m_pGenes;
00131     CDat::EFilter       m_eFilter;
00132     std::vector<bool>   m_vecfGenes;
00133     const CDat*         m_pAnswers;
00134     std::vector<size_t> m_veciAnswers;
00135 };
00136 
00137 class CDataSubsetImpl : protected CDataImpl {
00138 protected:
00139     bool Open( const CDataPair&, size_t );
00140 
00141     size_t                      m_iSize;
00142     size_t                      m_iOffset;
00143     std::vector<std::string>    m_vecstrData;
00144     CFullMatrix<CExampleImpl>   m_Examples;
00145 };
00146 
00147 class CDatasetCompactImpl : protected CDataImpl {
00148 protected:
00149     CDatasetCompactImpl( );
00150     ~CDatasetCompactImpl( );
00151 
00152     bool Open( const CDataPair&, size_t );
00153     bool Open( const char*, const IBayesNet*, const CGenes* = NULL, const CGenes* = NULL );
00154     bool Open( const unsigned char* );
00155     virtual void Remove( size_t, size_t );
00156     size_t GetDiscrete( size_t, size_t, size_t ) const;
00157     void SaveText( std::ostream& ) const;
00158     void SaveBinary( std::ostream& ) const;
00159     virtual bool IsExample( size_t, size_t ) const;
00160 
00161     uint32_t        m_iData;
00162     CCompactMatrix* m_aData;
00163 };
00164 
00165 }
00166 
00167 #endif // DATASETI_H