Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #ifndef DATASETI_H 00023 #define DATASETI_H 00024 00025 #include <set> 00026 00027 #include "examplei.h" 00028 #include "fullmatrix.h" 00029 #include "halfmatrix.h" 00030 00031 namespace Sleipnir { 00032 00033 class CCompactMatrix; 00034 class CDataPair; 00035 class IBayesNet; 00036 class IDataset; 00037 00038 class CDataImpl { 00039 friend class CDataFilter; 00040 friend class CDataMask; 00041 protected: 00042 static const char c_cSeparator = '/'; 00043 static const char c_szDat[]; 00044 static const char c_szDab[]; 00045 00046 static void FilterGenes( IDataset*, const CGenes&, CDat::EFilter ); 00047 00048 size_t OpenMax( const char*, const std::vector<std::string>&, bool, 00049 std::vector<std::string>&, std::set<std::string>* = NULL ); 00050 bool OpenGenes( std::istream&, bool, bool, std::set<std::string>& ) const; 00051 bool OpenGenes( const std::vector<std::string>& ); 00052 00053 size_t GetGene( const std::string& ) const; 00054 bool OpenBinary( std::istream& ); 00055 const unsigned char* OpenBinary( const unsigned char* ); 00056 void SaveBinary( std::ostream& ) const; 00057 00058 bool IsHidden( size_t iNode ) const { 00059 00060 return ( m_veciMapping[ iNode ] == -1 ); } 00061 00062 const std::string& GetGene( size_t iGene ) const { 00063 00064 return m_vecstrGenes[ iGene ]; } 00065 00066 size_t GetGenes( ) const { 00067 00068 return m_vecstrGenes.size( ); } 00069 00070 const std::vector<std::string>& GetGeneNames( ) const { 00071 00072 return m_vecstrGenes; } 00073 00074 size_t GetExperiments( ) const { 00075 00076 return m_veciMapping.size( ); } 00077 00078 unsigned char GetBins( size_t iExp ) const { 00079 00080 return m_veccQuants[ iExp ]; } 00081 00082 bool m_fContinuous; 00083 std::vector<size_t> m_veciMapping; 00084 std::vector<std::string> m_vecstrGenes; 00085 std::vector<unsigned char> m_veccQuants; 00086 }; 00087 00088 class CDatasetImpl : protected CDataImpl { 00089 protected: 00090 CDatasetImpl( ); 00091 ~CDatasetImpl( ); 00092 00093 void Reset( ); 00094 bool Open( const CDataPair&, size_t ); 00095 bool Open( const CDataPair*, const char*, const IBayesNet* ); 00096 void SaveText( std::ostream& ) const; 00097 void SaveBinary( std::ostream& ) const; 00098 float GetContinuous( size_t, size_t, size_t ) const; 00099 00100 void** m_apData; 00101 }; 00102 00103 class CDataOverlayImpl { 00104 protected: 00105 CDataOverlayImpl( ) : m_pDataset(NULL) { } 00106 00107 const std::vector<std::string>& GetGeneNames( ) const; 00108 size_t GetExperiments( ) const; 00109 size_t GetGene( const std::string& ) const; 00110 size_t GetBins( size_t ) const; 00111 size_t GetGenes( ) const; 00112 bool IsHidden( size_t ) const; 00113 size_t GetDiscrete( size_t, size_t, size_t ) const; 00114 float GetContinuous( size_t, size_t, size_t ) const; 00115 const std::string& GetGene( size_t ) const; 00116 void Save( std::ostream&, bool ) const; 00117 00118 const IDataset* m_pDataset; 00119 }; 00120 00121 class CDataMaskImpl : protected CDataOverlayImpl { 00122 protected: 00123 CBinaryMatrix m_Mask; 00124 }; 00125 00126 class CDataFilterImpl : protected CDataOverlayImpl { 00127 protected: 00128 CDataFilterImpl( ) : m_pGenes(NULL), m_pAnswers(NULL) { } 00129 00130 const CGenes* m_pGenes; 00131 CDat::EFilter m_eFilter; 00132 std::vector<bool> m_vecfGenes; 00133 const CDat* m_pAnswers; 00134 std::vector<size_t> m_veciAnswers; 00135 }; 00136 00137 class CDataSubsetImpl : protected CDataImpl { 00138 protected: 00139 bool Open( const CDataPair&, size_t ); 00140 00141 size_t m_iSize; 00142 size_t m_iOffset; 00143 std::vector<std::string> m_vecstrData; 00144 CFullMatrix<CExampleImpl> m_Examples; 00145 }; 00146 00147 class CDatasetCompactImpl : protected CDataImpl { 00148 protected: 00149 CDatasetCompactImpl( ); 00150 ~CDatasetCompactImpl( ); 00151 00152 bool Open( const CDataPair&, size_t ); 00153 bool Open( const char*, const IBayesNet*, const CGenes* = NULL, const CGenes* = NULL ); 00154 bool Open( const unsigned char* ); 00155 virtual void Remove( size_t, size_t ); 00156 size_t GetDiscrete( size_t, size_t, size_t ) const; 00157 void SaveText( std::ostream& ) const; 00158 void SaveBinary( std::ostream& ) const; 00159 virtual bool IsExample( size_t, size_t ) const; 00160 00161 uint32_t m_iData; 00162 CCompactMatrix* m_aData; 00163 }; 00164 00165 } 00166 00167 #endif // DATASETI_H