Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #ifndef DATI_H 00023 #define DATI_H 00024 00025 #include <map> 00026 00027 #include "halfmatrix.h" 00028 #include "file.h" 00029 #include "measure.h" 00030 #include "meta.h" 00031 #include "pcl.h" 00032 00033 namespace Sleipnir { 00034 00035 class CColor; 00036 class CGenes; 00037 class CSlim; 00038 00039 class CDatImpl : protected CFile { 00040 protected: 00041 typedef std::map<std::string,size_t> TMapStrI; 00042 typedef std::vector<std::string> TVecStr; 00043 typedef std::vector<float> TAF; 00044 typedef std::vector<TAF> TAAF; 00045 00046 static const size_t c_iGeneLimit = 100000; 00047 static const size_t c_iNeighborhood = 40; 00048 static const size_t c_iDegree = 1; 00049 static const char c_acComment[]; 00050 static const CColor& c_ColorMid; 00051 static const CColor& c_ColorMin; 00052 static const CColor& c_ColorMax; 00053 00054 static size_t MapGene( TMapStrI&, TVecStr&, const std::string& ); 00055 static void ResizeNaN( TAF&, size_t ); 00056 static void DabGene( std::istream&, char* ); 00057 00058 CDatImpl( ) : m_pMeasure(NULL), m_pPCL(NULL), m_abData(NULL), m_iData(0), m_aadData(NULL), m_hndlData(0) { } 00059 ~CDatImpl( ); 00060 00061 void Reset( ); 00062 bool OpenPCL( std::istream&, size_t, bool ); 00063 bool OpenText( std::istream&, float, bool ); 00064 bool OpenBinary( std::istream&, bool = false ); 00065 bool OpenSparse( std::istream& ); 00066 bool OpenQdab( std::istream& ); 00067 bool OpenGenes( std::istream&, bool, bool ); 00068 void SaveText( std::ostream& ) const; 00069 void SaveBinary( std::ostream& ) const; 00070 void SaveSparse( std::ostream& ) const; 00071 void SaveGenes( std::ostream& ) const; 00072 size_t GetGene( const std::string& ) const; 00073 void SlimCache( const CSlim&, std::vector<std::vector<size_t> >& ) const; 00074 void AveStd( double&, double&, size_t&, size_t = -1 ) const; 00075 void NormalizeMinmax( ); 00076 void NormalizeMinmaxNPone( ); 00077 void NormalizePCC( ); 00078 void NormalizeStdev( ); 00079 void NormalizeSigmoid( ); 00080 void NormalizeNormCDF( ); 00081 void OpenHelper( const CGenes*, float ); 00082 void OpenHelper( const CGenes*, const CGenes*, float ); 00083 bool OpenHelper( ); 00084 bool OpenMemmap( const unsigned char* ); 00085 void FilterGenesGraph( const CGenes&, std::vector<bool>&, size_t, float, bool, bool, const std::vector<float>* ); 00086 00087 struct size_t_comp { 00088 bool operator ()(size_t const& a, size_t const& b) const { 00089 return (a<b); 00090 } 00091 } size_t_comp; 00092 00093 00094 float* GetFullRow(const size_t &iY){ 00095 float *d_array = m_Data.GetFullRow(iY); 00096 d_array[iY] = CMeta::GetNaN(); 00097 return d_array; 00098 } 00099 00100 float& Get( size_t iX, size_t iY ) const { 00101 static float s_dRet; 00102 00103 return ( m_pMeasure ? ( s_dRet = (float)m_pMeasure->Measure( m_pPCL->Get( iX ), m_pPCL->GetExperiments( ), 00104 m_pPCL->Get( iY ), m_pPCL->GetExperiments( ) ) ) : 00105 ( ( iX == iY ) ? ( s_dRet = CMeta::GetNaN( ) ) : m_Data.Get( iX, iY ) ) ); } 00106 00107 bool Set( size_t iX, size_t iY, float dValue ) { 00108 00109 if( iX == iY ) 00110 return false; 00111 00112 m_Data.Set( iX, iY, dValue ); 00113 return true; } 00114 00115 bool Set( size_t iX, const float* adValues ) { 00116 00117 m_Data.Set( iX, adValues ); 00118 return true; } 00119 00120 size_t GetGenes( ) const { 00121 00122 return ( m_pPCL ? m_pPCL->GetGenes( ) : m_vecstrGenes.size( ) ); } 00123 00124 size_t GetGeneIndex(const std::string &strGene) const { 00125 std::map<std::string, size_t>::const_iterator iterGene; 00126 return ( ( ( iterGene = m_mapstrGenes.find( strGene ) ) == m_mapstrGenes.end( ) ) ? -1 : 00127 iterGene->second ); 00128 } 00129 00130 std::string GetGene( size_t iGene ) const { 00131 00132 return ( m_pPCL ? m_pPCL->GetGene( iGene ) : m_vecstrGenes[ iGene ] ); } 00133 00134 const std::vector<std::string>& GetGeneNames( ) const { 00135 00136 return ( m_pMeasure ? m_pPCL->GetGeneNames( ) : m_vecstrGenes ); } 00137 00138 void EstimateSeekPositions(istream &istm){ 00139 m_iHeader = istm.tellg(); 00140 size_t i; 00141 m_veciSeekPos.resize(m_vecstrGenes.size()); 00142 m_veciSeekPos[0] = 0; 00143 for(i=1; i<m_vecstrGenes.size()-1; i++){ 00144 m_veciSeekPos[i] = m_veciSeekPos[i-1] + 00145 (sizeof(float)*(m_vecstrGenes.size()-1 - i)); 00146 } 00147 } 00148 00149 float* GetRowSeek(std::istream& istm, const std::string &strGene) const; 00150 float* GetRowSeek(std::istream& istm, const size_t &ind) const; 00151 bool OpenHeader(std::istream& istm); 00152 00153 00154 CDistanceMatrix m_Data; 00155 TVecStr m_vecstrGenes; 00156 std::map<std::string, size_t> m_mapstrGenes; 00157 // PCL back end 00158 CPCL* m_pPCL; 00159 bool m_fPCLMemory; 00160 const IMeasure* m_pMeasure; 00161 bool m_fMeasureMemory; 00162 // Memory mapped back end 00163 unsigned char* m_abData; 00164 size_t m_iData; 00165 HANDLE m_hndlData; 00166 float** m_aadData; 00167 // Seek positions 00168 std::vector<size_t> m_veciSeekPos; 00169 size_t m_iHeader; 00170 bool m_fSeek; 00171 /* handle used to open this file 00172 * used for reading sparse number of values 00173 * without reading the entire file 00174 */ 00175 ifstream m_ifsm; 00176 00177 }; 00178 00179 } 00180 00181 #endif // DATI_H