Sleipnir
src/dati.h
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #ifndef DATI_H
00023 #define DATI_H
00024 
00025 #include <map>
00026 
00027 #include "halfmatrix.h"
00028 #include "file.h"
00029 #include "measure.h"
00030 #include "meta.h"
00031 #include "pcl.h"
00032 
00033 namespace Sleipnir {
00034 
00035 class CColor;
00036 class CGenes;
00037 class CSlim;
00038 
00039 class CDatImpl : protected CFile {
00040 protected:
00041     typedef std::map<std::string,size_t>    TMapStrI;
00042     typedef std::vector<std::string>        TVecStr;
00043     typedef std::vector<float>              TAF;
00044     typedef std::vector<TAF>                TAAF;
00045 
00046     static const size_t     c_iGeneLimit    = 100000;
00047     static const size_t     c_iNeighborhood = 40;
00048     static const size_t     c_iDegree       = 1;
00049     static const char       c_acComment[];
00050     static const CColor&    c_ColorMid;
00051     static const CColor&    c_ColorMin;
00052     static const CColor&    c_ColorMax;
00053 
00054     static size_t MapGene( TMapStrI&, TVecStr&, const std::string& );
00055     static void ResizeNaN( TAF&, size_t );
00056     static void DabGene( std::istream&, char* );
00057 
00058     CDatImpl( ) : m_pMeasure(NULL), m_pPCL(NULL), m_abData(NULL), m_iData(0), m_aadData(NULL), m_hndlData(0) { }
00059     ~CDatImpl( );
00060 
00061     void Reset( );
00062     bool OpenPCL( std::istream&, size_t, bool );
00063     bool OpenText( std::istream&, float, bool );
00064     bool OpenBinary( std::istream&, bool = false );
00065     bool OpenSparse( std::istream& );
00066     bool OpenQdab( std::istream& );
00067     bool OpenGenes( std::istream&, bool, bool );
00068     void SaveText( std::ostream& ) const;
00069     void SaveBinary( std::ostream& ) const;
00070     void SaveSparse( std::ostream& ) const;
00071     void SaveGenes( std::ostream& ) const;
00072     size_t GetGene( const std::string& ) const;
00073     void SlimCache( const CSlim&, std::vector<std::vector<size_t> >& ) const;
00074     void AveStd( double&, double&, size_t&, size_t = -1 ) const;
00075     void NormalizeMinmax( );
00076     void NormalizeMinmaxNPone( );
00077     void NormalizePCC( );
00078     void NormalizeStdev( );
00079     void NormalizeSigmoid( );
00080     void NormalizeNormCDF( );
00081     void OpenHelper( const CGenes*, float );
00082     void OpenHelper( const CGenes*, const CGenes*, float );
00083     bool OpenHelper( );
00084     bool OpenMemmap( const unsigned char* );
00085     void FilterGenesGraph( const CGenes&, std::vector<bool>&, size_t, float, bool, bool, const std::vector<float>* );
00086 
00087     struct size_t_comp {
00088         bool operator ()(size_t const& a, size_t const& b) const {
00089             return (a<b);
00090         }
00091     } size_t_comp;
00092 
00093 
00094     float* GetFullRow(const size_t &iY){
00095         float *d_array = m_Data.GetFullRow(iY);
00096         d_array[iY] = CMeta::GetNaN();
00097         return d_array;
00098     }
00099 
00100     float& Get( size_t iX, size_t iY ) const {
00101         static float    s_dRet;
00102 
00103         return ( m_pMeasure ? ( s_dRet = (float)m_pMeasure->Measure( m_pPCL->Get( iX ), m_pPCL->GetExperiments( ),
00104             m_pPCL->Get( iY ), m_pPCL->GetExperiments( ) ) ) :
00105             ( ( iX == iY ) ? ( s_dRet = CMeta::GetNaN( ) ) : m_Data.Get( iX, iY ) ) ); }
00106 
00107     bool Set( size_t iX, size_t iY, float dValue ) {
00108 
00109         if( iX == iY )
00110             return false;
00111 
00112         m_Data.Set( iX, iY, dValue );
00113         return true; }
00114 
00115     bool Set( size_t iX, const float* adValues ) {
00116 
00117         m_Data.Set( iX, adValues );
00118         return true; }
00119 
00120     size_t GetGenes( ) const {
00121 
00122         return ( m_pPCL ? m_pPCL->GetGenes( ) : m_vecstrGenes.size( ) ); }
00123 
00124     size_t GetGeneIndex(const std::string &strGene) const {
00125         std::map<std::string, size_t>::const_iterator   iterGene;
00126         return ( ( ( iterGene = m_mapstrGenes.find( strGene ) ) == m_mapstrGenes.end( ) ) ? -1 :
00127             iterGene->second );
00128     }
00129 
00130     std::string GetGene( size_t iGene ) const {
00131 
00132         return ( m_pPCL ? m_pPCL->GetGene( iGene ) : m_vecstrGenes[ iGene ] ); }
00133 
00134     const std::vector<std::string>& GetGeneNames( ) const {
00135 
00136         return ( m_pMeasure ? m_pPCL->GetGeneNames( ) : m_vecstrGenes ); }
00137 
00138     void EstimateSeekPositions(istream &istm){
00139         m_iHeader = istm.tellg();
00140         size_t i;
00141         m_veciSeekPos.resize(m_vecstrGenes.size());
00142         m_veciSeekPos[0] = 0;
00143         for(i=1; i<m_vecstrGenes.size()-1; i++){
00144             m_veciSeekPos[i] = m_veciSeekPos[i-1] +
00145                 (sizeof(float)*(m_vecstrGenes.size()-1 - i));
00146         }
00147     }
00148 
00149     float* GetRowSeek(std::istream& istm, const std::string &strGene) const;
00150     float* GetRowSeek(std::istream& istm, const size_t &ind) const;
00151     bool OpenHeader(std::istream& istm);
00152 
00153 
00154     CDistanceMatrix m_Data;
00155     TVecStr         m_vecstrGenes;
00156     std::map<std::string, size_t> m_mapstrGenes;
00157 // PCL back end
00158     CPCL*           m_pPCL;
00159     bool            m_fPCLMemory;
00160     const IMeasure* m_pMeasure;
00161     bool            m_fMeasureMemory;
00162 // Memory mapped back end
00163     unsigned char*  m_abData;
00164     size_t          m_iData;
00165     HANDLE          m_hndlData;
00166     float**         m_aadData;
00167 // Seek positions
00168     std::vector<size_t> m_veciSeekPos;
00169     size_t          m_iHeader;
00170     bool            m_fSeek;
00171     /* handle used to open this file
00172      * used for reading sparse number of values
00173      * without reading the entire file
00174      */
00175     ifstream    m_ifsm;
00176 
00177 };
00178 
00179 }
00180 
00181 #endif // DATI_H