Sleipnir
src/dat.h
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #ifndef DAT_H
00023 #define DAT_H
00024 
00025 #include <iostream>
00026 #include <string>
00027 #include <vector>
00028 
00029 #include "dati.h"
00030 
00031 namespace Sleipnir {
00032 
00033 class CGenes;
00034 class CGenome;
00035 
00075 class CDat : protected CDatImpl {
00076 public:
00084     enum EFilter {
00089         EFilterInclude      = 0,
00094         EFilterTerm         = EFilterInclude + 1,
00099         EFilterExclude      = EFilterTerm + 1,
00104         EFilterPixie        = EFilterExclude + 1,
00109         EFilterEdge         = EFilterPixie + 1,
00114         EFilterHefalmp      = EFilterEdge + 1,
00119         EFilterIncludePos   = EFilterHefalmp +1,        
00124         EFilterExEdge   = EFilterIncludePos +1
00125 
00126     };
00127 
00135     enum EFormat {
00140         EFormatBinary   = 0,
00145         EFormatText     = EFormatBinary + 1,
00150         EFormatPCL      = EFormatText + 1,
00155         EFormatSparse   = EFormatPCL + 1,
00156 
00161         EFormatQdab = EFormatSparse + 1
00162 
00163     };
00164 
00172     enum ENormalize {
00173         ENormalizeNone      = 0,
00178         ENormalizeMinMax    = ENormalizeNone + 1,
00183         ENormalizeMinMaxNPone   = ENormalizeMinMax + 1,
00188         ENormalizeZScore    = ENormalizeMinMaxNPone + 1,
00193         ENormalizeSigmoid   = ENormalizeZScore + 1,
00194         ENormalizeNormCDF   = ENormalizeSigmoid + 1,
00195         ENormalizePCC       = ENormalizeNormCDF + 1
00196     };
00197 
00198 
00199     bool Open( const char* szFile, bool fMemmap = false, size_t iSkip = 2, bool fZScore = false,
00200         bool fDuplicates = false, bool fSeek = false );
00201     bool Open( std::istream& istm, EFormat eFormat = EFormatBinary, float dDefault = HUGE_VAL,
00202         bool fDuplicates = false, size_t iSkip = 2, bool fZScore = false, bool fSeek = false );
00203     bool Open( const CSlim& Slim );
00204     bool Open( const CSlim& SlimPositives, const CSlim& SlimNonnegatives );
00205     bool Open( const std::vector<std::string>& vecstrGenes, bool fClear = true, const char* szFile = NULL );
00206     bool Open( const std::vector<std::string>& vecstrGenes, const CDistanceMatrix& MatValues );
00207     bool Open( const std::vector<CGenes*>& vecpPositives, const std::vector<CGenes*>& vecpNonnegatives,
00208         float dPValue, const CGenome& Genome, bool fIncident = false );
00209     bool Open( const CDat& DatKnown, const std::vector<CGenes*>& vecpOther, const CGenome& Genome,
00210            bool fKnownNegatives, bool fIncident = false );
00211     bool Open( const CPCL& PCL, const IMeasure* pMeasure, bool fMeasureMemory );
00212     bool Open( const CDat& Dat );
00213 
00214     bool OpenGenes( std::istream& istm, bool fBinary, bool fPCL = false );
00215     bool OpenGenes( const char* szFile, size_t iSkip = 2 );
00216     void Save( std::ostream& ostm, EFormat eFormat = EFormatBinary ) const;
00217     void Save( const char* szFile ) const;
00218     void SaveDOT( std::ostream& ostm, float dCutoff = HUGE_VAL, const CGenome* pGenome = NULL,
00219         bool fUnlabeled = false, bool fHashes = true, const std::vector<float>* pvecdColors = NULL,
00220         const std::vector<float>* pvecdBorders = NULL ) const;
00221     void SaveGDF( std::ostream& ostm, float dCutoff = HUGE_VAL ) const;
00222     void SaveNET( std::ostream& ostm, float dCutoff = HUGE_VAL ) const;
00223     void SaveMATISSE( std::ostream& ostm, float dCutoff = HUGE_VAL, const CGenome* pGenome = NULL ) const;
00224     void Invert( );
00225     void Rank( );
00226     bool FilterGenes( const char* szGenes, EFilter eFilter, size_t iLimit = -1 );
00227     void FilterGenes( const CGenes& Genes, EFilter eFilter, size_t iLimit = -1,
00228               float dEdgeAggressiveness = 0.5, bool fAbsolute = false, const std::vector<float>* pvecdWeights = NULL );
00229     void NormalizeQuantiles( size_t iQuantiles );
00230 
00231     float* GetRowSeek(const string &strGene) {
00232         return CDatImpl::GetRowSeek(m_ifsm, strGene);
00233     }
00234     float* GetRowSeek(const size_t &i){
00235         return CDatImpl::GetRowSeek(m_ifsm, i);
00236     }
00237 
00238     size_t GetGeneIndex(const string &strGene) const {
00239         return CDatImpl::GetGeneIndex(strGene);
00240     }
00241 
00242     void AveStd( double& a, double& b, size_t& c){
00243         return CDatImpl::AveStd(a, b, c);
00244     }
00245 
00246     void Clear( float dValue ) {
00247         size_t  i;
00248 
00249         for( i = 0; i < GetGenes( ); ++i )
00250             memset( Get( i ), *(int*)&dValue, ( GetGenes( ) - i - 1 ) * sizeof(*Get( i )) ); }
00251 
00252     bool AddGene( const std::string& strGene ) {
00253         std::vector<std::string>    vecstrGenes;
00254 
00255         vecstrGenes.push_back( strGene );
00256         return AddGenes( vecstrGenes ); }
00257 
00258     bool AddGenes( const std::vector<std::string>& vecstrGenes ) {
00259 
00260         if( m_pPCL || m_abData || !m_Data.SetSize( m_Data.GetSize( ) + vecstrGenes.size( ), true ) )
00261             return false;
00262 
00263         m_vecstrGenes.insert( m_vecstrGenes.end( ), vecstrGenes.begin( ), vecstrGenes.end( ) );
00264         return true; }
00265 
00279     void Normalize( ENormalize eNormalize ) {
00280 
00281         switch( eNormalize ) {
00282             case ENormalizeMinMax:
00283                 NormalizeMinmax( );
00284                 break;
00285 
00286             case ENormalizeMinMaxNPone:
00287                 NormalizeMinmaxNPone( );
00288                 break;
00289 
00290             case ENormalizeZScore:
00291                 NormalizeStdev( );
00292                 break;
00293 
00294             case ENormalizeNormCDF:
00295                 NormalizeNormCDF( );
00296                 break;
00297 
00298             case ENormalizePCC:
00299                 NormalizePCC( );
00300                 break;
00301 
00302             default:
00303                 NormalizeSigmoid( ); } }
00304 
00318     size_t GetGene( const std::string& strGene ) const {
00319 
00320         return CDatImpl::GetGene( strGene ); }
00321 
00322     float* GetFullRow( const size_t &iY ) {
00323         return CDatImpl::GetFullRow(iY);
00324     }
00325 
00326 
00347     float& Get( size_t iY, size_t iX ) const {
00348 
00349         return CDatImpl::Get( iY, iX ); }
00350 
00362     size_t GetGenes( ) const {
00363 
00364         return CDatImpl::GetGenes( ); }
00365 
00373     const CDistanceMatrix& Get( ) const {
00374 
00375         return m_Data; }
00376 
00384     CDistanceMatrix& Get( ) {
00385 
00386         return m_Data; }
00387 
00411     bool Set( size_t iY, size_t iX, float dValue ) {
00412 
00413         return CDatImpl::Set( iY, iX, dValue ); }
00414 
00428     std::string GetGene( size_t iGene ) const {
00429 
00430         return CDatImpl::GetGene( iGene ); }
00431 
00442     const std::vector<std::string>& GetGeneNames( ) const {
00443 
00444         return CDatImpl::GetGeneNames( ); }
00445 
00463     void Set( size_t iY, const float* adValues ) {
00464 
00465         m_Data.Set( iY, adValues ); }
00466 
00484     const float* Get( size_t iY ) const {
00485 
00486         return m_Data.Get( iY ); }
00487 
00505     float* Get( size_t iY ) {
00506 
00507         return m_Data.Get( iY ); }
00508 
00525     void SetGene( size_t iGene, const std::string& strGene ) {
00526 
00527         if( m_pPCL )
00528             m_pPCL->SetGene( iGene, strGene );
00529         else
00530             m_vecstrGenes[ iGene ] = strGene; }
00531 
00536     void Randomize( ) {
00537         size_t  i, j, iOne, iTwo;
00538         float   dOne, dTwo;
00539 
00540         for( i = 0; i < GetGenes( ); ++i )
00541             for( j = ( i + 1 ); j < GetGenes( ); ++j ) {
00542                 if( CMeta::IsNaN( dOne = Get( i, j ) ) )
00543                     continue;
00544                 while( true ) {
00545                     iOne = rand( ) % GetGenes( );
00546                     iTwo = rand( ) % GetGenes( );
00547                     if( iOne > iTwo )
00548                         std::swap( iOne, iTwo );
00549                     if( ( ( iOne != i ) || ( iTwo != j ) ) && !CMeta::IsNaN( dTwo = Get( iOne, iTwo ) ) )
00550                         break; }
00551                 Set( i, j, dTwo );
00552                 Set( iOne, iTwo, dOne ); } }
00553 
00554 
00555 };
00556 
00557 }
00558 
00559 #endif // DAT_H