Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #ifndef DAT_H 00023 #define DAT_H 00024 00025 #include <iostream> 00026 #include <string> 00027 #include <vector> 00028 00029 #include "dati.h" 00030 00031 namespace Sleipnir { 00032 00033 class CGenes; 00034 class CGenome; 00035 00075 class CDat : protected CDatImpl { 00076 public: 00084 enum EFilter { 00089 EFilterInclude = 0, 00094 EFilterTerm = EFilterInclude + 1, 00099 EFilterExclude = EFilterTerm + 1, 00104 EFilterPixie = EFilterExclude + 1, 00109 EFilterEdge = EFilterPixie + 1, 00114 EFilterHefalmp = EFilterEdge + 1, 00119 EFilterIncludePos = EFilterHefalmp +1, 00124 EFilterExEdge = EFilterIncludePos +1 00125 00126 }; 00127 00135 enum EFormat { 00140 EFormatBinary = 0, 00145 EFormatText = EFormatBinary + 1, 00150 EFormatPCL = EFormatText + 1, 00155 EFormatSparse = EFormatPCL + 1, 00156 00161 EFormatQdab = EFormatSparse + 1 00162 00163 }; 00164 00172 enum ENormalize { 00173 ENormalizeNone = 0, 00178 ENormalizeMinMax = ENormalizeNone + 1, 00183 ENormalizeMinMaxNPone = ENormalizeMinMax + 1, 00188 ENormalizeZScore = ENormalizeMinMaxNPone + 1, 00193 ENormalizeSigmoid = ENormalizeZScore + 1, 00194 ENormalizeNormCDF = ENormalizeSigmoid + 1, 00195 ENormalizePCC = ENormalizeNormCDF + 1 00196 }; 00197 00198 00199 bool Open( const char* szFile, bool fMemmap = false, size_t iSkip = 2, bool fZScore = false, 00200 bool fDuplicates = false, bool fSeek = false ); 00201 bool Open( std::istream& istm, EFormat eFormat = EFormatBinary, float dDefault = HUGE_VAL, 00202 bool fDuplicates = false, size_t iSkip = 2, bool fZScore = false, bool fSeek = false ); 00203 bool Open( const CSlim& Slim ); 00204 bool Open( const CSlim& SlimPositives, const CSlim& SlimNonnegatives ); 00205 bool Open( const std::vector<std::string>& vecstrGenes, bool fClear = true, const char* szFile = NULL ); 00206 bool Open( const std::vector<std::string>& vecstrGenes, const CDistanceMatrix& MatValues ); 00207 bool Open( const std::vector<CGenes*>& vecpPositives, const std::vector<CGenes*>& vecpNonnegatives, 00208 float dPValue, const CGenome& Genome, bool fIncident = false ); 00209 bool Open( const CDat& DatKnown, const std::vector<CGenes*>& vecpOther, const CGenome& Genome, 00210 bool fKnownNegatives, bool fIncident = false ); 00211 bool Open( const CPCL& PCL, const IMeasure* pMeasure, bool fMeasureMemory ); 00212 bool Open( const CDat& Dat ); 00213 00214 bool OpenGenes( std::istream& istm, bool fBinary, bool fPCL = false ); 00215 bool OpenGenes( const char* szFile, size_t iSkip = 2 ); 00216 void Save( std::ostream& ostm, EFormat eFormat = EFormatBinary ) const; 00217 void Save( const char* szFile ) const; 00218 void SaveDOT( std::ostream& ostm, float dCutoff = HUGE_VAL, const CGenome* pGenome = NULL, 00219 bool fUnlabeled = false, bool fHashes = true, const std::vector<float>* pvecdColors = NULL, 00220 const std::vector<float>* pvecdBorders = NULL ) const; 00221 void SaveGDF( std::ostream& ostm, float dCutoff = HUGE_VAL ) const; 00222 void SaveNET( std::ostream& ostm, float dCutoff = HUGE_VAL ) const; 00223 void SaveMATISSE( std::ostream& ostm, float dCutoff = HUGE_VAL, const CGenome* pGenome = NULL ) const; 00224 void Invert( ); 00225 void Rank( ); 00226 bool FilterGenes( const char* szGenes, EFilter eFilter, size_t iLimit = -1 ); 00227 void FilterGenes( const CGenes& Genes, EFilter eFilter, size_t iLimit = -1, 00228 float dEdgeAggressiveness = 0.5, bool fAbsolute = false, const std::vector<float>* pvecdWeights = NULL ); 00229 void NormalizeQuantiles( size_t iQuantiles ); 00230 00231 float* GetRowSeek(const string &strGene) { 00232 return CDatImpl::GetRowSeek(m_ifsm, strGene); 00233 } 00234 float* GetRowSeek(const size_t &i){ 00235 return CDatImpl::GetRowSeek(m_ifsm, i); 00236 } 00237 00238 size_t GetGeneIndex(const string &strGene) const { 00239 return CDatImpl::GetGeneIndex(strGene); 00240 } 00241 00242 void AveStd( double& a, double& b, size_t& c){ 00243 return CDatImpl::AveStd(a, b, c); 00244 } 00245 00246 void Clear( float dValue ) { 00247 size_t i; 00248 00249 for( i = 0; i < GetGenes( ); ++i ) 00250 memset( Get( i ), *(int*)&dValue, ( GetGenes( ) - i - 1 ) * sizeof(*Get( i )) ); } 00251 00252 bool AddGene( const std::string& strGene ) { 00253 std::vector<std::string> vecstrGenes; 00254 00255 vecstrGenes.push_back( strGene ); 00256 return AddGenes( vecstrGenes ); } 00257 00258 bool AddGenes( const std::vector<std::string>& vecstrGenes ) { 00259 00260 if( m_pPCL || m_abData || !m_Data.SetSize( m_Data.GetSize( ) + vecstrGenes.size( ), true ) ) 00261 return false; 00262 00263 m_vecstrGenes.insert( m_vecstrGenes.end( ), vecstrGenes.begin( ), vecstrGenes.end( ) ); 00264 return true; } 00265 00279 void Normalize( ENormalize eNormalize ) { 00280 00281 switch( eNormalize ) { 00282 case ENormalizeMinMax: 00283 NormalizeMinmax( ); 00284 break; 00285 00286 case ENormalizeMinMaxNPone: 00287 NormalizeMinmaxNPone( ); 00288 break; 00289 00290 case ENormalizeZScore: 00291 NormalizeStdev( ); 00292 break; 00293 00294 case ENormalizeNormCDF: 00295 NormalizeNormCDF( ); 00296 break; 00297 00298 case ENormalizePCC: 00299 NormalizePCC( ); 00300 break; 00301 00302 default: 00303 NormalizeSigmoid( ); } } 00304 00318 size_t GetGene( const std::string& strGene ) const { 00319 00320 return CDatImpl::GetGene( strGene ); } 00321 00322 float* GetFullRow( const size_t &iY ) { 00323 return CDatImpl::GetFullRow(iY); 00324 } 00325 00326 00347 float& Get( size_t iY, size_t iX ) const { 00348 00349 return CDatImpl::Get( iY, iX ); } 00350 00362 size_t GetGenes( ) const { 00363 00364 return CDatImpl::GetGenes( ); } 00365 00373 const CDistanceMatrix& Get( ) const { 00374 00375 return m_Data; } 00376 00384 CDistanceMatrix& Get( ) { 00385 00386 return m_Data; } 00387 00411 bool Set( size_t iY, size_t iX, float dValue ) { 00412 00413 return CDatImpl::Set( iY, iX, dValue ); } 00414 00428 std::string GetGene( size_t iGene ) const { 00429 00430 return CDatImpl::GetGene( iGene ); } 00431 00442 const std::vector<std::string>& GetGeneNames( ) const { 00443 00444 return CDatImpl::GetGeneNames( ); } 00445 00463 void Set( size_t iY, const float* adValues ) { 00464 00465 m_Data.Set( iY, adValues ); } 00466 00484 const float* Get( size_t iY ) const { 00485 00486 return m_Data.Get( iY ); } 00487 00505 float* Get( size_t iY ) { 00506 00507 return m_Data.Get( iY ); } 00508 00525 void SetGene( size_t iGene, const std::string& strGene ) { 00526 00527 if( m_pPCL ) 00528 m_pPCL->SetGene( iGene, strGene ); 00529 else 00530 m_vecstrGenes[ iGene ] = strGene; } 00531 00536 void Randomize( ) { 00537 size_t i, j, iOne, iTwo; 00538 float dOne, dTwo; 00539 00540 for( i = 0; i < GetGenes( ); ++i ) 00541 for( j = ( i + 1 ); j < GetGenes( ); ++j ) { 00542 if( CMeta::IsNaN( dOne = Get( i, j ) ) ) 00543 continue; 00544 while( true ) { 00545 iOne = rand( ) % GetGenes( ); 00546 iTwo = rand( ) % GetGenes( ); 00547 if( iOne > iTwo ) 00548 std::swap( iOne, iTwo ); 00549 if( ( ( iOne != i ) || ( iTwo != j ) ) && !CMeta::IsNaN( dTwo = Get( iOne, iTwo ) ) ) 00550 break; } 00551 Set( i, j, dTwo ); 00552 Set( iOne, iTwo, dOne ); } } 00553 00554 00555 }; 00556 00557 } 00558 00559 #endif // DAT_H