// Sleipnir
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #ifndef PCL_H 00023 #define PCL_H 00024 00025 #include <algorithm> 00026 #include <fstream> 00027 #include <string> 00028 00029 #include "pcli.h" 00030 #include "measure.h" 00031 00032 namespace Sleipnir { 00033 00034 class CDat; 00035 00069 class CPCL: protected CPCLImpl { 00070 public: 00075 enum ENormalize { 00080 ENormalizeNone, 00085 ENormalizeZScore, 00090 ENormalizeRow, 00096 ENormalizeMinMax, 00101 ENormalizeColumn, 00107 ENormalizeMean, 00112 ENormalizeColumnCenter, 00117 ENormalizeColumnFraction, 00118 EMeanSubtractColumn 00119 }; 00120 00121 static int Distance(const char* szFile, size_t iSkip, 00122 const char* szSimilarityMeasure, bool fNormalize, bool fZScore, 00123 bool fAutocorrelate, const char* szGeneFile, float dCutoff, 00124 size_t iLimit, CPCL& PCL, CDat& Dat, IMeasure::EMap eMap = 00125 IMeasure::EMapCenter, bool fFrequencyWeight = 
false, float dAlpha = 0, int nThreads = 1); 00126 00127 static int Distance(const char* szFile, size_t iSkip, const char* szWeights, 00128 const char* szSimilarityMeasure, bool fNormalize, bool fZScore, 00129 bool fAutocorrelate, const char* szGeneFile, float dCutoff, 00130 size_t iLimit, CPCL& PCL, CDat& Dat, IMeasure::EMap eMap = 00131 IMeasure::EMapCenter, bool fFrequencyWeight = false, float dAlpha = 0, int nThreads = 1); 00132 00140 static size_t GetSkip() { 00141 00142 return c_iSkip; 00143 } 00144 00152 static const char* GetExtension() { 00153 00154 return CPCLImpl::c_szExtension; 00155 } 00156 00167 CPCL(bool fHeader = true) : 00168 CPCLImpl(fHeader) { 00169 } 00170 00171 void Open(const CPCL& PCL); 00172 void Open(const std::vector<size_t>& veciGenes, const std::vector< 00173 std::string>& vecstrGenes, 00174 const std::vector<std::string>& vecstrExperiments); 00175 void Open(const std::vector<std::string>& vecstrGenes, const std::vector< 00176 std::string>& vecstrExperiments, 00177 const std::vector<std::string>& vecstrFeatures); 00178 bool OpenBinary(std::istream& istm); 00179 bool Save(const char* szFile, const std::vector<size_t>* pveciGenes = NULL) const; 00180 void Save(std::ostream& ostm, const std::vector<size_t>* pveciGenes = NULL) const; 00181 void SaveBinary(std::ostream& ostm) const; 00182 void SaveGene(std::ostream& ostm, size_t iGene, size_t iOriginal = -1) const; 00183 void SaveHeader(std::ostream& ostm, bool fCDT = false) const; 00184 bool SortGenes(const std::vector<size_t>& veciOrder); 00185 void RankTransform(); 00186 bool AddGenes(const std::vector<std::string>& vecstrGenes); 00187 void Normalize(ENormalize eNormalize = ENormalizeRow); 00188 void Impute(size_t iNeighbors, float dMinimumPresent, 00189 const CDat& DatSimilarity); 00190 void Impute(size_t iNeighbors, float dMinimumPresent, 00191 const IMeasure* pMeasure, bool fPrecompute = true); 00192 void MedianMultiples(size_t iSample = 100000, size_t iBins = 40, 00193 float dBinSize = 
0.25); 00194 00195 bool populate(const char* szFile, float dDefault = HUGE_VAL); 00196 00210 void Save(const char* szFile = NULL); 00211 00232 bool Open(const char* szFile, size_t iSkip = 2, bool fMemmap = false, bool rTable = false); 00233 00234 bool Open(std::istream& istm, size_t iSkip, bool rTable = false); 00235 00249 bool Open(std::istream& istm) { 00250 00251 return Open(istm, GetSkip()); 00252 } 00253 00258 void Reset() { 00259 00260 CPCLImpl::Reset(); 00261 } 00262 00267 void Clear() { 00268 00269 m_Data.Clear(); 00270 } 00271 00279 size_t GetFeatures() const { 00280 00281 return m_vecstrFeatures.size(); 00282 } 00283 00297 const std::string& GetFeature(size_t iFeature) const { 00298 00299 return m_vecstrFeatures[iFeature]; 00300 } 00301 00319 const std::string& GetFeature(size_t iGene, size_t iFeature) const { 00320 00321 return m_vecvecstrFeatures[iFeature - 1][iGene]; 00322 } 00323 00341 void SetFeature(size_t iGene, size_t iFeature, const std::string& strValue) { 00342 00343 m_vecvecstrFeatures[iFeature-1][iGene] = strValue; 00344 } 00345 00366 float& Get(size_t iGene, size_t iExperiment) const { 00367 00368 return m_Data.Get(iGene, iExperiment); 00369 } 00370 00388 float* Get(size_t iGene) const { 00389 00390 return m_Data.Get(iGene); 00391 } 00392 00410 void Set(size_t iGene, const float* adValues) { 00411 00412 m_Data.Set(iGene, adValues); 00413 } 00414 00422 const CDataMatrix& Get() const { 00423 00424 return m_Data; 00425 } 00426 00437 size_t GetGenes() const { 00438 00439 return m_vecstrGenes.size(); 00440 } 00441 00452 const std::vector<std::string>& GetGeneNames() const { 00453 00454 return m_vecstrGenes; 00455 } 00456 00457 const std::vector<std::string>& GetExperimentNames() const { 00458 return m_vecstrExperiments; 00459 } 00470 size_t GetExperiments() const { 00471 00472 return m_vecstrExperiments.size(); 00473 } 00474 00488 const std::string& GetGene(size_t iGene) const { 00489 00490 return m_vecstrGenes[iGene]; 00491 } 00492 00509 void 
SetGene(size_t iGene, const std::string& strGene) { 00510 00511 CPCLImpl::SetGene(iGene, strGene); 00512 } 00513 00527 const std::string& GetExperiment(size_t iExperiment) const { 00528 00529 return m_vecstrExperiments[iExperiment]; 00530 } 00531 00545 size_t GetExperiment(const std::string& strExperiment) const { 00546 size_t i; 00547 00548 for (i = 0; i < m_vecstrExperiments.size(); ++i) 00549 if (m_vecstrExperiments[i] == strExperiment) 00550 return i; 00551 00552 return -1; 00553 } 00554 00568 void SetExperiment(size_t iExperiment, const std::string& strExperiment) { 00569 00570 m_vecstrExperiments[iExperiment] = strExperiment; 00571 } 00572 00592 void MaskGene(size_t iGene, bool fMask = true) { 00593 00594 if (fMask) 00595 m_setiGenes.insert(iGene); 00596 else 00597 m_setiGenes.erase(iGene); 00598 } 00599 00616 bool IsMasked(size_t iGene) const { 00617 00618 return (m_setiGenes.find(iGene) != m_setiGenes.end()); 00619 } 00620 00641 void Set(size_t iGene, size_t iExperiment, float dValue) { 00642 00643 m_Data.Set(iGene, iExperiment, dValue); 00644 } 00645 00659 size_t GetGene(const std::string& strGene) const { 00660 TMapStrI::const_iterator iterGene; 00661 00662 return (((iterGene = m_mapstriGenes.find(strGene)) 00663 == m_mapstriGenes.end()) ? 
-1 : iterGene->second); 00664 } 00665 00683 std::string GetFeature(size_t iGene, const char* szFeature) const { 00684 size_t i; 00685 00686 for (i = 0; i < m_vecstrFeatures.size(); ++i) 00687 if (m_vecstrFeatures[i] == szFeature) 00688 return GetFeature(iGene, i); 00689 00690 return ""; 00691 } 00692 00693 size_t AddFeature(string strName){ 00694 m_vecstrFeatures.push_back(strName); 00695 size_t iF=m_vecstrFeatures.size()-1; 00696 m_vecvecstrFeatures.resize(iF); 00697 m_vecvecstrFeatures[iF-1].resize(GetGenes()); 00698 return iF; 00699 } 00700 00707 void Randomize() { 00708 size_t i; 00709 00710 for (i = 0; i < GetGenes(); ++i) 00711 std::random_shuffle(Get(i), Get(i) + GetExperiments()); 00712 } 00713 }; 00714 00715 } 00716 00717 #endif // PCL_H