Sleipnir
src/pcl.h
00001 /*****************************************************************************
00002  * This file is provided under the Creative Commons Attribution 3.0 license.
00003  *
00004  * You are free to share, copy, distribute, transmit, or adapt this work
00005  * PROVIDED THAT you attribute the work to the authors listed below.
00006  * For more information, please see the following web page:
00007  * http://creativecommons.org/licenses/by/3.0/
00008  *
00009  * This file is a component of the Sleipnir library for functional genomics,
00010  * authored by:
00011  * Curtis Huttenhower (chuttenh@princeton.edu)
00012  * Mark Schroeder
00013  * Maria D. Chikina
00014  * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015  *
00016  * If you use this library, the included executable tools, or any related
00017  * code in your work, please cite the following publication:
00018  * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019  * Olga G. Troyanskaya.
00020  * "The Sleipnir library for computational functional genomics"
00021  *****************************************************************************/
00022 #ifndef PCL_H
00023 #define PCL_H
00024 
00025 #include <algorithm>
00026 #include <fstream>
00027 #include <string>
00028 
00029 #include "pcli.h"
00030 #include "measure.h"
00031 
00032 namespace Sleipnir {
00033 
00034 class CDat;
00035 
00069 class CPCL: protected CPCLImpl {
00070 public:
00075     enum ENormalize {
00080         ENormalizeNone,
00085         ENormalizeZScore,
00090         ENormalizeRow,
00096         ENormalizeMinMax,
00101         ENormalizeColumn,
00107         ENormalizeMean,
00112         ENormalizeColumnCenter,
00117         ENormalizeColumnFraction,
00118         EMeanSubtractColumn
00119     };
00120 
00121     static int Distance(const char* szFile, size_t iSkip,
00122             const char* szSimilarityMeasure, bool fNormalize, bool fZScore,
00123             bool fAutocorrelate, const char* szGeneFile, float dCutoff,
00124             size_t iLimit, CPCL& PCL, CDat& Dat, IMeasure::EMap eMap =
00125                     IMeasure::EMapCenter, bool fFrequencyWeight = false, float dAlpha = 0, int nThreads = 1);
00126 
00127     static int Distance(const char* szFile, size_t iSkip, const char* szWeights,
00128             const char* szSimilarityMeasure, bool fNormalize, bool fZScore,
00129             bool fAutocorrelate, const char* szGeneFile, float dCutoff,
00130             size_t iLimit, CPCL& PCL, CDat& Dat, IMeasure::EMap eMap =
00131                     IMeasure::EMapCenter, bool fFrequencyWeight = false, float dAlpha = 0, int nThreads = 1);
00132 
00140     static size_t GetSkip() {
00141 
00142         return c_iSkip;
00143     }
00144 
00152     static const char* GetExtension() {
00153 
00154         return CPCLImpl::c_szExtension;
00155     }
00156 
00167     CPCL(bool fHeader = true) :
00168         CPCLImpl(fHeader) {
00169     }
00170 
00171     void Open(const CPCL& PCL);
00172     void Open(const std::vector<size_t>& veciGenes, const std::vector<
00173             std::string>& vecstrGenes,
00174             const std::vector<std::string>& vecstrExperiments);
00175     void Open(const std::vector<std::string>& vecstrGenes, const std::vector<
00176             std::string>& vecstrExperiments,
00177             const std::vector<std::string>& vecstrFeatures);
00178     bool OpenBinary(std::istream& istm);
00179     bool Save(const char* szFile, const std::vector<size_t>* pveciGenes = NULL) const;
00180     void Save(std::ostream& ostm, const std::vector<size_t>* pveciGenes = NULL) const;
00181     void SaveBinary(std::ostream& ostm) const;
00182     void SaveGene(std::ostream& ostm, size_t iGene, size_t iOriginal = -1) const;
00183     void SaveHeader(std::ostream& ostm, bool fCDT = false) const;
00184     bool SortGenes(const std::vector<size_t>& veciOrder);
00185     void RankTransform();
00186     bool AddGenes(const std::vector<std::string>& vecstrGenes);
00187     void Normalize(ENormalize eNormalize = ENormalizeRow);
00188     void Impute(size_t iNeighbors, float dMinimumPresent,
00189             const CDat& DatSimilarity);
00190     void Impute(size_t iNeighbors, float dMinimumPresent,
00191             const IMeasure* pMeasure, bool fPrecompute = true);
00192     void MedianMultiples(size_t iSample = 100000, size_t iBins = 40,
00193             float dBinSize = 0.25);
00194     
00195     bool populate(const char* szFile, float dDefault = HUGE_VAL);
00196     
00210     void Save(const char* szFile = NULL);
00211 
00232     bool Open(const char* szFile, size_t iSkip = 2, bool fMemmap = false, bool rTable = false);
00233     
00234     bool Open(std::istream& istm, size_t iSkip, bool rTable = false);
00235 
00249     bool Open(std::istream& istm) {
00250 
00251         return Open(istm, GetSkip());
00252     }
00253 
00258     void Reset() {
00259 
00260         CPCLImpl::Reset();
00261     }
00262 
00267     void Clear() {
00268 
00269         m_Data.Clear();
00270     }
00271 
00279     size_t GetFeatures() const {
00280 
00281         return m_vecstrFeatures.size();
00282     }
00283 
00297     const std::string& GetFeature(size_t iFeature) const {
00298 
00299         return m_vecstrFeatures[iFeature];
00300     }
00301 
00319     const std::string& GetFeature(size_t iGene, size_t iFeature) const {
00320 
00321         return m_vecvecstrFeatures[iFeature - 1][iGene];
00322     }
00323 
00341     void SetFeature(size_t iGene, size_t iFeature, const std::string& strValue) {
00342 
00343         m_vecvecstrFeatures[iFeature-1][iGene] = strValue;
00344     }
00345 
00366     float& Get(size_t iGene, size_t iExperiment) const {
00367 
00368         return m_Data.Get(iGene, iExperiment);
00369     }
00370 
00388     float* Get(size_t iGene) const {
00389 
00390         return m_Data.Get(iGene);
00391     }
00392 
00410     void Set(size_t iGene, const float* adValues) {
00411 
00412         m_Data.Set(iGene, adValues);
00413     }
00414 
00422     const CDataMatrix& Get() const {
00423 
00424         return m_Data;
00425     }
00426 
00437     size_t GetGenes() const {
00438 
00439         return m_vecstrGenes.size();
00440     }
00441 
00452     const std::vector<std::string>& GetGeneNames() const {
00453 
00454         return m_vecstrGenes;
00455     }
00456 
00457     const std::vector<std::string>& GetExperimentNames() const {
00458         return m_vecstrExperiments;
00459     }
00470     size_t GetExperiments() const {
00471 
00472         return m_vecstrExperiments.size();
00473     }
00474 
00488     const std::string& GetGene(size_t iGene) const {
00489 
00490         return m_vecstrGenes[iGene];
00491     }
00492 
00509     void SetGene(size_t iGene, const std::string& strGene) {
00510 
00511         CPCLImpl::SetGene(iGene, strGene);
00512     }
00513 
00527     const std::string& GetExperiment(size_t iExperiment) const {
00528 
00529         return m_vecstrExperiments[iExperiment];
00530     }
00531 
00545     size_t GetExperiment(const std::string& strExperiment) const {
00546         size_t i;
00547 
00548         for (i = 0; i < m_vecstrExperiments.size(); ++i)
00549             if (m_vecstrExperiments[i] == strExperiment)
00550                 return i;
00551 
00552         return -1;
00553     }
00554 
00568     void SetExperiment(size_t iExperiment, const std::string& strExperiment) {
00569 
00570         m_vecstrExperiments[iExperiment] = strExperiment;
00571     }
00572 
00592     void MaskGene(size_t iGene, bool fMask = true) {
00593 
00594         if (fMask)
00595             m_setiGenes.insert(iGene);
00596         else
00597             m_setiGenes.erase(iGene);
00598     }
00599 
00616     bool IsMasked(size_t iGene) const {
00617 
00618         return (m_setiGenes.find(iGene) != m_setiGenes.end());
00619     }
00620 
00641     void Set(size_t iGene, size_t iExperiment, float dValue) {
00642 
00643         m_Data.Set(iGene, iExperiment, dValue);
00644     }
00645 
00659     size_t GetGene(const std::string& strGene) const {
00660         TMapStrI::const_iterator iterGene;
00661 
00662         return (((iterGene = m_mapstriGenes.find(strGene))
00663                 == m_mapstriGenes.end()) ? -1 : iterGene->second);
00664     }
00665 
00683     std::string GetFeature(size_t iGene, const char* szFeature) const {
00684         size_t i;
00685 
00686         for (i = 0; i < m_vecstrFeatures.size(); ++i)
00687             if (m_vecstrFeatures[i] == szFeature)
00688                 return GetFeature(iGene, i);
00689 
00690         return "";
00691     }
00692 
00693     size_t AddFeature(string strName){
00694         m_vecstrFeatures.push_back(strName);
00695         size_t iF=m_vecstrFeatures.size()-1;
00696         m_vecvecstrFeatures.resize(iF);
00697         m_vecvecstrFeatures[iF-1].resize(GetGenes());
00698         return iF;
00699     }
00700 
00707     void Randomize() {
00708         size_t i;
00709 
00710         for (i = 0; i < GetGenes(); ++i)
00711             std::random_shuffle(Get(i), Get(i) + GetExperiments());
00712     }
00713 };
00714 
00715 }
00716 
00717 #endif // PCL_H