Sleipnir
src/seekdataset.h
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #ifndef SEEKDATASET_H
00023 #define SEEKDATASET_H
00024 
00025 #include "seekbasic.h"
00026 #include "seekmap.h"
00027 #include "datapair.h"
00028 #include "seekplatform.h"
00029 
00030 namespace Sleipnir {
00031 
00032 class CSeekDBSetting{
00033 public:
00034     CSeekDBSetting(const string &gvar,
00035         const string &sinfo, const string &plat,
00036         const string &prep, const string &db,
00037         const string &gene, const string &quant,
00038         const string &dset, const utype &numDB){
00039         m_gvarDirectory = gvar;
00040         m_sinfoDirectory = sinfo;
00041         m_platformDirectory = plat;
00042         m_prepDirectory = prep;
00043         m_dbDirectory = db;
00044         m_geneMapFile = gene;
00045         m_quantFile = quant;
00046         m_dsetFile = dset;
00047         m_numDB = numDB;
00048     }
00049     CSeekDBSetting(const char *gvar,
00050         const char* sinfo, const char* plat,
00051         const char* prep, const char* db,
00052         const char* gene, const char* quant,
00053         const char* dset, const utype &numDB){
00054         m_gvarDirectory = gvar;
00055         m_sinfoDirectory = sinfo;
00056         m_platformDirectory = plat;
00057         m_prepDirectory = prep;
00058         m_dbDirectory = db;
00059         m_geneMapFile = gene;
00060         m_quantFile = quant;
00061         m_dsetFile = dset;
00062         m_numDB = numDB;
00063     }
00064 
00065     ~CSeekDBSetting(){
00066     }
00067 
00068     string GetValue(const string &str){
00069         if(str=="gene")
00070             return m_geneMapFile;
00071         else if(str=="dset")
00072             return m_dsetFile;
00073         else if(str=="quant")
00074             return m_quantFile;
00075         else if(str=="gvar")
00076             return m_gvarDirectory;
00077         else if(str=="sinfo")
00078             return m_sinfoDirectory;
00079         else if(str=="db")
00080             return m_dbDirectory;
00081         else if(str=="prep")
00082             return m_prepDirectory;
00083         else if(str=="platform")
00084             return m_platformDirectory;
00085         else
00086             return "NULL";
00087     }
00088 
00089     utype GetNumDB(){
00090         return m_numDB;
00091     }
00092 
00093 private:
00094     string m_gvarDirectory;
00095     string m_sinfoDirectory;
00096     string m_platformDirectory;
00097     string m_prepDirectory;
00098     string m_dbDirectory;
00099     string m_geneMapFile;
00100     string m_quantFile;
00101     string m_dsetFile;
00102     utype m_numDB;
00103 };
00104 
00105 
00106 
00133 class CSeekDataset{
00134 public:
00135 
00140     enum DistanceMeasure{
00141         CORRELATION = 0, 
00142         Z_SCORE = CORRELATION + 1 
00143     };
00144 
00149     CSeekDataset();
00150 
00155     ~CSeekDataset();
00156 
00166     bool ReadDatasetAverageStdev(const string &);
00167 
00177     bool ReadGeneAverage(const string &);
00178 
00188     bool ReadGeneVariance(const string &);
00189 
00199     bool ReadGenePresence(const string &);
00200 
00207     bool InitializeGeneMap();
00208 
00217     bool InitializeQuery(const vector<utype> &);
00218 
00228     bool InitializeQueryBlock(const vector<utype> &);
00229 
00236     bool DeleteQuery();
00237 
00244     bool DeleteQueryBlock();
00245 
00270     bool InitializeDataMatrix(utype**, const vector<float> &,
00271         const utype&, const utype&, const bool=true, 
00272         const bool=false, const bool=false,
00273         const enum DistanceMeasure=Z_SCORE,
00274         const float cutoff=-1.0*CMeta::GetNaN(), 
00275         const bool=false, gsl_rng *rand=NULL);
00276 
00282     bool Copy(CSeekDataset *);
00283 
00293     utype** GetDataMatrix();
00294 
00301     unsigned char** GetMatrix();
00302 
00307     CSeekIntIntMap* GetGeneMap();
00308 
00313     CSeekIntIntMap* GetDBMap();
00314 
00319     CSeekIntIntMap* GetQueryMap();
00320 
00325     const vector<utype>& GetQuery() const;
00326 
00331     const vector<utype>& GetQueryIndex() const;
00332 
00337     float GetGeneVariance(const utype&) const;
00342     float GetGeneAverage(const utype&) const;
00347     float GetDatasetAverage() const;
00352     float GetDatasetStdev() const;
00357     utype GetNumGenes() const;
00358 
00367     bool InitializeCVWeight(const utype&);
00368 
00374     bool SetCVWeight(const utype&, const float&);
00375 
00380     float GetCVWeight(const utype&);
00381 
00386     const vector<float>& GetCVWeight() const;
00387 
00392     float GetDatasetSumWeight();
00393 
00398     void SetPlatform(CSeekPlatform &);
00403     CSeekPlatform& GetPlatform() const;
00404 
00405 private:
00406     CSeekPlatform *platform;
00407     vector<float> geneAverage;
00408     vector<float> geneVariance;
00409     vector<char> genePresence;
00410     CSeekIntIntMap *geneMap;
00411 
00412     /* previously known as sinfo file */
00413     float m_fDsetAverage;
00414     float m_fDsetStdev;
00415 
00416     CSeekIntIntMap *dbMap;
00417     CSeekIntIntMap *queryMap;
00418     vector<utype> query;
00419     vector<utype> queryIndex;
00420 
00421     utype iQuerySize;
00422     utype iNumGenes;
00423     utype iDBSize;
00424 
00425     vector<float> weight;
00426 
00427     utype **rData;
00428     unsigned char **r;
00429 
00430     float sum_weight;
00431     bool m_bIsNibble;
00432 };
00433 
00434 
00435 
00436 }
00437 #endif