Sleipnir
src/seekcentral.h
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #ifndef SEEKCENTRAL_H
00023 #define SEEKCENTRAL_H
00024 
00025 #include "seekbasic.h"
00026 #include "seekdataset.h"
00027 #include "seekplatform.h"
00028 #include "seekmap.h"
00029 #include "seekreader.h"
00030 #include "seekquery.h"
00031 #include "seekevaluate.h"
00032 #include "database.h"
00033 #include "datapair.h"
00034 #include "seekweight.h"
00035 
00036 namespace Sleipnir {
00037 
/**
 * Central coordinator class of the Seek search code.
 *
 * As declared here, an instance stores gene / dataset / platform name
 * mappings, pointers to the loaded datasets and databases, the buffers
 * grouped under "multi-threaded programming", the queries, and the per-query
 * weights and results. It exposes three Initialize() overloads and several
 * search entry points (CVSearch, CVCustomSearch, EqualWeightSearch,
 * WeightSearch, VarianceWeightSearch, AverageWeightSearch, OrderStatistics).
 *
 * NOTE(review): only declarations are visible in this header. Semantic notes
 * below that are not taken from a pre-existing comment are inferred from
 * identifiers and are marked as assumptions - confirm against the
 * implementation file.
 *
 * NOTE(review): the class declares a destructor and has several raw pointer
 * members, but declares no copy constructor or copy assignment operator.
 * Whether copying an instance is safe cannot be determined from here.
 */
00081 class CSeekCentral{
00082 public:
00083     
          /**
           * Search variant selector; a value of this type is passed by
           * non-const reference to the private Common() routine.
           *
           * NOTE(review): the enumerator names suggest a correspondence with
           * the public search methods (e.g. CV with CVSearch, CV_CUSTOM with
           * CVCustomSearch, ORDER_STATISTICS with OrderStatistics) - confirm.
           * The numeric values are fixed explicitly (0..5).
           */
00088     enum SearchMode{
00089         CV=0, 
00090         EQUAL=1, 
00091         USE_WEIGHT=2, 
00092         CV_CUSTOM=3, 
00095         ORDER_STATISTICS=4, 
00096         AVERAGE_Z=5 
00097     };
00098     
          /** Default constructor (takes no arguments). */
00102     CSeekCentral();
00103     
          /**
           * Destructor.
           * NOTE(review): its relationship to Destruct() is not visible here.
           */
00107     ~CSeekCentral();
00108 
          /**
           * Initialize overload taking database settings plus C-string
           * arguments for the search-dataset list, the query and the output
           * directory, followed by a long list of defaulted options.
           *
           * NOTE(review): the parameter meanings below are inferred from the
           * parameter names only - confirm against the implementation.
           *
           * @param vecDBSetting   vector of CSeekDBSetting pointers.
           * @param search_dset    presumably names the datasets to search.
           * @param query          presumably names the query (or query file).
           * @param output_dir     presumably the directory receiving output.
           * @param buffer         default 20.
           * @param to_output_text default false.
           * @param bOutputWeightComponent default false.
           * @param bSimulateWeight        default false.
           * @param dist_measure   default CSeekDataset::Z_SCORE.
           * @param bSubtractAvg   default true.
           * @param bNormPlatform  default false.
           * @param bLogit         default false.
           * @param fCutOff        default -9999.
           * @param fPercentQueryRequired  default 0.
           * @param fPercentGenomeRequired default 0.
           * @param bSquareZ       default false.
           * @param bRandom        default false.
           * @param iNumRandom     default 10.
           * @param rand           GSL random number generator, default NULL.
           * @param useNibble      default false.
           * @param numThreads     default 8.
           * @return bool; presumably true on success - confirm.
           */
00161     bool Initialize(
00162         const vector<CSeekDBSetting*> &vecDBSetting,
00163         const char *search_dset, const char *query,
00164         const char* output_dir,
00165         const utype buffer = 20, const bool to_output_text = false,
00166         const bool bOutputWeightComponent = false, const bool bSimulateWeight = false,
00167         const enum CSeekDataset::DistanceMeasure dist_measure = CSeekDataset::Z_SCORE,
00168         const bool bSubtractAvg = true, const bool bNormPlatform = false,
00169         const bool bLogit = false, const float fCutOff = -9999, 
00170         const float fPercentQueryRequired = 0, const float fPercentGenomeRequired = 0,
00171         const bool bSquareZ = false, const bool bRandom = false, const int iNumRandom = 10,
00172         gsl_rng *rand = NULL, const bool useNibble = false, const int numThreads = 8);
00173 
          /**
           * Initialize overload with the same option list and defaults as the
           * overload above, but without the search_dset, query and output_dir
           * arguments.
           *
           * NOTE(review): how the two overloads relate (e.g. whether one
           * delegates to the other) is not visible in this header.
           *
           * @param vecDBSetting vector of CSeekDBSetting pointers.
           * @return bool; presumably true on success - confirm.
           */
00217     bool Initialize(
00218         const vector<CSeekDBSetting*> &vecDBSetting,
00219         const utype buffer = 20, const bool to_output_text = false,
00220         const bool bOutputWeightComponent = false, const bool bSimulateWeight = false,
00221         const enum CSeekDataset::DistanceMeasure dist_measure = CSeekDataset::Z_SCORE,
00222         const bool bSubtractAvg = true, const bool bNormPlatform = false,
00223         const bool bLogit = false, const float fCutOff = -9999, 
00224         const float fPercentQueryRequired = 0, const float fPercentGenomeRequired = 0,
00225         const bool bSquareZ = false, const bool bRandom = false, const int iNumRandom = 10,
00226         gsl_rng *rand = NULL, const bool useNibble = false, const int numThreads = 8);
00227 
          /**
           * Initialize overload taking std::string arguments, a pointer to
           * another CSeekCentral (src) and a client identifier (iClient).
           *
           * NOTE(review): presumably sets this instance up from an existing
           * instance for the "network mode" members declared further down
           * (m_iClient, m_bEnableNetwork, m_bSharedDB) - confirm.
           *
           * The distance-measure parameter is unnamed in this declaration;
           * its default is CSeekDataset::Z_SCORE.
           *
           * @param output_dir  presumably the directory receiving output.
           * @param query       presumably the query.
           * @param search_dset presumably names the datasets to search.
           * @param src         another CSeekCentral instance.
           * @param iClient     client identifier.
           * @param query_min_required  default 0.
           * @param genome_min_required default 0.
           * @param bSubtractGeneAvg    default true.
           * @param bNormPlatform       default false.
           * @return bool; presumably true on success - confirm.
           */
00249     bool Initialize(const string &output_dir, const string &query,
00250         const string &search_dset, CSeekCentral* src, const int iClient,
00251         const float query_min_required = 0, const float genome_min_required = 0,
00252         const enum CSeekDataset::DistanceMeasure = CSeekDataset::Z_SCORE,
00253         const bool bSubtractGeneAvg = true, const bool bNormPlatform = false);
00254 
          /**
           * Search entry point taking a GSL random number generator, a
           * CSeekQuery::PartitionMode, a utype and a float (all unnamed here).
           *
           * NOTE(review): the argument types mirror the optional pointer
           * parameters of the private Common(); presumably this forwards to
           * Common() with the CV mode - confirm.
           */
00266     bool CVSearch(gsl_rng*, const CSeekQuery::PartitionMode&, const utype&, const float&);
00267 
          /**
           * Like CVSearch but additionally takes a vector of string vectors
           * as its first argument.
           * NOTE(review): the meaning of that argument is not visible here.
           */
00283     bool CVCustomSearch(const vector< vector<string> > &, gsl_rng*,
00284         const CSeekQuery::PartitionMode&, const utype&, const float&);
00285 
          /** Search entry point without arguments; presumably the EQUAL mode - confirm. */
00290     bool EqualWeightSearch();
00291 
          /**
           * Search entry point taking a vector of float vectors.
           * NOTE(review): presumably caller-supplied weights (USE_WEIGHT mode) - confirm.
           */
00302     bool WeightSearch(const vector<vector<float> >&);
00303 
          /** Search entry point without arguments. NOTE(review): weighting scheme not visible here. */
00311     bool VarianceWeightSearch();
00312 
          /** Search entry point without arguments. NOTE(review): weighting scheme not visible here. */
00318     bool AverageWeightSearch();
00319 
          /** Search entry point without arguments; presumably the ORDER_STATISTICS mode - confirm. */
00325     bool OrderStatistics();
00326 
          /**
           * Returns a const reference to a vector of AResultFloat vectors.
           * NOTE(review): presumably the m_final member (same type) - confirm.
           */
00331     const vector< vector<AResultFloat> >& GetAllResult()const;
00332 
          /**
           * Returns a const reference to a vector of CSeekQuery.
           * NOTE(review): presumably the m_Query member (same type) - confirm.
           */
00337     const vector<CSeekQuery>& GetAllQuery() const;
00338 
          /**
           * Returns a const reference to a vector of float vectors.
           * NOTE(review): presumably the m_weight member (same type) - confirm.
           */
00345     const vector<vector<float> > &GetAllWeight() const;
00346 
          /**
           * Overload mapping a gene name (string) to a utype value.
           * NOTE(review): presumably a lookup in m_mapstrintGene; the result
           * for an unknown name is not visible here.
           */
00352     utype GetGene(const string &strGene) const;
00353 
          /**
           * Overload mapping a utype gene ID to a string.
           * NOTE(review): presumably an index into m_vecstrGenes; bounds
           * handling is not visible here.
           */
00359     string GetGene(const utype &geneID) const;
00360 
          /**
           * NOTE(review): presumably releases resources held by the instance;
           * whether the destructor also calls it is not visible here.
           */
00365     bool Destruct();
00366 
          /** Returns an int. Not declared const, unlike the other getters above. */
00371     int GetMaxGenomeCoverage();
00372 
00373 private:
00374     //network mode
00375     bool EnableNetwork(const int&);
00376     bool CheckDatasets(const bool&);
00377 
00378     /* Central search function */
          // Takes the search mode by non-const reference; every other
          // parameter is an optional pointer defaulting to NULL.
00379     bool Common(CSeekCentral::SearchMode&, gsl_rng* = NULL,
00380         const CSeekQuery::PartitionMode* = NULL,
00381         const utype* = NULL, const float* = NULL,
00382         const vector< vector<float> >* = NULL,
00383         const vector< vector<string> >* = NULL);
00384 
          // Private helpers. NOTE(review): their roles are suggested by their
          // names only; no bodies are visible in this header.
00385     bool CheckWeight(const utype &i);
00386     bool CopyTopGenes(CSeekQuery&, const vector<AResultFloat>&, 
00387         const utype);
00388     bool SetQueryScoreNull(const CSeekQuery&);
00389     bool PrepareQuery(const vector<string>&, CSeekQuery&);
00390     bool CalculateRestart();
00391     bool PrepareOneQuery(CSeekQuery &, CSeekIntIntMap &, vector<float>&);
00392     bool AggregateThreads();
00393     bool FilterResults(const utype &);
00394     bool Sort(vector<AResultFloat> &);
00395     bool Write(const utype &);
00396     bool Display(CSeekQuery &, vector<AResultFloat>&);
00397 
00398     /* Gene, Dataset, and Platform Mapping*/
00399     vector<string> m_vecstrGenes;
00400     vector<string> m_vecstrDatasets;
00401     vector<string> m_vecstrDP;
00402     map<string, string> m_mapstrstrDatasetPlatform;
00403     map<string, utype> m_mapstrintDataset;
00404     map<string, utype> m_mapstrintGene;
00405     vector<vector<string> > m_vecstrSearchDatasets;
00406     vector<CSeekIntIntMap*> m_searchdsetMap;
00407 
00408     /* Datasets */
00409     vector<CSeekDataset*> m_vc;
00410 
00411     /* Output */
00412     bool m_bOutputText;
00413 
00414     /* If true, output random case (ie shuffle rankings per dataset)
00415        iNumRandom: number of repetitions (Oct 26, 2012) */
00416     bool m_bRandom;
00417     int m_iNumRandom;
00418     gsl_rng *m_randRandom;
00419     /* random dataset weight over all repetitions */
00420     //vector<vector<float> > m_vecRandWeight; 
00421     /* random gene scores over all repetitions */
00422     //vector<vector<float> > m_vecRandScore; 
00423 
00424     /* Gene-gene correlation matrix for all datasets
00425      Organized per thread */
00426     utype ***m_rData;
00427 
00428     /* Correlation discretization */
00429     vector<float> m_quant;
00430 
00431     /* Correlation transformation options */
00432     bool m_bSubtractGeneAvg;
00433     bool m_bNormPlatform;
00434     enum CSeekDataset::DistanceMeasure m_eDistMeasure;
00435     bool m_bLogit;
00436     bool m_bSquareZ;
00437 
00438     /* multi-threaded programming */
          // NOTE(review): presumably one entry per thread, combined by
          // AggregateThreads() into the vectors below - confirm.
00439     float **m_master_rank_threads;
00440     float **m_sum_weight_threads;
00441     float **m_sum_sq_weight_threads;
00442     utype **m_counts_threads;
00443     vector<utype> *m_rank_normal_threads;
00444     vector<utype> *m_rank_threads;
00445 
00446     /* Essential search results */
00447     vector<float> m_master_rank;
00448     vector<float> m_sum_weight;
00449     vector<float> m_sum_sq_weight;
00450     vector<utype> m_counts;
00451 
00452     /* Holds results for all queries */
00453     vector< vector<float> > m_weight;
00454     vector< vector<AResultFloat> > m_final;
00455 
00456     /* Query */
00457     vector< vector<string> > m_vecstrAllQuery;
00458     vector<CSeekQuery> m_Query;
00459 
00460     /* Platform */
00461     vector<CSeekPlatform> m_vp;
00462     map<string, utype> m_mapstriPlatform;
00463     vector<string> m_vecstrPlatform;
00464 
00465     //CDatabase reference
00466     vector<CDatabase*> m_vecDB;
00467     vector<vector<string> > m_vecDBDataset; //A list of dsets in each CDatabase
00468 
          // NOTE(review): presumably the number of datasets, genes and worker
          // threads (numThreads in Initialize) - confirm.
00469     size_t m_iDatasets;
00470     size_t m_iGenes;
00471     utype m_numThreads;
00472 
00473     utype m_maxNumDB;
00474     map<utype, vector< vector<string> > > m_mapLoadTime;
          // NOTE(review): DEBUG is also a common preprocessor macro name
          // (e.g. -DDEBUG); such a definition would break this declaration.
00475     bool DEBUG;
00476 
          // NOTE(review): presumably set from the same-named Initialize options - confirm.
00477     bool m_bOutputWeightComponent;
00478     bool m_bSimulateWeight;
00479 
          // NOTE(review): presumably set from output_dir, fCutOff,
          // fPercentQueryRequired and fPercentGenomeRequired - confirm.
00480     string m_output_dir;
00481     float m_fScoreCutOff;
00482     float m_fPercentQueryAfterScoreCutOff;
00483     float m_fPercentGenomeRequired;
00484 
00485     /* for order statistics, a datasets-by-genes matrix */
00486     utype **m_rank_d;
00487 
00488     /* for network mode */
00489     int m_iClient;
00490     bool m_bEnableNetwork;
          // NOTE(review): the comment below mentions m_DB, but no member of
          // that name is declared in this class; presumably it means m_vecDB.
00491     bool m_bSharedDB; //if m_DB is shared between multiple CSeekCentral instances
00492 };
00493 
00494 
00495 
00496 
00497 
00498 }
00499 #endif