// Sleipnir
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #ifndef SEEKCENTRAL_H 00023 #define SEEKCENTRAL_H 00024 00025 #include "seekbasic.h" 00026 #include "seekdataset.h" 00027 #include "seekplatform.h" 00028 #include "seekmap.h" 00029 #include "seekreader.h" 00030 #include "seekquery.h" 00031 #include "seekevaluate.h" 00032 #include "database.h" 00033 #include "datapair.h" 00034 #include "seekweight.h" 00035 00036 namespace Sleipnir { 00037 00081 class CSeekCentral{ 00082 public: 00083 00088 enum SearchMode{ 00089 CV=0, 00090 EQUAL=1, 00091 USE_WEIGHT=2, 00092 CV_CUSTOM=3, 00095 ORDER_STATISTICS=4, 00096 AVERAGE_Z=5 00097 }; 00098 00102 CSeekCentral(); 00103 00107 ~CSeekCentral(); 00108 00161 bool Initialize( 00162 const vector<CSeekDBSetting*> &vecDBSetting, 00163 const char *search_dset, const char *query, 00164 const char* output_dir, 00165 const utype buffer = 20, const bool to_output_text = false, 00166 
const bool bOutputWeightComponent = false, const bool bSimulateWeight = false, 00167 const enum CSeekDataset::DistanceMeasure dist_measure = CSeekDataset::Z_SCORE, 00168 const bool bSubtractAvg = true, const bool bNormPlatform = false, 00169 const bool bLogit = false, const float fCutOff = -9999, 00170 const float fPercentQueryRequired = 0, const float fPercentGenomeRequired = 0, 00171 const bool bSquareZ = false, const bool bRandom = false, const int iNumRandom = 10, 00172 gsl_rng *rand = NULL, const bool useNibble = false, const int numThreads = 8); 00173 00217 bool Initialize( 00218 const vector<CSeekDBSetting*> &vecDBSetting, 00219 const utype buffer = 20, const bool to_output_text = false, 00220 const bool bOutputWeightComponent = false, const bool bSimulateWeight = false, 00221 const enum CSeekDataset::DistanceMeasure dist_measure = CSeekDataset::Z_SCORE, 00222 const bool bSubtractAvg = true, const bool bNormPlatform = false, 00223 const bool bLogit = false, const float fCutOff = -9999, 00224 const float fPercentQueryRequired = 0, const float fPercentGenomeRequired = 0, 00225 const bool bSquareZ = false, const bool bRandom = false, const int iNumRandom = 10, 00226 gsl_rng *rand = NULL, const bool useNibble = false, const int numThreads = 8); 00227 00249 bool Initialize(const string &output_dir, const string &query, 00250 const string &search_dset, CSeekCentral* src, const int iClient, 00251 const float query_min_required = 0, const float genome_min_required = 0, 00252 const enum CSeekDataset::DistanceMeasure = CSeekDataset::Z_SCORE, 00253 const bool bSubtractGeneAvg = true, const bool bNormPlatform = false); 00254 00266 bool CVSearch(gsl_rng*, const CSeekQuery::PartitionMode&, const utype&, const float&); 00267 00283 bool CVCustomSearch(const vector< vector<string> > &, gsl_rng*, 00284 const CSeekQuery::PartitionMode&, const utype&, const float&); 00285 00290 bool EqualWeightSearch(); 00291 00302 bool WeightSearch(const vector<vector<float> >&); 00303 00311 
bool VarianceWeightSearch(); 00312 00318 bool AverageWeightSearch(); 00319 00325 bool OrderStatistics(); 00326 00331 const vector< vector<AResultFloat> >& GetAllResult()const; 00332 00337 const vector<CSeekQuery>& GetAllQuery() const; 00338 00345 const vector<vector<float> > &GetAllWeight() const; 00346 00352 utype GetGene(const string &strGene) const; 00353 00359 string GetGene(const utype &geneID) const; 00360 00365 bool Destruct(); 00366 00371 int GetMaxGenomeCoverage(); 00372 00373 private: 00374 //network mode 00375 bool EnableNetwork(const int&); 00376 bool CheckDatasets(const bool&); 00377 00378 /* Central search function */ 00379 bool Common(CSeekCentral::SearchMode&, gsl_rng* = NULL, 00380 const CSeekQuery::PartitionMode* = NULL, 00381 const utype* = NULL, const float* = NULL, 00382 const vector< vector<float> >* = NULL, 00383 const vector< vector<string> >* = NULL); 00384 00385 bool CheckWeight(const utype &i); 00386 bool CopyTopGenes(CSeekQuery&, const vector<AResultFloat>&, 00387 const utype); 00388 bool SetQueryScoreNull(const CSeekQuery&); 00389 bool PrepareQuery(const vector<string>&, CSeekQuery&); 00390 bool CalculateRestart(); 00391 bool PrepareOneQuery(CSeekQuery &, CSeekIntIntMap &, vector<float>&); 00392 bool AggregateThreads(); 00393 bool FilterResults(const utype &); 00394 bool Sort(vector<AResultFloat> &); 00395 bool Write(const utype &); 00396 bool Display(CSeekQuery &, vector<AResultFloat>&); 00397 00398 /* Gene, Dataset, and Platform Mapping*/ 00399 vector<string> m_vecstrGenes; 00400 vector<string> m_vecstrDatasets; 00401 vector<string> m_vecstrDP; 00402 map<string, string> m_mapstrstrDatasetPlatform; 00403 map<string, utype> m_mapstrintDataset; 00404 map<string, utype> m_mapstrintGene; 00405 vector<vector<string> > m_vecstrSearchDatasets; 00406 vector<CSeekIntIntMap*> m_searchdsetMap; 00407 00408 /* Datasets */ 00409 vector<CSeekDataset*> m_vc; 00410 00411 /* Output */ 00412 bool m_bOutputText; 00413 00414 /* If true, output random case 
(ie shuffle rankings per dataset) 00415 iNumRandom: number of repetitions (Oct 26, 2012) */ 00416 bool m_bRandom; 00417 int m_iNumRandom; 00418 gsl_rng *m_randRandom; 00419 /* random dataset weight over all repetitions */ 00420 //vector<vector<float> > m_vecRandWeight; 00421 /* random gene scores over all repetitions */ 00422 //vector<vector<float> > m_vecRandScore; 00423 00424 /* Gene-gene correlation matrix for all datasets 00425 Organized per thread */ 00426 utype ***m_rData; 00427 00428 /* Correlation discretization */ 00429 vector<float> m_quant; 00430 00431 /* Correlation transformation options */ 00432 bool m_bSubtractGeneAvg; 00433 bool m_bNormPlatform; 00434 enum CSeekDataset::DistanceMeasure m_eDistMeasure; 00435 bool m_bLogit; 00436 bool m_bSquareZ; 00437 00438 /* multi-threaded programming */ 00439 float **m_master_rank_threads; 00440 float **m_sum_weight_threads; 00441 float **m_sum_sq_weight_threads; 00442 utype **m_counts_threads; 00443 vector<utype> *m_rank_normal_threads; 00444 vector<utype> *m_rank_threads; 00445 00446 /* Essential search results */ 00447 vector<float> m_master_rank; 00448 vector<float> m_sum_weight; 00449 vector<float> m_sum_sq_weight; 00450 vector<utype> m_counts; 00451 00452 /* Holds results for all queries */ 00453 vector< vector<float> > m_weight; 00454 vector< vector<AResultFloat> > m_final; 00455 00456 /* Query */ 00457 vector< vector<string> > m_vecstrAllQuery; 00458 vector<CSeekQuery> m_Query; 00459 00460 /* Platform */ 00461 vector<CSeekPlatform> m_vp; 00462 map<string, utype> m_mapstriPlatform; 00463 vector<string> m_vecstrPlatform; 00464 00465 //CDatabase reference 00466 vector<CDatabase*> m_vecDB; 00467 vector<vector<string> > m_vecDBDataset; //A list of dsets in each CDatabase 00468 00469 size_t m_iDatasets; 00470 size_t m_iGenes; 00471 utype m_numThreads; 00472 00473 utype m_maxNumDB; 00474 map<utype, vector< vector<string> > > m_mapLoadTime; 00475 bool DEBUG; 00476 00477 bool m_bOutputWeightComponent; 00478 bool 
m_bSimulateWeight; 00479 00480 string m_output_dir; 00481 float m_fScoreCutOff; 00482 float m_fPercentQueryAfterScoreCutOff; 00483 float m_fPercentGenomeRequired; 00484 00485 /* for order statistics, a datasets-by-genes matrix */ 00486 utype **m_rank_d; 00487 00488 /* for network mode */ 00489 int m_iClient; 00490 bool m_bEnableNetwork; 00491 bool m_bSharedDB; //if m_DB is shared between multiple CSeekCentral instances 00492 }; 00493 00494 00495 00496 00497 00498 } 00499 #endif