Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "seekcentral.h" 00023 00024 namespace Sleipnir { 00025 00026 CSeekCentral::CSeekCentral(){ 00027 m_vecstrGenes.clear(); 00028 m_vecstrDatasets.clear(); 00029 m_vecstrSearchDatasets.clear(); 00030 m_mapstrstrDatasetPlatform.clear(); 00031 m_vc.clear(); 00032 m_quant.clear(); 00033 m_vecstrAllQuery.clear(); 00034 m_vp.clear(); 00035 m_mapstriPlatform.clear(); 00036 m_vecstrPlatform.clear(); 00037 m_vecstrDP.clear(); 00038 m_mapstrintDataset.clear(); 00039 m_mapstrintGene.clear(); 00040 m_searchdsetMap.clear(); 00041 m_vecDB.clear(); 00042 m_vecDBDataset.clear(); 00043 m_rData = NULL; 00044 m_maxNumDB = 50; 00045 00046 m_master_rank_threads = NULL; 00047 m_sum_weight_threads = NULL; 00048 m_sum_sq_weight_threads = NULL; //sum squared weight 00049 m_counts_threads = NULL; 00050 m_rank_normal_threads = NULL; 00051 m_rank_threads = NULL; 00052 m_rank_d = NULL; 00053 00054 m_master_rank.clear(); 00055 m_sum_weight.clear(); 00056 m_sum_sq_weight.clear(); 00057 m_weight.clear(); 00058 m_counts.clear(); 00059 m_mapLoadTime.clear(); 00060 00061 m_Query.clear(); 00062 m_final.clear(); 00063 00064 m_iDatasets = 0; 00065 m_iGenes = 0; 00066 m_numThreads = 0; 00067 m_bSubtractGeneAvg = false; 00068 m_bNormPlatform = false; 00069 m_bLogit = false; 00070 m_eDistMeasure = CSeekDataset::Z_SCORE; 00071 m_bOutputWeightComponent = false; 00072 m_bSimulateWeight = false; 00073 m_bOutputText = false; 00074 m_bSquareZ = false; 00075 m_bSharedDB = false; 00076 00077 DEBUG = false; 00078 m_output_dir = ""; 00079 00080 m_iClient = -1; 00081 m_bEnableNetwork = false; 00082 } 00083 00084 CSeekCentral::~CSeekCentral(){ 00085 m_vecstrGenes.clear(); 00086 m_vecstrDatasets.clear(); 00087 m_vecstrSearchDatasets.clear(); 00088 m_mapstrstrDatasetPlatform.clear(); 00089 m_vecstrDP.clear(); 00090 m_mapstrintDataset.clear(); 00091 m_mapstrintGene.clear(); 00092 00093 utype i, j; 00094 for(i=0; i<m_vc.size(); i++){ 00095 if(m_vc[i]==NULL) continue; 00096 delete m_vc[i]; 00097 } 00098 m_vc.clear(); 00099 00100 m_quant.clear(); 00101 00102 for(i=0; i<m_searchdsetMap.size(); i++){ 00103 if(m_searchdsetMap[i]==NULL) continue; 00104 delete m_searchdsetMap[i]; 00105 m_searchdsetMap[i] = NULL; 00106 } 00107 m_searchdsetMap.clear(); 00108 00109 //m_rData, m_master_rank_threads, 00110 //m_sum_weight_threads, m_counts_threads 00111 //m_rank_normal_threads, and m_rank_threads 00112 //should be already freed 00113 if(m_master_rank_threads!=NULL){ 00114 CSeekTools::Free2DArray(m_master_rank_threads); 00115 m_master_rank_threads = NULL; 00116 } 00117 if(m_sum_weight_threads != NULL){ 00118 CSeekTools::Free2DArray(m_sum_weight_threads); 00119 m_sum_weight_threads = NULL; 00120 } 00121 if(m_sum_sq_weight_threads != NULL){ 00122 CSeekTools::Free2DArray(m_sum_sq_weight_threads); 00123 m_sum_sq_weight_threads = NULL; 00124 } 00125 if(m_counts_threads !=NULL){ 00126 CSeekTools::Free2DArray(m_counts_threads); 00127 m_counts_threads = NULL; 00128 } 00129 if(m_rank_normal_threads!=NULL){ 00130 for(j=0; j<m_numThreads; j++) 00131 m_rank_normal_threads[j].clear(); 00132 delete[] m_rank_normal_threads; 00133 m_rank_normal_threads = NULL; 00134 } 00135 if(m_rank_threads !=NULL){ 00136 for(j=0; j<m_numThreads; j++) 00137 m_rank_threads[j].clear(); 00138 delete[] m_rank_threads; 00139 m_rank_threads = NULL; 00140 } 00141 if(m_rank_d!=NULL){ 00142 CSeekTools::Free2DArray(m_rank_d); 00143 m_rank_d = NULL; 00144 } 00145 00146 m_master_rank.clear(); 00147 m_sum_weight.clear(); 00148 m_sum_sq_weight.clear(); 00149 m_counts.clear(); 00150 m_weight.clear(); 00151 m_final.clear(); 00152 m_vecDBDataset.clear(); 00153 00154 m_vecstrAllQuery.clear(); 00155 m_Query.clear(); 00156 00157 m_vp.clear(); 00158 00159 m_mapstriPlatform.clear(); 00160 m_vecstrPlatform.clear(); 00161 00162 if(m_vecDB.size()!=0){ 00163 if(!m_bSharedDB){ 00164 for(i=0; i<m_vecDB.size(); i++) 00165 delete m_vecDB[i]; 00166 } 00167 for(i=0; i<m_vecDB.size(); i++) 00168 m_vecDB[i] = NULL; 00169 m_vecDB.clear(); 00170 } 00171 00172 /*if(m_DB!=NULL){ 00173 if(!m_bSharedDB){ 00174 delete m_DB; 00175 } 00176 m_DB = NULL; 00177 }*/ 00178 m_iDatasets = 0; 00179 m_iGenes = 0; 00180 m_numThreads = 0; 00181 m_mapLoadTime.clear(); 00182 m_output_dir = ""; 00183 DEBUG = false; 00184 m_bSharedDB = false; 00185 } 00186 00187 bool CSeekCentral::CalculateRestart(){ 00188 set<string> ss; 00189 m_mapLoadTime.clear(); 00190 utype i, prev; 00191 prev = 0; 00192 for(i=0; i<m_vecstrAllQuery.size(); i++){ 00193 if(m_vecstrAllQuery[i].size()>m_maxNumDB){ 00194 fprintf(stderr, "Cannot fit query %d in buffer\n", i); 00195 return false; 00196 } 00197 } 00198 00199 vector<vector<string> > *vv = NULL; 00200 m_mapLoadTime[prev] = vector< vector<string> >(); 00201 for(i=0; i<m_vecstrAllQuery.size(); i++){ 00202 ss.insert(m_vecstrAllQuery[i].begin(), m_vecstrAllQuery[i].end()); 00203 vv = &(m_mapLoadTime[prev]); 00204 vv->push_back(m_vecstrAllQuery[i]); 00205 if(ss.size()>m_maxNumDB){ 00206 vv->pop_back(); 00207 ss.clear(); 00208 ss.insert(m_vecstrAllQuery[i].begin(), m_vecstrAllQuery[i].end()); 00209 prev = i; 00210 m_mapLoadTime[prev] = vector< vector<string> >(); 00211 vv = &(m_mapLoadTime[prev]); 00212 vv->push_back(m_vecstrAllQuery[i]); 00213 } 00214 } 00215 ss.clear(); 00216 00217 //check 00218 vector<char> vc; 00219 utype tot = 0; 00220 CSeekTools::InitVector(vc, m_vecstrAllQuery.size(), (char) 0); 00221 map<utype, vector< vector<string> > >::const_iterator ci = 00222 m_mapLoadTime.begin(); 00223 for(; ci!=m_mapLoadTime.end(); ci++) tot+=(ci->second).size(); 00224 vc.clear(); 00225 if(tot!=m_vecstrAllQuery.size()){ 00226 fprintf(stderr, "ERROR, size do not match\n"); 00227 return false; 00228 } 00229 return true; 00230 } 00231 00232 //for SeekServer 00233 //assume DB has been read (with gvar, sinfo information) 00234 //assume datasets and genes have been read 00235 //assume m_enableNetwork is on 00236 //* CDatabaselet collection is shared between multiple clients (m_bSharedDB) 00237 bool CSeekCentral::Initialize( 00238 const string &output_dir, const string &query, 00239 const string &search_dset, CSeekCentral *src, const int iClient, 00240 const float query_min_required, const float genome_min_required, 00241 const enum CSeekDataset::DistanceMeasure eDistMeasure, 00242 const bool bSubtractGeneAvg, const bool bNormPlatform){ 00243 00244 fprintf(stderr, "Request received from client\n"); 00245 00246 m_output_dir = output_dir; //LATER, TO BE DELETED 00247 m_maxNumDB = src->m_maxNumDB; 00248 m_bSharedDB = true; 00249 m_numThreads = src->m_numThreads; 00250 m_fScoreCutOff = src->m_fScoreCutOff; 00251 m_fPercentQueryAfterScoreCutOff = query_min_required; 00252 m_fPercentGenomeRequired = genome_min_required; 00253 m_bSquareZ = src->m_bSquareZ; 00254 m_bOutputText = src->m_bOutputText; 00255 m_bSubtractGeneAvg = bSubtractGeneAvg; 00256 m_bNormPlatform = bNormPlatform; 00257 m_bLogit = src->m_bLogit; 00258 m_eDistMeasure = eDistMeasure; 00259 00260 m_bOutputWeightComponent = src->m_bOutputWeightComponent; 00261 m_bSimulateWeight = src->m_bSimulateWeight; 00262 00263 m_bRandom = false; 00264 m_iNumRandom = 1; 00265 m_randRandom = NULL; 00266 00267 m_vecstrGenes.resize(src->m_vecstrGenes.size()); 00268 copy(src->m_vecstrGenes.begin(), src->m_vecstrGenes.end(), m_vecstrGenes.begin()); 00269 00270 m_vecstrDatasets.resize(src->m_vecstrDatasets.size()); 00271 copy(src->m_vecstrDatasets.begin(), src->m_vecstrDatasets.end(), m_vecstrDatasets.begin()); 00272 00273 m_mapstrintDataset.insert(src->m_mapstrintDataset.begin(), 00274 src->m_mapstrintDataset.end()); 00275 00276 m_mapstrintGene.insert(src->m_mapstrintGene.begin(), src->m_mapstrintGene.end()); 00277 00278 m_mapstrstrDatasetPlatform.insert(src->m_mapstrstrDatasetPlatform.begin(), 00279 src->m_mapstrstrDatasetPlatform.end()); 00280 m_mapstriPlatform.insert(src->m_mapstriPlatform.begin(), src->m_mapstriPlatform.end()); 00281 00282 m_vecstrPlatform.resize(src->m_vecstrPlatform.size()); 00283 copy(src->m_vecstrPlatform.begin(), src->m_vecstrPlatform.end(), m_vecstrPlatform.begin()); 00284 00285 m_vecstrDP.resize(src->m_vecstrDP.size()); 00286 copy(src->m_vecstrDP.begin(), src->m_vecstrDP.end(), m_vecstrDP.begin()); 00287 00288 m_quant = src->m_quant; 00289 utype i, j; 00290 omp_set_num_threads(m_numThreads); 00291 00292 m_iDatasets = m_vecstrDatasets.size(); 00293 m_iGenes = m_vecstrGenes.size(); 00294 00295 //read search datasets 00296 vector<string> sd; 00297 CMeta::Tokenize(search_dset.c_str(), sd, "|", false); 00298 m_vecstrSearchDatasets.resize(sd.size()); 00299 for(i=0; i<sd.size(); i++){ 00300 CMeta::Tokenize(sd[i].c_str(), m_vecstrSearchDatasets[i], " ", false); 00301 } 00302 //read queries 00303 vector<string> sq; 00304 CMeta::Tokenize(query.c_str(), sq, "|", false); 00305 m_vecstrAllQuery.resize(sq.size()); 00306 for(i=0; i<sq.size(); i++){ 00307 CMeta::Tokenize(sq[i].c_str(), m_vecstrAllQuery[i], " ", false); 00308 } 00309 00310 m_searchdsetMap.resize(m_vecstrAllQuery.size()); 00311 for(i=0; i<m_vecstrAllQuery.size(); i++){ 00312 m_searchdsetMap[i] = new CSeekIntIntMap(m_vecstrDatasets.size()); 00313 for(j=0; j<m_vecstrSearchDatasets[i].size(); j++) 00314 m_searchdsetMap[i]->Add( 00315 m_mapstrintDataset[m_vecstrSearchDatasets[i][j]]); 00316 } 00317 00318 m_vecDB.resize(src->m_vecDB.size()); 00319 for(i=0; i<m_vecDB.size(); i++) 00320 m_vecDB[i] = src->m_vecDB[i]; 00321 //m_DB = src->m_DB; //shared DB 00322 00323 m_vecDBDataset.resize(src->m_vecDB.size()); 00324 for(i=0; i<m_vecDB.size(); i++){ 00325 m_vecDBDataset[i].resize(src->m_vecDBDataset[i].size()); 00326 copy(src->m_vecDBDataset[i].begin(), src->m_vecDBDataset[i].end(), 00327 m_vecDBDataset[i].begin()); 00328 } 00329 00330 CSeekTools::LoadDatabase(m_vecDB, m_iGenes, m_iDatasets, 00331 m_vc, src->m_vc, m_vp, src->m_vp, m_vecstrDatasets, 00332 m_mapstrstrDatasetPlatform, m_mapstriPlatform); 00333 00334 if(!CalculateRestart()){ 00335 fprintf(stderr, "Error occurred during CalculateRestart()\n"); 00336 return false; 00337 } 00338 00339 //fprintf(stderr, "Finished CalculateRestart()\n"); 00340 00341 if(!EnableNetwork(iClient)){ 00342 fprintf(stderr, "Error occurred during EnableNetwork()\n"); 00343 return false; 00344 } 00345 00346 //fprintf(stderr, "Finished EnableNetworks()\n"); 00347 00348 if(!CheckDatasets(true)){ //replace parameter is true 00349 fprintf(stderr, "Error occurred during CheckDatasets()\n"); 00350 return false; 00351 } 00352 00353 //fprintf(stderr, "Finished CheckDatasets()\n"); 00354 return true; 00355 } 00356 00357 //network mode, meant to be run after Initialize() 00358 bool CSeekCentral::EnableNetwork(const int &iClient){ 00359 m_bEnableNetwork = true; 00360 m_iClient = iClient; //assume client connection is already open 00361 return true; 00362 } 00363 00364 //optional step 00365 //Checks how many datasets contain the query 00366 //requires the queries and searchdatasets to be loaded! 00367 bool CSeekCentral::CheckDatasets(const bool &replace){ 00368 utype dd, j; 00369 utype l; 00370 00371 stringstream ss; //search dataset (new!) 00372 stringstream sq; //query availability 00373 stringstream aq; //query (new!) 00374 00375 int maxGCoverage = GetMaxGenomeCoverage(); 00376 00377 for(l=0; l<m_searchdsetMap.size(); l++){ 00378 utype iUserDatasets = m_searchdsetMap[l]->GetNumSet(); 00379 const vector<utype> &allRDatasets = m_searchdsetMap[l]->GetAllReverse(); 00380 vector<int> count; 00381 CSeekTools::InitVector(count, m_vecstrAllQuery[l].size(), (int) 0); 00382 bool isFirst = true; 00383 //fprintf(stderr, "iUserDatasets %d\n", iUserDatasets); 00384 00385 for(dd=0; dd<iUserDatasets; dd++){ 00386 utype i = allRDatasets[dd]; 00387 CSeekIntIntMap *si = m_vc[i]->GetGeneMap(); 00388 utype present = 0; 00389 for(j=0, present=0; j<m_vecstrAllQuery[l].size(); j++){ 00390 if(m_mapstrintGene.find(m_vecstrAllQuery[l][j])== 00391 m_mapstrintGene.end()) continue; 00392 if(CSeekTools::IsNaN(si->GetForward( 00393 m_mapstrintGene[m_vecstrAllQuery[l][j]]))) continue; 00394 count[j]++; 00395 present++; 00396 } 00397 00398 //datasets that contains all query genes (very stringent) 00399 //if(present==m_vecstrAllQuery[l].size()){ 00400 00401 int minRequired = 0; 00402 if(m_vecstrAllQuery[l].size()>=5){ 00403 minRequired = (int) (m_fPercentQueryAfterScoreCutOff * 00404 m_vecstrAllQuery[l].size()); 00405 }else if(m_vecstrAllQuery[l].size()<=2){ 00406 minRequired = m_vecstrAllQuery[l].size(); 00407 }else{ 00408 minRequired = 2; 00409 } 00410 00411 int minGRequired = (int)(m_fPercentGenomeRequired* 00412 (float) maxGCoverage); 00413 00414 //datasets containing some query genes (relaxed) [ 0 ] 00415 if(present>0 && present>=minRequired && 00416 si->GetNumSet()>=minGRequired){ 00417 if(isFirst){ 00418 isFirst = false; 00419 ss << m_vecstrDatasets[i]; 00420 }else{ 00421 ss << " " << m_vecstrDatasets[i]; 00422 } 00423 } 00424 } 00425 00426 if(isFirst){ 00427 string err = "Error: no dataset contains any of the query genes"; 00428 fprintf(stderr, "%s\n", err.c_str()); 00429 if(m_bEnableNetwork) 00430 CSeekNetwork::Send(m_iClient, err); 00431 return false; 00432 } 00433 00434 if(l!=m_searchdsetMap.size()-1){ 00435 ss << "|"; 00436 } 00437 00438 //fprintf(stderr, "ss %s\n", ss.str().c_str()); 00439 isFirst = true; 00440 for(j=0; j<m_vecstrAllQuery[l].size(); j++){ 00441 sq << m_vecstrAllQuery[l][j] << ":" << count[j]; 00442 if(j!=m_vecstrAllQuery[l].size()-1){ 00443 sq << ";"; 00444 } 00445 if(count[j]==0) continue; 00446 if(isFirst){ 00447 isFirst = false; 00448 aq << m_vecstrAllQuery[l][j]; 00449 }else{ 00450 aq << " " << m_vecstrAllQuery[l][j]; 00451 } 00452 } 00453 00454 if(isFirst){ 00455 string err = "Error: no dataset contains any of the query genes"; 00456 fprintf(stderr, "%s\n", err.c_str()); 00457 if(m_bEnableNetwork) 00458 CSeekNetwork::Send(m_iClient, err); 00459 return false; 00460 } 00461 00462 if(l!=m_searchdsetMap.size()-1){ 00463 aq << "|"; 00464 sq << "|"; 00465 } 00466 00467 //fprintf(stderr, "sq %s\n", sq.str().c_str()); 00468 //fprintf(stderr, "aq %s\n", aq.str().c_str()); 00469 } 00470 00471 string refinedQuery = aq.str(); 00472 string refinedSearchDataset = ss.str(); 00473 string refinedGeneCount = sq.str(); 00474 if(m_bEnableNetwork){ 00475 CSeekNetwork::Send(m_iClient, refinedSearchDataset); 00476 CSeekNetwork::Send(m_iClient, refinedGeneCount); 00477 } 00478 00479 if(replace){ 00480 vector<string> qq; 00481 utype i; 00482 CMeta::Tokenize(refinedQuery.c_str(), qq, "|", true); 00483 m_vecstrAllQuery.resize(qq.size()); 00484 for(i=0; i<qq.size(); i++){ 00485 m_vecstrAllQuery[i].clear(); 00486 CMeta::Tokenize(qq[i].c_str(), m_vecstrAllQuery[i], " ", true); 00487 } 00488 00489 //Change the search datasets 00490 vector<string> sd; 00491 CMeta::Tokenize(refinedSearchDataset.c_str(), sd, "|", false); 00492 m_vecstrSearchDatasets.resize(sd.size()); 00493 for(i=0; i<sd.size(); i++){ 00494 m_vecstrSearchDatasets[i].clear(); 00495 CMeta::Tokenize(sd[i].c_str(), m_vecstrSearchDatasets[i], " ", false); 00496 } 00497 00498 //fprintf(stderr, "replace datasets %d %d\n", sd.size(), m_vecstrSearchDatasets[0].size()); 00499 //fprintf(stderr, "searchdsetMap size %d\n", m_searchdsetMap.size()); 00500 00501 for(i=0; i<m_searchdsetMap.size(); i++){ 00502 delete m_searchdsetMap[i]; 00503 } 00504 m_searchdsetMap.clear(); 00505 //fprintf(stderr, "cleared searchdsetMap\n"); 00506 00507 m_searchdsetMap.resize(m_vecstrAllQuery.size()); 00508 for(i=0; i<m_vecstrAllQuery.size(); i++){ 00509 m_searchdsetMap[i] = new CSeekIntIntMap(m_vecstrDatasets.size()); 00510 for(j=0; j<m_vecstrSearchDatasets[i].size(); j++) 00511 m_searchdsetMap[i]->Add( 00512 m_mapstrintDataset[m_vecstrSearchDatasets[i][j]]); 00513 } 00514 00515 } 00516 00517 return true; 00518 } 00519 00520 //load everything except query, search datasets, output directory 00521 bool CSeekCentral::Initialize(const vector<CSeekDBSetting*> &vecDBSetting, 00522 const utype buffer, const bool to_output_text, 00523 const bool bOutputWeightComponent, const bool bSimulateWeight, 00524 const enum CSeekDataset::DistanceMeasure dist_measure, 00525 const bool bSubtractAvg, const bool bNormPlatform, 00526 const bool bLogit, const float fCutOff, const float fPercentQueryRequired, 00527 const float fPercentGenomeRequired, 00528 const bool bSquareZ, const bool bRandom, const int iNumRandom, 00529 gsl_rng *rand, const bool useNibble, const int numThreads){ 00530 00531 m_maxNumDB = buffer; 00532 m_numThreads = numThreads; //changed from 8 00533 00534 m_fScoreCutOff = fCutOff; 00535 m_fPercentQueryAfterScoreCutOff = fPercentQueryRequired; 00536 m_fPercentGenomeRequired = fPercentGenomeRequired; 00537 m_bSquareZ = bSquareZ; 00538 00539 m_bOutputWeightComponent = bOutputWeightComponent; 00540 m_bSimulateWeight = bSimulateWeight; 00541 00542 //random retrieval========================== 00543 m_bRandom = bRandom; 00544 m_iNumRandom = iNumRandom; 00545 m_randRandom = rand; //random-case only 00546 //========================================== 00547 00548 if(!m_bRandom){ 00549 m_randRandom = NULL; 00550 } 00551 00552 utype i, j; 00553 00554 omp_set_num_threads(m_numThreads); 00555 00556 m_bOutputText = to_output_text; 00557 m_bSubtractGeneAvg = bSubtractAvg; 00558 m_bNormPlatform = bNormPlatform; 00559 m_bLogit = bLogit; 00560 m_eDistMeasure = dist_measure; 00561 00562 bool bCorrelation = false; 00563 00564 if(dist_measure==CSeekDataset::CORRELATION){ 00565 bCorrelation = true; 00566 if(m_bSubtractGeneAvg || m_bNormPlatform || m_bLogit){ 00567 fprintf(stderr, 00568 "Warning: setting subtract_avg, norm platform to false\n"); 00569 m_bSubtractGeneAvg = false; 00570 m_bNormPlatform = false; 00571 m_bLogit = false; 00572 } 00573 } 00574 00575 //read genes 00576 vector<string> vecstrGeneID; 00577 if(!CSeekTools::ReadListTwoColumns(vecDBSetting[0]->GetValue("gene"), 00578 vecstrGeneID, m_vecstrGenes)) 00579 return false; 00580 for(i=0; i<m_vecstrGenes.size(); i++) 00581 m_mapstrintGene[m_vecstrGenes[i]] = i; 00582 00583 //read quant file 00584 CSeekTools::ReadQuantFile(vecDBSetting[0]->GetValue("quant"), m_quant); 00585 00586 m_vecstrDatasets.clear(); 00587 m_vecstrDP.clear(); 00588 m_mapstriPlatform.clear(); 00589 m_mapstrstrDatasetPlatform.clear(); 00590 m_mapstrintDataset.clear(); 00591 m_vp.clear(); 00592 00593 m_vecDB.resize(vecDBSetting.size()); 00594 m_vecDBDataset.resize(vecDBSetting.size()); 00595 for(i=0; i<vecDBSetting.size(); i++) 00596 m_vecDB[i] = NULL; 00597 00598 for(i=0; i<vecDBSetting.size(); i++){ 00599 if(dist_measure==CSeekDataset::CORRELATION && 00600 vecDBSetting[i]->GetValue("sinfo")=="NA"){ 00601 fprintf(stderr, "Error: not specifying sinfo!\n"); 00602 return false; 00603 } 00604 00605 m_vecDB[i] = new CDatabase(useNibble); 00606 //read datasets 00607 vector<string> vD, vDP; 00608 if(!CSeekTools::ReadListTwoColumns(vecDBSetting[i]->GetValue("dset"), vD, vDP)) 00609 return false; 00610 00611 for(j=0; j<vD.size(); j++){ 00612 m_vecstrDatasets.push_back(vD[j]); 00613 m_vecDBDataset[i].push_back(vD[j]); 00614 m_vecstrDP.push_back(vDP[j]); 00615 } 00616 00617 vector<string> vecstrPlatforms; 00618 map<string,utype> mapstriPlatform; 00619 vector<CSeekPlatform> vp; 00620 CSeekTools::ReadPlatforms(vecDBSetting[i]->GetValue("platform"), vp, 00621 vecstrPlatforms, mapstriPlatform); 00622 00623 int cur = m_vp.size(); 00624 for(map<string,utype>::iterator it=mapstriPlatform.begin(); 00625 it!=mapstriPlatform.end(); it++){ 00626 m_mapstriPlatform[it->first] = it->second + cur; 00627 } 00628 00629 m_vp.resize(cur+vp.size()); 00630 for(j=0; j<vp.size(); j++) 00631 m_vp[cur+j].Copy(vp[j]); 00632 } 00633 00634 for(i=0; i<m_vecstrDatasets.size(); i++){ 00635 m_mapstrstrDatasetPlatform[m_vecstrDatasets[i]] = m_vecstrDP[i]; 00636 m_mapstrintDataset[m_vecstrDatasets[i]] = i; 00637 } 00638 00639 m_iDatasets = m_vecstrDatasets.size(); 00640 m_iGenes = m_vecstrGenes.size(); 00641 00642 for(i=0; i<vecDBSetting.size(); i++){ 00643 m_vecDB[i]->Open(vecDBSetting[i]->GetValue("db"), 00644 m_vecstrGenes, m_vecDBDataset[i].size(), vecDBSetting[i]->GetNumDB()); 00645 } 00646 00647 CSeekTools::LoadDatabase(m_vecDB, m_iGenes, m_iDatasets, 00648 vecDBSetting, m_vecstrDatasets, m_mapstrstrDatasetPlatform, 00649 m_mapstriPlatform, m_vp, m_vc, m_vecDBDataset, m_mapstrintDataset, 00650 false, bCorrelation); 00651 00652 return true; 00653 } 00654 00655 bool CSeekCentral::Initialize( 00656 const vector<CSeekDBSetting*> &vecDBSetting, 00657 const char *search_dset, const char *query, 00658 const char *output_dir, const utype buffer, const bool to_output_text, 00659 const bool bOutputWeightComponent, const bool bSimulateWeight, 00660 const enum CSeekDataset::DistanceMeasure dist_measure, 00661 const bool bSubtractAvg, const bool bNormPlatform, 00662 const bool bLogit, const float fCutOff, 00663 const float fPercentQueryRequired, const float fPercentGenomeRequired, 00664 const bool bSquareZ, const bool bRandom, const int iNumRandom, 00665 gsl_rng *rand, const bool useNibble, const int numThreads){ 00666 00667 if(!CSeekCentral::Initialize(vecDBSetting, buffer, to_output_text, 00668 bOutputWeightComponent, bSimulateWeight, dist_measure, 00669 bSubtractAvg, bNormPlatform, bLogit, fCutOff, 00670 fPercentQueryRequired, fPercentGenomeRequired, 00671 bSquareZ, bRandom, iNumRandom, rand, useNibble, numThreads)){ 00672 return false; 00673 } 00674 00675 utype i, j; 00676 omp_set_num_threads(m_numThreads); 00677 m_output_dir = output_dir; 00678 00679 //read queries 00680 if(!CSeekTools::ReadMultipleQueries(query, m_vecstrAllQuery)) 00681 return false; 00682 00683 //read search datasets 00684 string strSearchDset = search_dset; 00685 if(strSearchDset=="NA"){ 00686 m_vecstrSearchDatasets.resize(m_vecstrAllQuery.size()); 00687 for(i=0; i<m_vecstrAllQuery.size(); i++){ 00688 m_vecstrSearchDatasets[i].resize(m_vecstrDatasets.size()); 00689 for(j=0; j<m_vecstrDatasets.size(); j++){ 00690 m_vecstrSearchDatasets[i][j] = m_vecstrDatasets[j]; 00691 } 00692 } 00693 }else{ 00694 if(!CSeekTools::ReadMultipleQueries(search_dset, m_vecstrSearchDatasets)){ 00695 fprintf(stderr, "Error reading search datasets\n"); 00696 return false; 00697 } 00698 if(m_vecstrSearchDatasets.size()!=m_vecstrAllQuery.size()){ 00699 fprintf(stderr, "Search_dset file doesn't have enough lines. Remember 1 line / query!\n"); 00700 return false; 00701 } 00702 } 00703 00704 m_searchdsetMap.resize(m_vecstrAllQuery.size()); 00705 for(i=0; i<m_vecstrAllQuery.size(); i++){ 00706 m_searchdsetMap[i] = new CSeekIntIntMap(m_vecstrDatasets.size()); 00707 for(j=0; j<m_vecstrSearchDatasets[i].size(); j++){ 00708 m_searchdsetMap[i]->Add( 00709 m_mapstrintDataset[m_vecstrSearchDatasets[i][j]]); 00710 } 00711 } 00712 00713 if(!CalculateRestart()){ 00714 fprintf(stderr, "Error occurred during CalculateRestart()\n"); 00715 return false; 00716 } 00717 //fprintf(stderr, "Finished CalculateRestart()\n"); 00718 00719 return true; 00720 } 00721 00722 bool CSeekCentral::PrepareQuery(const vector<string> &vecstrQuery, 00723 CSeekQuery &query){ 00724 vector<utype> queryGenes; 00725 utype j; 00726 for(j=0; j<vecstrQuery.size(); j++){ 00727 if(m_mapstrintGene.find(vecstrQuery[j])== 00728 m_mapstrintGene.end()) continue; 00729 //size_t m = m_DB->GetGene(vecstrQuery[j]); 00730 //if(m==-1) continue; 00731 //queryGenes.push_back(m); 00732 queryGenes.push_back(m_mapstrintGene[vecstrQuery[j]]); 00733 } 00734 queryGenes.resize(queryGenes.size()); 00735 query.InitializeQuery(queryGenes, m_iGenes); 00736 00737 return true; 00738 } 00739 00740 bool CSeekCentral::PrepareOneQuery(CSeekQuery &query, 00741 CSeekIntIntMap &dMap, vector<float> &weight){ 00742 00743 assert(m_master_rank_threads==NULL && m_counts_threads==NULL && 00744 m_sum_weight_threads==NULL && m_sum_sq_weight_threads==NULL); 00745 assert(m_rank_normal_threads==NULL && m_rank_threads==NULL); 00746 assert(m_rData==NULL); 00747 00748 utype j; 00749 const vector<utype> &queryGenes = query.GetQuery(); 00750 const vector<utype> &allRDatasets = dMap.GetAllReverse(); 00751 utype iSearchDatasets = dMap.GetNumSet(); 00752 utype iQuery = queryGenes.size(); 00753 00754 for(j=0; j<iSearchDatasets; j++) 00755 m_vc[allRDatasets[j]]->InitializeQuery(queryGenes); 00756 00757 m_rData = new utype**[m_numThreads]; 00758 for(j=0; j<m_numThreads; j++) 00759 m_rData[j] = CSeekTools::Init2DArray(m_iGenes, iQuery, (utype)0); 00760 00761 m_master_rank_threads = 00762 CSeekTools::Init2DArray(m_numThreads, m_iGenes, (float)0); 00763 m_sum_weight_threads = 00764 CSeekTools::Init2DArray(m_numThreads, m_iGenes, (float)0); 00765 m_sum_sq_weight_threads = 00766 CSeekTools::Init2DArray(m_numThreads, m_iGenes, (float)0); 00767 m_counts_threads = 00768 CSeekTools::Init2DArray(m_numThreads, m_iGenes, (utype)0); 00769 00770 m_rank_normal_threads = new vector<utype>[m_numThreads]; 00771 m_rank_threads = new vector<utype>[m_numThreads]; 00772 00773 for(j=0; j<m_numThreads; j++){ 00774 m_rank_normal_threads[j].resize(m_iGenes); 00775 m_rank_threads[j].resize(m_iGenes); 00776 //CSeekTools::InitVector(m_rank_normal_threads[j], m_iGenes, (utype) 255); 00777 //CSeekTools::InitVector(m_rank_threads[j], m_iGenes, (utype) 255); 00778 } 00779 00780 CSeekTools::InitVector(m_master_rank, m_iGenes, (float) 0); 00781 CSeekTools::InitVector(m_sum_weight, m_iGenes, (float) 0); 00782 CSeekTools::InitVector(m_sum_sq_weight, m_iGenes, (float) 0); 00783 CSeekTools::InitVector(m_counts, m_iGenes, (utype) 0); 00784 CSeekTools::InitVector(weight, m_iDatasets, (float)0); 00785 00786 return true; 00787 } 00788 00789 bool CSeekCentral::AggregateThreads(){ 00790 assert(m_master_rank_threads!=NULL && m_counts_threads!=NULL && 00791 m_sum_sq_weight_threads!=NULL && m_sum_weight_threads!=NULL); 00792 assert(m_rank_normal_threads!=NULL && m_rank_threads!=NULL); 00793 00794 //Aggregate into three vectors: m_master_rank, m_counts, m_sum_weight, m_sum_sq_weight 00795 utype j, k; 00796 for(j=0; j<m_numThreads; j++){ 00797 for(k=0; k<m_iGenes; k++){ 00798 m_master_rank[k] += m_master_rank_threads[j][k]; 00799 m_counts[k] += m_counts_threads[j][k]; 00800 m_sum_weight[k]+=m_sum_weight_threads[j][k]; 00801 m_sum_sq_weight[k]+=m_sum_sq_weight_threads[j][k]; 00802 } 00803 } 00804 00805 CSeekTools::Free2DArray(m_master_rank_threads); 00806 CSeekTools::Free2DArray(m_counts_threads); 00807 CSeekTools::Free2DArray(m_sum_weight_threads); 00808 CSeekTools::Free2DArray(m_sum_sq_weight_threads); 00809 m_master_rank_threads=NULL; 00810 m_counts_threads=NULL; 00811 m_sum_weight_threads = NULL; 00812 m_sum_sq_weight_threads = NULL; 00813 00814 for(j=0; j<m_numThreads; j++){ 00815 m_rank_normal_threads[j].clear(); 00816 m_rank_threads[j].clear(); 00817 } 00818 00819 delete[] m_rank_normal_threads; 00820 delete[] m_rank_threads; 00821 m_rank_normal_threads = NULL; 00822 m_rank_threads = NULL; 00823 00824 return true; 00825 } 00826 00827 bool CSeekCentral::FilterResults(const utype &iSearchDatasets){ 00828 utype j, k; 00829 bool DEBUG = false; 00830 if(DEBUG) fprintf(stderr, "Aggregating genes\n"); 00831 00832 for(j=0; j<m_iGenes; j++){ 00833 if(m_counts[j]<(int)(0.5*iSearchDatasets)) 00834 m_master_rank[j] = -320; 00835 else if(m_sum_weight[j]==0) 00836 m_master_rank[j] = -320; 00837 else{ 00838 m_master_rank[j] = 00839 //(m_master_rank[j] / m_sum_weight[j] - 320) / 100.0; 00840 (m_master_rank[j] - 320 * m_sum_weight[j]) / 100.0 / m_sum_weight[j]; 00841 //(m_master_rank[j] - 320 * m_sum_weight[j]) / 100.0 / sqrt(m_sum_sq_weight[j]); 00842 if(m_eDistMeasure==CSeekDataset::CORRELATION){ 00843 m_master_rank[j] = m_master_rank[j] / 3.0; 00844 } 00845 //m_master_rank[j] = m_master_rank[j]; 00846 //m_master_rank[j] = (m_master_rank[j] / iSearchDatasets - 320) / 100.0; 00847 } 00848 if(DEBUG) fprintf(stderr, "Gene %d %.5f\n", j, m_master_rank[j]); 00849 } 00850 return true; 00851 } 00852 00853 bool CSeekCentral::Sort(vector<AResultFloat> &final){ 00854 if(DEBUG) fprintf(stderr, "Sorting genes\n"); 00855 final.resize(m_iGenes); 00856 utype j; 00857 for(j=0; j<m_iGenes; j++){ 00858 //fprintf(stderr, "%d %s\n", j, DB.GetGene((size_t) j).c_str()); 00859 final[j].i = j; 00860 final[j].f = m_master_rank[j]; 00861 } 00862 if(DEBUG) fprintf(stderr, "Begin Sorting genes\n"); 00863 sort(final.begin(), final.end()); 00864 return true; 00865 } 00866 00867 bool CSeekCentral::Display(CSeekQuery &query, vector<AResultFloat> &final){ 00868 if(DEBUG) fprintf(stderr, "Results:\n"); 00869 utype jj, ii; 00870 const vector<char> &cQuery = query.GetQueryPresence(); 00871 for(ii=0, jj=0; jj<500; ii++){ 00872 if(cQuery[final[ii].i]==1) continue; 00873 //fprintf(stderr, "%s %.5f\n", 00874 // m_DB->GetGene((size_t)final[ii].i).c_str(), final[ii].f); 00875 fprintf(stderr, "%s %.5f\n", 00876 m_vecstrGenes[(size_t)final[ii].i].c_str(), final[ii].f); 00877 jj++; 00878 } 00879 return true; 00880 } 00881 00882 bool CSeekCentral::Write(const utype &i){ 00883 //assume m_bRandom = false 00884 char acBuffer[1024]; 00885 sprintf(acBuffer, "%s/%d.query", m_output_dir.c_str(), i); 00886 CSeekTools::WriteArrayText(acBuffer, m_vecstrAllQuery[i]); 00887 00888 sprintf(acBuffer, "%s/%d.dweight", m_output_dir.c_str(), i); 00889 CSeekTools::WriteArray(acBuffer, m_weight[i]); 00890 00891 sprintf(acBuffer, "%s/%d.gscore", m_output_dir.c_str(), i); 00892 CSeekTools::WriteArray(acBuffer, m_master_rank); 00893 00894 //send data to client 00895 if(m_bEnableNetwork){ 00896 if(CSeekNetwork::Send(m_iClient, m_weight[i])==-1){ 00897 fprintf(stderr, "Error sending message to client\n"); 00898 return false; 00899 } 00900 if(CSeekNetwork::Send(m_iClient, m_master_rank)==-1){ 00901 fprintf(stderr, "Error sending message to client\n"); 00902 return false; 00903 } 00904 } 00905 00906 if(!m_bRandom && m_bOutputText){ 00907 const vector<utype> &allRDatasets = 00908 m_searchdsetMap[i]->GetAllReverse(); 00909 utype iSearchDatasets = m_searchdsetMap[i]->GetNumSet(); 00910 vector<vector<string> > vecOutput; 00911 vecOutput.resize(2); 00912 vecOutput[0] = vector<string>(); 00913 vecOutput[1] = vector<string>(); 00914 sprintf(acBuffer, "%s/%d.results.txt", m_output_dir.c_str(), i); 00915 vector<AResultFloat> w; 00916 w.resize(m_iDatasets); 00917 utype j; 00918 for(j=0; j<m_iDatasets; j++){ 00919 w[j].i = j; 00920 w[j].f = m_weight[i][j]; 00921 } 00922 sort(w.begin(), w.end()); 00923 for(j=0; j<200 && j<iSearchDatasets; j++){ 00924 if(w[j].f==0) break; 00925 vecOutput[0].push_back(m_vecstrDatasets[w[j].i]); 00926 } 00927 vector<AResultFloat> wd; 00928 Sort(wd); 00929 for(j=0; j<2000; j++){ 00930 if(wd[j].f==-320) break; 00931 vecOutput[1].push_back(m_vecstrGenes[wd[j].i]); 00932 } 00933 CSeekTools::Write2DArrayText(acBuffer, vecOutput); 00934 } 00935 return true; 00936 } 00937 00938 int CSeekCentral::GetMaxGenomeCoverage(){ 00939 utype d; 00940 int max = 0; 00941 for(d=0; d<m_vecstrDatasets.size(); d++){ 00942 CSeekIntIntMap *mapG = m_vc[d]->GetGeneMap(); 00943 if(mapG->GetNumSet()>max){ 00944 max = mapG->GetNumSet(); 00945 } 00946 } 00947 return max; 00948 } 00949 00950 00951 bool CSeekCentral::Common(CSeekCentral::SearchMode &sm, 00952 gsl_rng *rnd, const CSeekQuery::PartitionMode *PART_M, 00953 const utype *FOLD, const float *RATE, 00954 const vector< vector<float> > *providedWeight, 00955 const vector< vector<string> > *newGoldStd){ 00956 00957 utype i, j, d, dd; 00958 int k; //keeps track of genes (for random case) 00959 utype l; //keeps track of random repetition (for random case) 00960 char acBuffer[1024]; 00961 00962 m_Query.resize(m_vecstrAllQuery.size()); 00963 m_weight.resize(m_vecstrAllQuery.size()); 00964 m_final.resize(m_vecstrAllQuery.size()); 00965 00966 //random-ranking case ========================= 00967 vector<vector<float> > vecRandWeight, vecRandScore; 00968 vecRandWeight.resize(m_iNumRandom); 00969 vecRandScore.resize(m_iNumRandom); 00970 for(l=0; l<m_iNumRandom; l++){ 00971 CSeekTools::InitVector(vecRandWeight[l], m_iDatasets, (float) 0); 00972 CSeekTools::InitVector(vecRandScore[l], m_iGenes, (float) 0); 00973 } 00974 00975 //NEED TO MAKE THIS A PARAMETER 00976 bool simulateWeight = m_bSimulateWeight; 00977 00978 //output weight component (Mar 19) 00979 bool weightComponent = m_bOutputWeightComponent; 00980 00981 l = 0; 00982 //oct 20, 2012: whether to redo current query with equal weighting 00983 int redoWithEqual = 0; //tri-mode: 0, 1, 2 00984 CSeekCentral::SearchMode current_sm; 00985 CSeekQuery equalWeightGold; 00986 00987 //backup of scores (Feb 3) 00988 vector<float> backupScore; 00989 CSeekTools::InitVector(backupScore, m_iGenes, (float) 0); 00990 00991 //fprintf(stderr, "0 %lu\n", CMeta::GetMemoryUsage()); 00992 current_sm = sm; 00993 00994 int maxGCoverage = GetMaxGenomeCoverage(); 00995 00996 //fprintf(stderr, "Min gene required %.2f %d %d\n", m_fPercentGenomeRequired, 00997 // maxGCoverage, (int)(m_fPercentGenomeRequired*(float) maxGCoverage)); 00998 00999 for(i=0; i<m_vecstrAllQuery.size(); i++){ 01000 //simulated weight case ====================== 01001 /*if(simulateWeight && redoWithEqual>=1) //1 or 2 01002 current_sm = EQUAL; 01003 else //0 01004 current_sm = sm;*/ 01005 //============================================ 01006 01007 if(m_mapLoadTime.find(i)!=m_mapLoadTime.end()){ 01008 if(!m_bRandom || l==0){ //l==0: first random repetition 01009 CSeekTools::ReadDatabaselets(m_vecDB, m_iGenes, m_iDatasets, 01010 m_mapLoadTime[i], m_vc, m_mapstrintGene, m_vecDBDataset, 01011 m_mapstrintDataset, m_iClient, m_bEnableNetwork); 01012 } 01013 } 01014 01015 const vector<utype> &allRDatasets = 01016 m_searchdsetMap[i]->GetAllReverse(); 01017 utype iSearchDatasets = m_searchdsetMap[i]->GetNumSet(); 01018 01019 //fprintf(stderr, "1 %lu\n", CMeta::GetMemoryUsage()); 01020 01021 CSeekQuery &query = m_Query[i]; 01022 vector<float> &weight = m_weight[i]; 01023 vector<AResultFloat> &final = m_final[i]; 01024 01025 PrepareQuery(m_vecstrAllQuery[i], query); 01026 PrepareOneQuery(query, *(m_searchdsetMap[i]), weight); 01027 utype iQuery = query.GetQuery().size(); 01028 01029 //fprintf(stderr, "1b %lu\n", CMeta::GetMemoryUsage()); 01030 01031 //for CV_CUSTOM 01032 CSeekQuery customGoldStd; 01033 if(current_sm==CV_CUSTOM) 01034 PrepareQuery((*newGoldStd)[i], customGoldStd); 01035 01036 if(current_sm==CV || current_sm==CV_CUSTOM) 01037 query.CreateCVPartitions(rnd, *PART_M, *FOLD); 01038 01039 if(current_sm==ORDER_STATISTICS) 01040 m_rank_d = CSeekTools::Init2DArray(iSearchDatasets, m_iGenes, 01041 (utype) 0); 01042 01043 //For outputing component weights! 01044 vector<float> wc; 01045 if(weightComponent){ 01046 if(current_sm==CV || current_sm==CV_CUSTOM){ 01047 wc.resize((int)query.GetNumFold()*(int)m_iDatasets); 01048 }else{ 01049 wc.resize((int)query.GetQuery().size()*(int)m_iDatasets); 01050 } 01051 fill(wc.begin(), wc.end(), (float)0); 01052 } 01053 //fprintf(stderr, "2 %lu\n", CMeta::GetMemoryUsage()); 01054 01055 #pragma omp parallel for \ 01056 shared(customGoldStd) \ 01057 private(dd, d, j) \ 01058 firstprivate(i, iSearchDatasets, iQuery) \ 01059 schedule(dynamic) 01060 for(dd=0; dd<iSearchDatasets; dd++){ 01061 d = allRDatasets[dd]; 01062 utype tid = omp_get_thread_num(); 01063 if(DEBUG) fprintf(stderr, "Dataset %d, %s\n", 01064 d, m_vecstrDatasets[d].c_str()); 01065 01066 CSeekIntIntMap *mapG = m_vc[d]->GetGeneMap(); 01067 CSeekIntIntMap *mapQ = m_vc[d]->GetQueryMap(); 01068 01069 //if dataset contains less than required number of genes, skip 01070 if(mapG->GetNumSet()<(int)(m_fPercentGenomeRequired* 01071 (float) maxGCoverage)){ //10000 01072 continue; 01073 } 01074 01075 if(mapQ==NULL ||mapQ->GetNumSet()==0){ 01076 if(DEBUG) fprintf(stderr, "This dataset is skipped\n"); 01077 continue; 01078 } 01079 01080 vector<utype> this_q; 01081 for(j=0; j<mapQ->GetNumSet(); j++) 01082 this_q.push_back(mapQ->GetReverse(j)); 01083 01084 if(DEBUG) fprintf(stderr, "Initializing %d\n", 01085 (int) this_q.size()); 01086 m_vc[d]->InitializeDataMatrix(m_rData[tid], m_quant, m_iGenes, 01087 iQuery, m_bSubtractGeneAvg, m_bNormPlatform, m_bLogit, 01088 m_eDistMeasure, m_fScoreCutOff, m_bRandom, m_randRandom); 01089 01090 float w = -1; 01091 float report_w = -1; //for showing weight of dataset 01092 01093 if(current_sm==CV || current_sm==CV_CUSTOM){ 01094 if(DEBUG) fprintf(stderr, "Weighting dataset\n"); 01095 if(current_sm==CV) 01096 CSeekWeighter::CVWeighting(query, *m_vc[d], *RATE, 01097 m_fPercentQueryAfterScoreCutOff, m_bSquareZ, 01098 &m_rank_threads[tid]); 01099 else 01100 CSeekWeighter::CVWeighting(query, *m_vc[d], *RATE, 01101 m_fPercentQueryAfterScoreCutOff, m_bSquareZ, 01102 &m_rank_threads[tid], &customGoldStd); 01103 01104 if( (w = m_vc[d]->GetDatasetSumWeight())==-1){ 01105 if(DEBUG) fprintf(stderr, "Bad weight\n"); 01106 continue; 01107 } 01108 01109 if(weightComponent && current_sm==CV){ 01110 utype numFold = query.GetNumFold(); 01111 float ww; 01112 for(j=0; j<numFold; j++){ 01113 //if((ww=m_vc[d]->GetCVWeight(j))==-1) continue; 01114 wc[(int)d*(int)numFold+(int)j] = m_vc[d]->GetCVWeight(j); 01115 } 01116 } 01117 } 01118 else if(current_sm==AVERAGE_Z){ 01119 CSeekWeighter::AverageWeighting(query, *m_vc[d], 01120 m_fPercentQueryAfterScoreCutOff, m_bSquareZ, w); 01121 if(w==-1) continue; 01122 } 01123 else if(current_sm==EQUAL && redoWithEqual==0){ 01124 w = 1.0; 01125 } 01126 else if(current_sm==USE_WEIGHT){ 01127 w = (*providedWeight)[i][d]; 01128 } 01129 //simulated weight case ====================== 01130 else if(current_sm==EQUAL && redoWithEqual==1){ 01131 w = 1.0; 01132 if(DEBUG) fprintf(stderr, "Before doing one gene weighting\n"); 01133 //calculate reported weight here! 01134 CSeekWeighter::OneGeneWeighting(query, *m_vc[d], 0.95, 01135 m_fPercentQueryAfterScoreCutOff, m_bSquareZ, 01136 &m_rank_threads[tid], &equalWeightGold); 01137 report_w = m_vc[d]->GetDatasetSumWeight(); 01138 } 01139 //============================================ 01140 01141 if(DEBUG) fprintf(stderr, "Doing linear combination\n"); 01142 01143 const utype MIN_REQUIRED = max((utype) 1, (utype) ( 01144 m_fPercentQueryAfterScoreCutOff * this_q.size())); 01145 CSeekWeighter::LinearCombine(m_rank_normal_threads[tid], this_q, 01146 *m_vc[d], MIN_REQUIRED, m_bSquareZ); 01147 01148 if(DEBUG) fprintf(stderr, 01149 "Adding contribution of dataset %d to master ranking: %.5f\n", d, w); 01150 01151 utype iGeneSet = mapG->GetNumSet(); 01152 const vector<utype> &allRGenes = mapG->GetAllReverse(); 01153 vector<utype>::const_iterator iterR = allRGenes.begin(); 01154 vector<utype>::const_iterator endR = allRGenes.begin() + iGeneSet; 01155 vector<utype> &Rank_Normal = m_rank_normal_threads[tid]; 01156 float* Master_Rank = &m_master_rank_threads[tid][0]; 01157 float* Sum_Weight = &m_sum_weight_threads[tid][0]; 01158 float* Sum_Sq_Weight = &m_sum_sq_weight_threads[tid][0]; 01159 utype* Counts = &m_counts_threads[tid][0]; 01160 01161 if(current_sm==ORDER_STATISTICS) 01162 for(; iterR!=endR; iterR++){ 01163 //if(Rank_Normal[*iterR]==0) continue; 01164 m_rank_d[dd][*iterR] = Rank_Normal[*iterR]; 01165 Counts[*iterR]++; 01166 } 01167 else 01168 for(; iterR!=endR; iterR++){ 01169 //if(Rank_Normal[*iterR]==0) continue; 01170 Master_Rank[*iterR] += (float) Rank_Normal[*iterR] * w; 01171 Sum_Weight[*iterR] += w; 01172 Sum_Sq_Weight[*iterR] += w * w; 01173 Counts[*iterR]++; 01174 } 01175 01176 //simulated weight case ====================== 01177 if(current_sm==EQUAL && redoWithEqual==1){ 01178 weight[d] = report_w; 01179 } 01180 //============================================ 01181 else if((current_sm==EQUAL || current_sm==ORDER_STATISTICS) 01182 && redoWithEqual==0){ 01183 weight[d] = 0; 01184 } 01185 else{ 01186 weight[d] = w; 01187 } 01188 01189 } 01190 //omp finishes 01191 //fprintf(stderr, "3 %lu\n", CMeta::GetMemoryUsage()); 01192 for(j=0; j<iSearchDatasets; j++) 01193 m_vc[allRDatasets[j]]->DeleteQuery(); 01194 01195 assert(m_rData!=NULL); 01196 for(j=0; j<m_numThreads; j++) 01197 CSeekTools::Free2DArray(m_rData[j]); 01198 delete[] m_rData; 01199 m_rData = NULL; 01200 01201 AggregateThreads(); 01202 01203 if(current_sm!=ORDER_STATISTICS){ 01204 FilterResults(iSearchDatasets); 01205 }else{ 01206 CSeekWeighter::OrderStatisticsRankAggregation(iSearchDatasets, 01207 m_iGenes, m_rank_d, m_counts, m_master_rank, m_numThreads); 01208 CSeekTools::Free2DArray(m_rank_d); 01209 m_rank_d = NULL; 01210 } 01211 01212 //Display(query, final); 01213 //fprintf(stderr, "4 %lu\n", CMeta::GetMemoryUsage()); 01214 SetQueryScoreNull(query); 01215 Sort(final); 01216 int ret; //for system calls 01217 01218 if(m_bRandom){ 01219 /*utype z, cz; 01220 for(cz=0, z=0; z<m_iDatasets; z++) 01221 if(m_weight[i][z]!=0) 01222 cz++; 01223 fprintf(stderr, "Number of weighted dataset: %d\n", cz); 01224 */ 01225 } 01226 else if(simulateWeight){ 01227 if((current_sm==EQUAL || current_sm==ORDER_STATISTICS) && !CheckWeight(i)){ 01228 fprintf(stderr, "Calculate dataset ordering\n"); 01229 ret = system("date +%s%N 1>&2"); 01230 if(m_bEnableNetwork && CSeekNetwork::Send(m_iClient, 01231 "Calculate dataset ordering")==-1){ 01232 fprintf(stderr, "Error sending message to client\n"); 01233 } 01234 CopyTopGenes(equalWeightGold, final, 100); 01235 redoWithEqual = 1; 01236 if(current_sm==ORDER_STATISTICS){ 01237 current_sm = EQUAL; 01238 //backup genes to old 01239 copy(m_master_rank.begin(), m_master_rank.end(), backupScore.begin()); 01240 } 01241 i--; 01242 continue; 01243 } 01244 else if(current_sm==CV && !CheckWeight(i)){ 01245 fprintf(stderr, "Redo with equal weighting\n"); 01246 ret = system("date +%s%N 1>&2"); 01247 if(m_bEnableNetwork && CSeekNetwork::Send(m_iClient, 01248 "Redo with equal weighting")==-1){ 01249 fprintf(stderr, "Error sending message to client\n"); 01250 } 01251 current_sm = EQUAL; 01252 i--; 01253 continue; 01254 } 01255 else if(current_sm==EQUAL && CheckWeight(i)){ 01256 redoWithEqual = 0; 01257 current_sm = sm; 01258 //copy genes from old to new 01259 if(sm==ORDER_STATISTICS){ 01260 copy(backupScore.begin(), backupScore.end(), m_master_rank.begin()); 01261 } 01262 } 01263 } 01264 01265 fprintf(stderr, "Done search\n"); 01266 ret = system("date +%s%N 1>&2"); 01267 01268 if(m_bEnableNetwork && CSeekNetwork::Send(m_iClient, "Done Search")==-1){ 01269 fprintf(stderr, "Error sending message to client\n"); 01270 } 01271 01272 //random-ranking case ========================= 01273 if(m_bRandom){ 01274 sort(m_master_rank.begin(), m_master_rank.end(), greater<float>()); 01275 sort(weight.begin(), weight.end(), greater<float>()); 01276 copy(m_master_rank.begin(), m_master_rank.end(), vecRandScore[l].begin()); 01277 copy(weight.begin(), weight.end(), vecRandWeight[l].begin()); 01278 l++; 01279 01280 if(l==m_iNumRandom){ //last repetition 01281 float confidence = 0.50; 01282 int n = (int) (confidence * (float) m_iNumRandom); 01283 vector<float> new_weight, new_score; 01284 CSeekTools::InitVector(new_weight, m_iDatasets, (float) 0); 01285 CSeekTools::InitVector(new_score, m_iGenes, (float) 0); 01286 for(k=0; k<m_iGenes; k++){ 01287 vector<float> all_s; 01288 all_s.resize(m_iNumRandom); 01289 for(l=0; l<m_iNumRandom; l++) 01290 all_s[l] = vecRandScore[l][k]; 01291 std::nth_element(all_s.begin(), all_s.begin()+n, all_s.end()); 01292 new_score[k] = all_s[n]; 01293 } 01294 /*for(k=0; k<m_iGenes; k++){ 01295 fprintf(stderr, "%d %.5f\n", k, new_score[k]); 01296 } 01297 getchar();*/ 01298 for(k=0; k<m_iDatasets; k++){ 01299 vector<float> all_w; 01300 all_w.resize(m_iNumRandom); 01301 for(l=0; l<m_iNumRandom; l++) 01302 all_w[l] = vecRandWeight[l][k]; 01303 std::nth_element(all_w.begin(), all_w.begin()+n, all_w.end()); 01304 new_weight[k] = all_w[n]; 01305 } 01306 sprintf(acBuffer, "%s/%d.rand.dweight", m_output_dir.c_str(), i); 01307 CSeekTools::WriteArray(acBuffer, new_weight); 01308 sprintf(acBuffer, "%s/%d.rand.gscore", m_output_dir.c_str(), i); 01309 CSeekTools::WriteArray(acBuffer, new_score); 01310 l = 0; 01311 }else{ 01312 i--; 01313 } 01314 } 01315 01316 if(!m_bRandom){ 01317 //if m_bRandom, write at the very end when all repetitions are done 01318 Write(i); 01319 if(weightComponent){ 01320 sprintf(acBuffer, "%s/%d.dweight_comp", m_output_dir.c_str(), i); 01321 CSeekTools::WriteArray(acBuffer, wc); 01322 vector<vector<string> > vecParts; 01323 vecParts.resize(query.GetNumFold()); 01324 utype kk; 01325 string strParts = ""; 01326 for(j=0; j<query.GetNumFold(); j++){ 01327 vecParts[j] = vector<string>(); 01328 const vector<utype> &vu = query.GetCVQuery(j); 01329 string s = ""; 01330 for(kk=0; kk<vu.size(); kk++){ 01331 vecParts[j].push_back(m_vecstrGenes[vu[kk]]); 01332 s+=m_vecstrGenes[vu[kk]]; 01333 if(kk!=vu.size()-1) 01334 s+=" "; 01335 } 01336 strParts+=s; 01337 if(j!=query.GetNumFold()-1) 01338 strParts+="|"; 01339 } 01340 sprintf(acBuffer, "%s/%d.query_cvpart", m_output_dir.c_str(), i); 01341 CSeekTools::Write2DArrayText(acBuffer, vecParts); 01342 //send data to client 01343 if(m_bEnableNetwork){ 01344 if(CSeekNetwork::Send(m_iClient, wc)==-1){ 01345 fprintf(stderr, "Error sending message to client\n"); 01346 return false; 01347 } 01348 if(CSeekNetwork::Send(m_iClient, strParts)==-1){ 01349 fprintf(stderr, "Error sending message to client\n"); 01350 return false; 01351 } 01352 } 01353 } 01354 } 01355 } 01356 01357 //fprintf(stderr, "4b %lu\n", CMeta::GetMemoryUsage()); 01358 01359 return true; 01360 } 01361 01362 bool CSeekCentral::CheckWeight(const utype &i){ 01363 utype j = 0; 01364 bool valid = false; 01365 for(j=0; j<m_iDatasets; j++){ 01366 if(m_weight[i][j]!=0){ 01367 valid = true; 01368 break; 01369 } 01370 } 01371 return valid; 01372 } 01373 01374 bool CSeekCentral::CopyTopGenes(CSeekQuery &csq, 01375 const vector<AResultFloat> &src, const utype top){ 01376 utype i, j; 01377 vector<utype> topGenes; 01378 for(i=0; i<top; i++){ 01379 if(src[i].f==-320) continue; 01380 topGenes.push_back(src[i].i); 01381 } 01382 if(topGenes.size()==0){ 01383 fprintf(stderr, "Error in CopyTopGenes!\n"); 01384 return false; 01385 } 01386 csq.InitializeQuery(topGenes, m_iGenes); 01387 return true; 01388 } 01389 01390 bool CSeekCentral::SetQueryScoreNull(const CSeekQuery &csq){ 01391 utype j; 01392 const vector<utype> &query = csq.GetQuery(); 01393 for(j=0; j<query.size(); j++){ 01394 m_master_rank[query[j]] = -320; 01395 } 01396 return true; 01397 } 01398 01399 bool CSeekCentral::EqualWeightSearch(){ 01400 CSeekCentral::SearchMode sm = EQUAL; 01401 return CSeekCentral::Common(sm); 01402 } 01403 01404 bool CSeekCentral::CVSearch(gsl_rng *rnd, const CSeekQuery::PartitionMode &PART_M, 01405 const utype &FOLD, const float &RATE){ 01406 CSeekCentral::SearchMode sm = CV; 01407 return CSeekCentral::Common(sm, rnd, &PART_M, &FOLD, &RATE); 01408 } 01409 01410 /* perform CVSearch, except that the weighting is based not on co-expression 01411 of query genes, but based on similarity of query genes to some custom gold 01412 standard gene-set */ 01413 bool CSeekCentral::CVCustomSearch(const vector< vector<string> > &newGoldStd, 01414 gsl_rng *rnd, const CSeekQuery::PartitionMode &PART_M, 01415 const utype &FOLD, const float &RATE){ 01416 CSeekCentral::SearchMode sm = CV_CUSTOM; 01417 return CSeekCentral::Common(sm, rnd, &PART_M, &FOLD, &RATE, 01418 NULL, &newGoldStd); 01419 } 01420 01421 bool CSeekCentral::WeightSearch(const vector< vector<float> > &weights){ 01422 CSeekCentral::SearchMode sm = USE_WEIGHT; 01423 return CSeekCentral::Common(sm, NULL, NULL, NULL, NULL, &weights); 01424 } 01425 01426 bool CSeekCentral::OrderStatistics(){ 01427 CSeekCentral::SearchMode sm = ORDER_STATISTICS; 01428 return CSeekCentral::Common(sm); 01429 } 01430 01431 bool CSeekCentral::AverageWeightSearch(){ 01432 CSeekCentral::SearchMode sm = AVERAGE_Z; 01433 return CSeekCentral::Common(sm, NULL, NULL, NULL, NULL); 01434 } 01435 01436 bool CSeekCentral::VarianceWeightSearch(){ 01437 vector<vector<float> > weights; 01438 weights.resize(m_vecstrAllQuery.size()); 01439 utype i, j, k; 01440 for(i=0; i<m_vecstrAllQuery.size(); i++){ 01441 weights[i] = vector<float>(); 01442 CSeekTools::InitVector(weights[i], m_iDatasets, (float)0); 01443 const vector<utype> &allRDatasets = 01444 m_searchdsetMap[i]->GetAllReverse(); 01445 utype iSearchDatasets = m_searchdsetMap[i]->GetNumSet(); 01446 01447 CSeekQuery q; 01448 PrepareQuery(m_vecstrAllQuery[i], q); 01449 const vector<utype> &qGenes = q.GetQuery(); 01450 01451 for(j=0; j<iSearchDatasets; j++){ 01452 utype d = allRDatasets[j]; 01453 for(k=0; k<qGenes.size(); k++){ 01454 float gv = m_vc[d]->GetGeneVariance(qGenes[k]); 01455 if(CMeta::IsNaN(gv)) continue; 01456 weights[i][d] += gv; 01457 } 01458 } 01459 } 01460 return WeightSearch(weights); 01461 } 01462 01463 bool CSeekCentral::Destruct(){ 01464 utype j; 01465 //for(j=0; j<m_iDatasets; j++) m_vc[j]->DeleteQueryBlock(); 01466 for(j=0; j<m_iDatasets; j++){ 01467 if(m_vc[j]!=NULL) continue; 01468 delete m_vc[j]; 01469 m_vc[j] = NULL; 01470 } 01471 return true; 01472 } 01473 01474 const vector< vector<AResultFloat> >& CSeekCentral::GetAllResult() const{ 01475 return m_final; 01476 } 01477 01478 const vector<CSeekQuery>& CSeekCentral::GetAllQuery() const{ 01479 return m_Query; 01480 } 01481 01482 utype CSeekCentral::GetGene(const string &strGene) const{ 01483 if(m_mapstrintGene.find(strGene)==m_mapstrintGene.end()) 01484 return CSeekTools::GetNaN(); 01485 return m_mapstrintGene.find(strGene)->second; 01486 } 01487 string CSeekCentral::GetGene(const utype &geneID) const{ 01488 return m_vecstrGenes[(size_t) geneID]; 01489 } 01490 01491 const vector<vector<float> >& CSeekCentral::GetAllWeight() const{ 01492 return m_weight; 01493 } 01494 } 01495