Sleipnir
src/seekcentral.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "seekcentral.h"
00023 
00024 namespace Sleipnir {
00025 
00026 CSeekCentral::CSeekCentral(){
00027     m_vecstrGenes.clear();
00028     m_vecstrDatasets.clear();
00029     m_vecstrSearchDatasets.clear();
00030     m_mapstrstrDatasetPlatform.clear();
00031     m_vc.clear();
00032     m_quant.clear();
00033     m_vecstrAllQuery.clear();
00034     m_vp.clear();
00035     m_mapstriPlatform.clear();
00036     m_vecstrPlatform.clear();
00037     m_vecstrDP.clear();
00038     m_mapstrintDataset.clear();
00039     m_mapstrintGene.clear();
00040     m_searchdsetMap.clear();
00041     m_vecDB.clear();
00042     m_vecDBDataset.clear();
00043     m_rData = NULL;
00044     m_maxNumDB = 50;
00045 
00046     m_master_rank_threads = NULL;
00047     m_sum_weight_threads = NULL;
00048     m_sum_sq_weight_threads = NULL; //sum squared weight
00049     m_counts_threads = NULL;
00050     m_rank_normal_threads = NULL;
00051     m_rank_threads = NULL;
00052     m_rank_d = NULL;
00053 
00054     m_master_rank.clear();
00055     m_sum_weight.clear();
00056     m_sum_sq_weight.clear();
00057     m_weight.clear();
00058     m_counts.clear();
00059     m_mapLoadTime.clear();
00060 
00061     m_Query.clear();
00062     m_final.clear();
00063 
00064     m_iDatasets = 0;
00065     m_iGenes = 0;
00066     m_numThreads = 0;
00067     m_bSubtractGeneAvg = false;
00068     m_bNormPlatform = false;
00069     m_bLogit = false;
00070     m_eDistMeasure = CSeekDataset::Z_SCORE;
00071     m_bOutputWeightComponent = false;
00072     m_bSimulateWeight = false;
00073     m_bOutputText = false;
00074     m_bSquareZ = false;
00075     m_bSharedDB = false;
00076 
00077     DEBUG = false;
00078     m_output_dir = "";
00079 
00080     m_iClient = -1;
00081     m_bEnableNetwork = false;
00082 }
00083 
00084 CSeekCentral::~CSeekCentral(){
00085     m_vecstrGenes.clear();
00086     m_vecstrDatasets.clear();
00087     m_vecstrSearchDatasets.clear();
00088     m_mapstrstrDatasetPlatform.clear();
00089     m_vecstrDP.clear();
00090     m_mapstrintDataset.clear();
00091     m_mapstrintGene.clear();
00092 
00093     utype i, j;
00094     for(i=0; i<m_vc.size(); i++){
00095         if(m_vc[i]==NULL) continue;
00096         delete m_vc[i];
00097     }
00098     m_vc.clear();
00099     
00100     m_quant.clear();
00101 
00102     for(i=0; i<m_searchdsetMap.size(); i++){
00103         if(m_searchdsetMap[i]==NULL) continue;
00104         delete m_searchdsetMap[i];
00105         m_searchdsetMap[i] = NULL;
00106     }
00107     m_searchdsetMap.clear();
00108 
00109     //m_rData, m_master_rank_threads,
00110     //m_sum_weight_threads, m_counts_threads
00111     //m_rank_normal_threads, and m_rank_threads
00112     //should be already freed
00113     if(m_master_rank_threads!=NULL){
00114         CSeekTools::Free2DArray(m_master_rank_threads);
00115         m_master_rank_threads = NULL;       
00116     }
00117     if(m_sum_weight_threads != NULL){
00118         CSeekTools::Free2DArray(m_sum_weight_threads);
00119         m_sum_weight_threads = NULL;
00120     }
00121     if(m_sum_sq_weight_threads != NULL){
00122         CSeekTools::Free2DArray(m_sum_sq_weight_threads);
00123         m_sum_sq_weight_threads = NULL;
00124     }
00125     if(m_counts_threads !=NULL){
00126         CSeekTools::Free2DArray(m_counts_threads);
00127         m_counts_threads = NULL;
00128     }
00129     if(m_rank_normal_threads!=NULL){
00130         for(j=0; j<m_numThreads; j++)
00131             m_rank_normal_threads[j].clear();
00132         delete[] m_rank_normal_threads;
00133         m_rank_normal_threads = NULL;
00134     }
00135     if(m_rank_threads !=NULL){
00136         for(j=0; j<m_numThreads; j++)
00137             m_rank_threads[j].clear();
00138         delete[] m_rank_threads;
00139         m_rank_threads = NULL;
00140     }
00141     if(m_rank_d!=NULL){
00142         CSeekTools::Free2DArray(m_rank_d);
00143         m_rank_d = NULL;
00144     }
00145 
00146     m_master_rank.clear();
00147     m_sum_weight.clear();
00148     m_sum_sq_weight.clear();
00149     m_counts.clear();
00150     m_weight.clear();
00151     m_final.clear();
00152     m_vecDBDataset.clear();
00153 
00154     m_vecstrAllQuery.clear();
00155     m_Query.clear();
00156 
00157     m_vp.clear();
00158 
00159     m_mapstriPlatform.clear();
00160     m_vecstrPlatform.clear();
00161 
00162     if(m_vecDB.size()!=0){
00163         if(!m_bSharedDB){
00164             for(i=0; i<m_vecDB.size(); i++)
00165                 delete m_vecDB[i];
00166         }
00167         for(i=0; i<m_vecDB.size(); i++)
00168             m_vecDB[i] = NULL;
00169         m_vecDB.clear();
00170     }
00171 
00172     /*if(m_DB!=NULL){
00173         if(!m_bSharedDB){
00174             delete m_DB;
00175         }
00176         m_DB = NULL;
00177     }*/
00178     m_iDatasets = 0;
00179     m_iGenes = 0;
00180     m_numThreads = 0;
00181     m_mapLoadTime.clear();
00182     m_output_dir = "";
00183     DEBUG = false;
00184     m_bSharedDB = false;
00185 }
00186 
00187 bool CSeekCentral::CalculateRestart(){
00188     set<string> ss;
00189     m_mapLoadTime.clear();
00190     utype i, prev;
00191     prev = 0;
00192     for(i=0; i<m_vecstrAllQuery.size(); i++){
00193         if(m_vecstrAllQuery[i].size()>m_maxNumDB){
00194             fprintf(stderr, "Cannot fit query %d in buffer\n", i);
00195             return false;
00196         }
00197     }
00198 
00199     vector<vector<string> > *vv = NULL;
00200     m_mapLoadTime[prev] = vector< vector<string> >();
00201     for(i=0; i<m_vecstrAllQuery.size(); i++){
00202         ss.insert(m_vecstrAllQuery[i].begin(), m_vecstrAllQuery[i].end());
00203         vv = &(m_mapLoadTime[prev]);
00204         vv->push_back(m_vecstrAllQuery[i]);
00205         if(ss.size()>m_maxNumDB){
00206             vv->pop_back();
00207             ss.clear();
00208             ss.insert(m_vecstrAllQuery[i].begin(), m_vecstrAllQuery[i].end());
00209             prev = i;
00210             m_mapLoadTime[prev] = vector< vector<string> >();
00211             vv = &(m_mapLoadTime[prev]);
00212             vv->push_back(m_vecstrAllQuery[i]);
00213         }
00214     }
00215     ss.clear();
00216 
00217     //check
00218     vector<char> vc;
00219     utype tot = 0;
00220     CSeekTools::InitVector(vc, m_vecstrAllQuery.size(), (char) 0);
00221     map<utype, vector< vector<string> > >::const_iterator ci =
00222         m_mapLoadTime.begin();
00223     for(; ci!=m_mapLoadTime.end(); ci++) tot+=(ci->second).size();
00224     vc.clear();
00225     if(tot!=m_vecstrAllQuery.size()){
00226         fprintf(stderr, "ERROR, size do not match\n");
00227         return false;
00228     }
00229     return true;
00230 }
00231 
00232 //for SeekServer
00233 //assume DB has been read (with gvar, sinfo information)
00234 //assume datasets and genes have been read
00235 //assume m_enableNetwork is on
00236 //* CDatabaselet collection is shared between multiple clients (m_bSharedDB)
00237 bool CSeekCentral::Initialize(
00238     const string &output_dir, const string &query,
00239     const string &search_dset, CSeekCentral *src, const int iClient,
00240     const float query_min_required, const float genome_min_required,
00241     const enum CSeekDataset::DistanceMeasure eDistMeasure,
00242     const bool bSubtractGeneAvg, const bool bNormPlatform){
00243 
00244     fprintf(stderr, "Request received from client\n");
00245 
00246     m_output_dir = output_dir; //LATER, TO BE DELETED
00247     m_maxNumDB = src->m_maxNumDB;
00248     m_bSharedDB = true;
00249     m_numThreads = src->m_numThreads;
00250     m_fScoreCutOff = src->m_fScoreCutOff;
00251     m_fPercentQueryAfterScoreCutOff = query_min_required;
00252     m_fPercentGenomeRequired = genome_min_required;
00253     m_bSquareZ = src->m_bSquareZ;
00254     m_bOutputText = src->m_bOutputText;
00255     m_bSubtractGeneAvg = bSubtractGeneAvg;
00256     m_bNormPlatform = bNormPlatform;
00257     m_bLogit = src->m_bLogit;
00258     m_eDistMeasure = eDistMeasure;
00259 
00260     m_bOutputWeightComponent = src->m_bOutputWeightComponent;
00261     m_bSimulateWeight = src->m_bSimulateWeight;
00262 
00263     m_bRandom = false;
00264     m_iNumRandom = 1;
00265     m_randRandom = NULL;    
00266 
00267     m_vecstrGenes.resize(src->m_vecstrGenes.size());
00268     copy(src->m_vecstrGenes.begin(), src->m_vecstrGenes.end(), m_vecstrGenes.begin());
00269 
00270     m_vecstrDatasets.resize(src->m_vecstrDatasets.size());
00271     copy(src->m_vecstrDatasets.begin(), src->m_vecstrDatasets.end(), m_vecstrDatasets.begin());
00272 
00273     m_mapstrintDataset.insert(src->m_mapstrintDataset.begin(), 
00274         src->m_mapstrintDataset.end());
00275 
00276     m_mapstrintGene.insert(src->m_mapstrintGene.begin(), src->m_mapstrintGene.end());
00277 
00278     m_mapstrstrDatasetPlatform.insert(src->m_mapstrstrDatasetPlatform.begin(),
00279         src->m_mapstrstrDatasetPlatform.end());
00280     m_mapstriPlatform.insert(src->m_mapstriPlatform.begin(), src->m_mapstriPlatform.end());
00281 
00282     m_vecstrPlatform.resize(src->m_vecstrPlatform.size());
00283     copy(src->m_vecstrPlatform.begin(), src->m_vecstrPlatform.end(), m_vecstrPlatform.begin());
00284 
00285     m_vecstrDP.resize(src->m_vecstrDP.size());
00286     copy(src->m_vecstrDP.begin(), src->m_vecstrDP.end(), m_vecstrDP.begin());
00287 
00288     m_quant = src->m_quant;
00289     utype i, j;
00290     omp_set_num_threads(m_numThreads);
00291 
00292     m_iDatasets = m_vecstrDatasets.size();
00293     m_iGenes = m_vecstrGenes.size();
00294 
00295     //read search datasets
00296     vector<string> sd;
00297     CMeta::Tokenize(search_dset.c_str(), sd, "|", false);
00298     m_vecstrSearchDatasets.resize(sd.size());
00299     for(i=0; i<sd.size(); i++){
00300         CMeta::Tokenize(sd[i].c_str(), m_vecstrSearchDatasets[i], " ", false);
00301     }
00302     //read queries
00303     vector<string> sq;
00304     CMeta::Tokenize(query.c_str(), sq, "|", false);
00305     m_vecstrAllQuery.resize(sq.size());
00306     for(i=0; i<sq.size(); i++){
00307         CMeta::Tokenize(sq[i].c_str(), m_vecstrAllQuery[i], " ", false);
00308     }
00309 
00310     m_searchdsetMap.resize(m_vecstrAllQuery.size());
00311     for(i=0; i<m_vecstrAllQuery.size(); i++){
00312         m_searchdsetMap[i] = new CSeekIntIntMap(m_vecstrDatasets.size());
00313         for(j=0; j<m_vecstrSearchDatasets[i].size(); j++)
00314             m_searchdsetMap[i]->Add(
00315                 m_mapstrintDataset[m_vecstrSearchDatasets[i][j]]);
00316     }
00317 
00318     m_vecDB.resize(src->m_vecDB.size());
00319     for(i=0; i<m_vecDB.size(); i++)
00320         m_vecDB[i] = src->m_vecDB[i];
00321     //m_DB = src->m_DB; //shared DB
00322 
00323     m_vecDBDataset.resize(src->m_vecDB.size());
00324     for(i=0; i<m_vecDB.size(); i++){
00325         m_vecDBDataset[i].resize(src->m_vecDBDataset[i].size());
00326         copy(src->m_vecDBDataset[i].begin(), src->m_vecDBDataset[i].end(),
00327         m_vecDBDataset[i].begin());
00328     }
00329 
00330     CSeekTools::LoadDatabase(m_vecDB, m_iGenes, m_iDatasets,
00331         m_vc, src->m_vc, m_vp, src->m_vp, m_vecstrDatasets,
00332         m_mapstrstrDatasetPlatform, m_mapstriPlatform);
00333 
00334     if(!CalculateRestart()){
00335         fprintf(stderr, "Error occurred during CalculateRestart()\n");
00336         return false;
00337     }
00338 
00339     //fprintf(stderr, "Finished CalculateRestart()\n");
00340 
00341     if(!EnableNetwork(iClient)){
00342         fprintf(stderr, "Error occurred during EnableNetwork()\n");
00343         return false;
00344     }
00345 
00346     //fprintf(stderr, "Finished EnableNetworks()\n");
00347 
00348     if(!CheckDatasets(true)){ //replace parameter is true
00349         fprintf(stderr, "Error occurred during CheckDatasets()\n");
00350         return false;
00351     }
00352 
00353     //fprintf(stderr, "Finished CheckDatasets()\n");
00354     return true;
00355 }
00356 
00357 //network mode, meant to be run after Initialize()
00358 bool CSeekCentral::EnableNetwork(const int &iClient){
00359     m_bEnableNetwork = true;
00360     m_iClient = iClient; //assume client connection is already open
00361     return true;
00362 }
00363 
00364 //optional step
00365 //Checks how many datasets contain the query
00366 //requires the queries and searchdatasets to be loaded!
00367 bool CSeekCentral::CheckDatasets(const bool &replace){
00368     utype dd, j;
00369     utype l;
00370 
00371     stringstream ss; //search dataset (new!)
00372     stringstream sq; //query availability
00373     stringstream aq; //query (new!)
00374 
00375     int maxGCoverage = GetMaxGenomeCoverage();
00376 
00377     for(l=0; l<m_searchdsetMap.size(); l++){
00378         utype iUserDatasets = m_searchdsetMap[l]->GetNumSet();
00379         const vector<utype> &allRDatasets = m_searchdsetMap[l]->GetAllReverse();    
00380         vector<int> count;
00381         CSeekTools::InitVector(count, m_vecstrAllQuery[l].size(), (int) 0);
00382         bool isFirst = true;
00383         //fprintf(stderr, "iUserDatasets %d\n", iUserDatasets);
00384 
00385         for(dd=0; dd<iUserDatasets; dd++){
00386             utype i = allRDatasets[dd];
00387             CSeekIntIntMap *si = m_vc[i]->GetGeneMap();
00388             utype present = 0;
00389             for(j=0, present=0; j<m_vecstrAllQuery[l].size(); j++){
00390                 if(m_mapstrintGene.find(m_vecstrAllQuery[l][j])==
00391                     m_mapstrintGene.end()) continue;
00392                 if(CSeekTools::IsNaN(si->GetForward(
00393                     m_mapstrintGene[m_vecstrAllQuery[l][j]]))) continue;
00394                 count[j]++;
00395                 present++;
00396             }
00397 
00398             //datasets that contains all query genes (very stringent)
00399             //if(present==m_vecstrAllQuery[l].size()){
00400 
00401             int minRequired = 0;
00402             if(m_vecstrAllQuery[l].size()>=5){
00403                 minRequired = (int) (m_fPercentQueryAfterScoreCutOff * 
00404                 m_vecstrAllQuery[l].size());
00405             }else if(m_vecstrAllQuery[l].size()<=2){
00406                 minRequired = m_vecstrAllQuery[l].size();
00407             }else{
00408                 minRequired = 2;
00409             }
00410             
00411             int minGRequired = (int)(m_fPercentGenomeRequired*
00412             (float) maxGCoverage);
00413 
00414             //datasets containing some query genes (relaxed) [ 0 ] 
00415             if(present>0 && present>=minRequired && 
00416             si->GetNumSet()>=minGRequired){
00417                 if(isFirst){
00418                     isFirst = false;
00419                     ss << m_vecstrDatasets[i];
00420                 }else{
00421                     ss << " " << m_vecstrDatasets[i];
00422                 }
00423             }
00424         }
00425 
00426         if(isFirst){
00427             string err = "Error: no dataset contains any of the query genes";
00428             fprintf(stderr, "%s\n", err.c_str());
00429             if(m_bEnableNetwork)
00430                 CSeekNetwork::Send(m_iClient, err);
00431             return false;
00432         }
00433 
00434         if(l!=m_searchdsetMap.size()-1){
00435             ss << "|";
00436         }
00437 
00438         //fprintf(stderr, "ss %s\n", ss.str().c_str());
00439         isFirst = true;     
00440         for(j=0; j<m_vecstrAllQuery[l].size(); j++){
00441             sq << m_vecstrAllQuery[l][j] << ":" << count[j];
00442             if(j!=m_vecstrAllQuery[l].size()-1){
00443                 sq << ";";
00444             }
00445             if(count[j]==0) continue;
00446             if(isFirst){
00447                 isFirst = false;
00448                 aq << m_vecstrAllQuery[l][j];
00449             }else{
00450                 aq << " " << m_vecstrAllQuery[l][j];
00451             }
00452         }
00453 
00454         if(isFirst){
00455             string err = "Error: no dataset contains any of the query genes";
00456             fprintf(stderr, "%s\n", err.c_str());
00457             if(m_bEnableNetwork)
00458                 CSeekNetwork::Send(m_iClient, err);
00459             return false;
00460         }
00461 
00462         if(l!=m_searchdsetMap.size()-1){
00463             aq << "|";
00464             sq << "|";
00465         }
00466 
00467         //fprintf(stderr, "sq %s\n", sq.str().c_str());
00468         //fprintf(stderr, "aq %s\n", aq.str().c_str());
00469     }
00470 
00471     string refinedQuery = aq.str();
00472     string refinedSearchDataset = ss.str();
00473     string refinedGeneCount = sq.str();
00474     if(m_bEnableNetwork){
00475         CSeekNetwork::Send(m_iClient, refinedSearchDataset);
00476         CSeekNetwork::Send(m_iClient, refinedGeneCount);
00477     }
00478 
00479     if(replace){
00480         vector<string> qq;
00481         utype i;
00482         CMeta::Tokenize(refinedQuery.c_str(), qq, "|", true);
00483         m_vecstrAllQuery.resize(qq.size());
00484         for(i=0; i<qq.size(); i++){
00485             m_vecstrAllQuery[i].clear();
00486             CMeta::Tokenize(qq[i].c_str(), m_vecstrAllQuery[i], " ", true);
00487         }
00488 
00489         //Change the search datasets
00490         vector<string> sd;
00491         CMeta::Tokenize(refinedSearchDataset.c_str(), sd, "|", false);
00492         m_vecstrSearchDatasets.resize(sd.size());
00493         for(i=0; i<sd.size(); i++){
00494             m_vecstrSearchDatasets[i].clear();
00495             CMeta::Tokenize(sd[i].c_str(), m_vecstrSearchDatasets[i], " ", false);
00496         }
00497 
00498         //fprintf(stderr, "replace datasets %d %d\n", sd.size(), m_vecstrSearchDatasets[0].size());
00499         //fprintf(stderr, "searchdsetMap size %d\n", m_searchdsetMap.size());
00500 
00501         for(i=0; i<m_searchdsetMap.size(); i++){
00502             delete m_searchdsetMap[i];
00503         }
00504         m_searchdsetMap.clear();
00505         //fprintf(stderr, "cleared searchdsetMap\n");
00506 
00507         m_searchdsetMap.resize(m_vecstrAllQuery.size());
00508         for(i=0; i<m_vecstrAllQuery.size(); i++){
00509             m_searchdsetMap[i] = new CSeekIntIntMap(m_vecstrDatasets.size());
00510             for(j=0; j<m_vecstrSearchDatasets[i].size(); j++)
00511                 m_searchdsetMap[i]->Add(
00512                     m_mapstrintDataset[m_vecstrSearchDatasets[i][j]]);
00513         }
00514 
00515     }
00516 
00517     return true;    
00518 }
00519 
00520 //load everything except query, search datasets, output directory
00521 bool CSeekCentral::Initialize(const vector<CSeekDBSetting*> &vecDBSetting,
00522     const utype buffer, const bool to_output_text,
00523     const bool bOutputWeightComponent, const bool bSimulateWeight,
00524     const enum CSeekDataset::DistanceMeasure dist_measure,
00525     const bool bSubtractAvg, const bool bNormPlatform,
00526     const bool bLogit, const float fCutOff, const float fPercentQueryRequired,
00527     const float fPercentGenomeRequired,
00528     const bool bSquareZ, const bool bRandom, const int iNumRandom,
00529     gsl_rng *rand, const bool useNibble, const int numThreads){
00530 
00531     m_maxNumDB = buffer;
00532     m_numThreads = numThreads; //changed from 8
00533 
00534     m_fScoreCutOff = fCutOff;
00535     m_fPercentQueryAfterScoreCutOff = fPercentQueryRequired;
00536     m_fPercentGenomeRequired = fPercentGenomeRequired; 
00537     m_bSquareZ = bSquareZ;
00538 
00539     m_bOutputWeightComponent = bOutputWeightComponent;
00540     m_bSimulateWeight = bSimulateWeight;
00541 
00542     //random retrieval==========================
00543     m_bRandom = bRandom;
00544     m_iNumRandom = iNumRandom;
00545     m_randRandom = rand; //random-case only
00546     //==========================================
00547 
00548     if(!m_bRandom){
00549         m_randRandom = NULL;
00550     }
00551 
00552     utype i, j;
00553 
00554     omp_set_num_threads(m_numThreads);
00555 
00556     m_bOutputText = to_output_text;
00557     m_bSubtractGeneAvg = bSubtractAvg;
00558     m_bNormPlatform = bNormPlatform;
00559     m_bLogit = bLogit;
00560     m_eDistMeasure = dist_measure;
00561 
00562     bool bCorrelation = false;
00563 
00564     if(dist_measure==CSeekDataset::CORRELATION){
00565         bCorrelation = true;
00566         if(m_bSubtractGeneAvg || m_bNormPlatform || m_bLogit){
00567             fprintf(stderr,
00568                 "Warning: setting subtract_avg, norm platform to false\n");
00569             m_bSubtractGeneAvg = false;
00570             m_bNormPlatform = false;
00571             m_bLogit = false;
00572         }
00573     }
00574 
00575     //read genes
00576     vector<string> vecstrGeneID;
00577     if(!CSeekTools::ReadListTwoColumns(vecDBSetting[0]->GetValue("gene"),
00578         vecstrGeneID, m_vecstrGenes))
00579         return false;
00580     for(i=0; i<m_vecstrGenes.size(); i++)
00581         m_mapstrintGene[m_vecstrGenes[i]] = i;
00582 
00583     //read quant file
00584     CSeekTools::ReadQuantFile(vecDBSetting[0]->GetValue("quant"), m_quant);
00585 
00586     m_vecstrDatasets.clear();
00587     m_vecstrDP.clear();
00588     m_mapstriPlatform.clear();
00589     m_mapstrstrDatasetPlatform.clear();
00590     m_mapstrintDataset.clear();
00591     m_vp.clear();
00592 
00593     m_vecDB.resize(vecDBSetting.size());
00594     m_vecDBDataset.resize(vecDBSetting.size());
00595     for(i=0; i<vecDBSetting.size(); i++)
00596         m_vecDB[i] = NULL;
00597 
00598     for(i=0; i<vecDBSetting.size(); i++){
00599         if(dist_measure==CSeekDataset::CORRELATION &&
00600         vecDBSetting[i]->GetValue("sinfo")=="NA"){
00601             fprintf(stderr, "Error: not specifying sinfo!\n");
00602             return false;
00603         }
00604 
00605         m_vecDB[i] = new CDatabase(useNibble);
00606         //read datasets
00607         vector<string> vD, vDP;
00608         if(!CSeekTools::ReadListTwoColumns(vecDBSetting[i]->GetValue("dset"), vD, vDP))
00609             return false;
00610 
00611         for(j=0; j<vD.size(); j++){
00612             m_vecstrDatasets.push_back(vD[j]);
00613             m_vecDBDataset[i].push_back(vD[j]);
00614             m_vecstrDP.push_back(vDP[j]);
00615         }
00616 
00617         vector<string> vecstrPlatforms;
00618         map<string,utype> mapstriPlatform;
00619         vector<CSeekPlatform> vp;
00620         CSeekTools::ReadPlatforms(vecDBSetting[i]->GetValue("platform"), vp,
00621             vecstrPlatforms, mapstriPlatform);
00622         
00623         int cur = m_vp.size();
00624         for(map<string,utype>::iterator it=mapstriPlatform.begin();
00625             it!=mapstriPlatform.end(); it++){
00626             m_mapstriPlatform[it->first] = it->second + cur;
00627         }
00628 
00629         m_vp.resize(cur+vp.size());
00630         for(j=0; j<vp.size(); j++)
00631             m_vp[cur+j].Copy(vp[j]);
00632     }
00633 
00634     for(i=0; i<m_vecstrDatasets.size(); i++){
00635         m_mapstrstrDatasetPlatform[m_vecstrDatasets[i]] = m_vecstrDP[i];
00636         m_mapstrintDataset[m_vecstrDatasets[i]] = i;
00637     }
00638 
00639     m_iDatasets = m_vecstrDatasets.size();
00640     m_iGenes = m_vecstrGenes.size();
00641 
00642     for(i=0; i<vecDBSetting.size(); i++){
00643         m_vecDB[i]->Open(vecDBSetting[i]->GetValue("db"),
00644             m_vecstrGenes, m_vecDBDataset[i].size(), vecDBSetting[i]->GetNumDB());
00645     }
00646 
00647     CSeekTools::LoadDatabase(m_vecDB, m_iGenes, m_iDatasets,
00648         vecDBSetting, m_vecstrDatasets, m_mapstrstrDatasetPlatform,
00649         m_mapstriPlatform, m_vp, m_vc, m_vecDBDataset, m_mapstrintDataset,
00650         false, bCorrelation);
00651 
00652     return true;
00653 }
00654 
00655 bool CSeekCentral::Initialize(
00656     const vector<CSeekDBSetting*> &vecDBSetting,
00657     const char *search_dset, const char *query,
00658     const char *output_dir, const utype buffer, const bool to_output_text,
00659     const bool bOutputWeightComponent, const bool bSimulateWeight,
00660     const enum CSeekDataset::DistanceMeasure dist_measure,
00661     const bool bSubtractAvg, const bool bNormPlatform,
00662     const bool bLogit, const float fCutOff, 
00663     const float fPercentQueryRequired, const float fPercentGenomeRequired,
00664     const bool bSquareZ, const bool bRandom, const int iNumRandom,
00665     gsl_rng *rand, const bool useNibble, const int numThreads){
00666 
00667     if(!CSeekCentral::Initialize(vecDBSetting, buffer, to_output_text,
00668         bOutputWeightComponent, bSimulateWeight, dist_measure,
00669         bSubtractAvg, bNormPlatform, bLogit, fCutOff, 
00670         fPercentQueryRequired, fPercentGenomeRequired,
00671         bSquareZ, bRandom, iNumRandom, rand, useNibble, numThreads)){
00672         return false;
00673     }
00674 
00675     utype i, j;
00676     omp_set_num_threads(m_numThreads);
00677     m_output_dir = output_dir;
00678 
00679     //read queries
00680     if(!CSeekTools::ReadMultipleQueries(query, m_vecstrAllQuery))
00681         return false;
00682 
00683     //read search datasets
00684     string strSearchDset = search_dset;
00685     if(strSearchDset=="NA"){
00686         m_vecstrSearchDatasets.resize(m_vecstrAllQuery.size());
00687         for(i=0; i<m_vecstrAllQuery.size(); i++){
00688             m_vecstrSearchDatasets[i].resize(m_vecstrDatasets.size());
00689             for(j=0; j<m_vecstrDatasets.size(); j++){
00690                 m_vecstrSearchDatasets[i][j] = m_vecstrDatasets[j];
00691             }
00692         }
00693     }else{
00694         if(!CSeekTools::ReadMultipleQueries(search_dset, m_vecstrSearchDatasets)){
00695             fprintf(stderr, "Error reading search datasets\n");
00696             return false;
00697         }
00698         if(m_vecstrSearchDatasets.size()!=m_vecstrAllQuery.size()){
00699             fprintf(stderr, "Search_dset file doesn't have enough lines. Remember 1 line / query!\n");
00700             return false;
00701         }
00702     }
00703 
00704     m_searchdsetMap.resize(m_vecstrAllQuery.size());
00705     for(i=0; i<m_vecstrAllQuery.size(); i++){
00706         m_searchdsetMap[i] = new CSeekIntIntMap(m_vecstrDatasets.size());
00707         for(j=0; j<m_vecstrSearchDatasets[i].size(); j++){
00708             m_searchdsetMap[i]->Add(
00709                 m_mapstrintDataset[m_vecstrSearchDatasets[i][j]]);
00710         }
00711     }
00712 
00713     if(!CalculateRestart()){
00714         fprintf(stderr, "Error occurred during CalculateRestart()\n");
00715         return false;
00716     }
00717     //fprintf(stderr, "Finished CalculateRestart()\n");
00718 
00719     return true;
00720 }
00721 
00722 bool CSeekCentral::PrepareQuery(const vector<string> &vecstrQuery,
00723     CSeekQuery &query){
00724     vector<utype> queryGenes;
00725     utype j;
00726     for(j=0; j<vecstrQuery.size(); j++){
00727         if(m_mapstrintGene.find(vecstrQuery[j])==
00728             m_mapstrintGene.end()) continue;
00729         //size_t m = m_DB->GetGene(vecstrQuery[j]);
00730         //if(m==-1) continue;
00731         //queryGenes.push_back(m);
00732         queryGenes.push_back(m_mapstrintGene[vecstrQuery[j]]);
00733     }
00734     queryGenes.resize(queryGenes.size());
00735     query.InitializeQuery(queryGenes, m_iGenes);
00736 
00737     return true;
00738 }
00739 
00740 bool CSeekCentral::PrepareOneQuery(CSeekQuery &query,
00741     CSeekIntIntMap &dMap, vector<float> &weight){
00742 
00743     assert(m_master_rank_threads==NULL && m_counts_threads==NULL &&
00744         m_sum_weight_threads==NULL && m_sum_sq_weight_threads==NULL);
00745     assert(m_rank_normal_threads==NULL && m_rank_threads==NULL);
00746     assert(m_rData==NULL);
00747 
00748     utype j;
00749     const vector<utype> &queryGenes = query.GetQuery();
00750     const vector<utype> &allRDatasets = dMap.GetAllReverse();
00751     utype iSearchDatasets = dMap.GetNumSet();
00752     utype iQuery = queryGenes.size();
00753 
00754     for(j=0; j<iSearchDatasets; j++)
00755         m_vc[allRDatasets[j]]->InitializeQuery(queryGenes);
00756 
00757     m_rData = new utype**[m_numThreads];
00758     for(j=0; j<m_numThreads; j++)
00759         m_rData[j] = CSeekTools::Init2DArray(m_iGenes, iQuery, (utype)0);
00760     
00761     m_master_rank_threads =
00762         CSeekTools::Init2DArray(m_numThreads, m_iGenes, (float)0);
00763     m_sum_weight_threads =
00764         CSeekTools::Init2DArray(m_numThreads, m_iGenes, (float)0);
00765     m_sum_sq_weight_threads =
00766         CSeekTools::Init2DArray(m_numThreads, m_iGenes, (float)0);
00767     m_counts_threads =
00768         CSeekTools::Init2DArray(m_numThreads, m_iGenes, (utype)0);
00769     
00770     m_rank_normal_threads = new vector<utype>[m_numThreads];
00771     m_rank_threads = new vector<utype>[m_numThreads];
00772 
00773     for(j=0; j<m_numThreads; j++){
00774         m_rank_normal_threads[j].resize(m_iGenes);
00775         m_rank_threads[j].resize(m_iGenes);
00776         //CSeekTools::InitVector(m_rank_normal_threads[j], m_iGenes, (utype) 255);
00777         //CSeekTools::InitVector(m_rank_threads[j], m_iGenes, (utype) 255);
00778     }
00779     
00780     CSeekTools::InitVector(m_master_rank, m_iGenes, (float) 0);
00781     CSeekTools::InitVector(m_sum_weight, m_iGenes, (float) 0);
00782     CSeekTools::InitVector(m_sum_sq_weight, m_iGenes, (float) 0);
00783     CSeekTools::InitVector(m_counts, m_iGenes, (utype) 0);
00784     CSeekTools::InitVector(weight, m_iDatasets, (float)0);
00785     
00786     return true;
00787 }
00788 
00789 bool CSeekCentral::AggregateThreads(){
00790     assert(m_master_rank_threads!=NULL && m_counts_threads!=NULL &&
00791         m_sum_sq_weight_threads!=NULL && m_sum_weight_threads!=NULL);
00792     assert(m_rank_normal_threads!=NULL && m_rank_threads!=NULL);
00793 
00794     //Aggregate into three vectors: m_master_rank, m_counts, m_sum_weight, m_sum_sq_weight
00795     utype j, k;
00796     for(j=0; j<m_numThreads; j++){
00797         for(k=0; k<m_iGenes; k++){
00798             m_master_rank[k] += m_master_rank_threads[j][k];
00799             m_counts[k] += m_counts_threads[j][k];
00800             m_sum_weight[k]+=m_sum_weight_threads[j][k];
00801             m_sum_sq_weight[k]+=m_sum_sq_weight_threads[j][k];
00802         }
00803     }
00804 
00805     CSeekTools::Free2DArray(m_master_rank_threads);
00806     CSeekTools::Free2DArray(m_counts_threads);
00807     CSeekTools::Free2DArray(m_sum_weight_threads);
00808     CSeekTools::Free2DArray(m_sum_sq_weight_threads);
00809     m_master_rank_threads=NULL;
00810     m_counts_threads=NULL;
00811     m_sum_weight_threads = NULL;
00812     m_sum_sq_weight_threads = NULL;
00813 
00814     for(j=0; j<m_numThreads; j++){
00815         m_rank_normal_threads[j].clear();
00816         m_rank_threads[j].clear();
00817     }
00818 
00819     delete[] m_rank_normal_threads;
00820     delete[] m_rank_threads;
00821     m_rank_normal_threads = NULL;
00822     m_rank_threads = NULL;
00823 
00824     return true;
00825 }
00826 
00827 bool CSeekCentral::FilterResults(const utype &iSearchDatasets){
00828     utype j, k;
00829     bool DEBUG = false;
00830     if(DEBUG) fprintf(stderr, "Aggregating genes\n");
00831 
00832     for(j=0; j<m_iGenes; j++){
00833         if(m_counts[j]<(int)(0.5*iSearchDatasets))
00834             m_master_rank[j] = -320;
00835         else if(m_sum_weight[j]==0)
00836             m_master_rank[j] = -320;
00837         else{
00838             m_master_rank[j] =
00839                 //(m_master_rank[j] / m_sum_weight[j] - 320) / 100.0;
00840                 (m_master_rank[j] - 320 * m_sum_weight[j]) / 100.0 / m_sum_weight[j];
00841                 //(m_master_rank[j] - 320 * m_sum_weight[j]) / 100.0 / sqrt(m_sum_sq_weight[j]);
00842             if(m_eDistMeasure==CSeekDataset::CORRELATION){
00843                 m_master_rank[j] = m_master_rank[j] / 3.0;
00844             }
00845             //m_master_rank[j] = m_master_rank[j];
00846             //m_master_rank[j] = (m_master_rank[j] / iSearchDatasets - 320) / 100.0;
00847         }
00848         if(DEBUG) fprintf(stderr, "Gene %d %.5f\n", j, m_master_rank[j]);
00849     }
00850     return true;
00851 }
00852 
00853 bool CSeekCentral::Sort(vector<AResultFloat> &final){
00854     if(DEBUG) fprintf(stderr, "Sorting genes\n");
00855     final.resize(m_iGenes);
00856     utype j;
00857     for(j=0; j<m_iGenes; j++){
00858         //fprintf(stderr, "%d %s\n", j, DB.GetGene((size_t) j).c_str());
00859         final[j].i = j;
00860         final[j].f = m_master_rank[j];
00861     }
00862     if(DEBUG) fprintf(stderr, "Begin Sorting genes\n");
00863     sort(final.begin(), final.end());
00864     return true;
00865 }
00866 
00867 bool CSeekCentral::Display(CSeekQuery &query, vector<AResultFloat> &final){
00868     if(DEBUG) fprintf(stderr, "Results:\n");
00869     utype jj, ii;
00870     const vector<char> &cQuery = query.GetQueryPresence();
00871     for(ii=0, jj=0; jj<500; ii++){
00872         if(cQuery[final[ii].i]==1) continue;
00873         //fprintf(stderr, "%s %.5f\n",
00874         //  m_DB->GetGene((size_t)final[ii].i).c_str(), final[ii].f);
00875         fprintf(stderr, "%s %.5f\n",
00876             m_vecstrGenes[(size_t)final[ii].i].c_str(), final[ii].f);
00877         jj++;
00878     }
00879     return true;
00880 }
00881 
00882 bool CSeekCentral::Write(const utype &i){
00883     //assume m_bRandom = false
00884     char acBuffer[1024];
00885     sprintf(acBuffer, "%s/%d.query", m_output_dir.c_str(), i);
00886     CSeekTools::WriteArrayText(acBuffer, m_vecstrAllQuery[i]);
00887 
00888     sprintf(acBuffer, "%s/%d.dweight", m_output_dir.c_str(), i);
00889     CSeekTools::WriteArray(acBuffer, m_weight[i]);
00890     
00891     sprintf(acBuffer, "%s/%d.gscore", m_output_dir.c_str(), i);
00892     CSeekTools::WriteArray(acBuffer, m_master_rank);
00893 
00894     //send data to client
00895     if(m_bEnableNetwork){
00896         if(CSeekNetwork::Send(m_iClient, m_weight[i])==-1){
00897             fprintf(stderr, "Error sending message to client\n");
00898             return false;
00899         }
00900         if(CSeekNetwork::Send(m_iClient, m_master_rank)==-1){
00901             fprintf(stderr, "Error sending message to client\n");
00902             return false;
00903         }
00904     }
00905 
00906     if(!m_bRandom && m_bOutputText){
00907         const vector<utype> &allRDatasets =
00908             m_searchdsetMap[i]->GetAllReverse();
00909         utype iSearchDatasets = m_searchdsetMap[i]->GetNumSet();
00910         vector<vector<string> > vecOutput;
00911         vecOutput.resize(2);
00912         vecOutput[0] = vector<string>();
00913         vecOutput[1] = vector<string>();
00914         sprintf(acBuffer, "%s/%d.results.txt", m_output_dir.c_str(), i);
00915         vector<AResultFloat> w;
00916         w.resize(m_iDatasets);
00917         utype j;
00918         for(j=0; j<m_iDatasets; j++){
00919             w[j].i = j;
00920             w[j].f = m_weight[i][j];
00921         }
00922         sort(w.begin(), w.end());
00923         for(j=0; j<200 && j<iSearchDatasets; j++){
00924             if(w[j].f==0) break;
00925             vecOutput[0].push_back(m_vecstrDatasets[w[j].i]);
00926         }
00927         vector<AResultFloat> wd;
00928         Sort(wd);
00929         for(j=0; j<2000; j++){
00930             if(wd[j].f==-320) break;
00931             vecOutput[1].push_back(m_vecstrGenes[wd[j].i]);
00932         }
00933         CSeekTools::Write2DArrayText(acBuffer, vecOutput);
00934     }
00935     return true;
00936 }
00937 
00938 int CSeekCentral::GetMaxGenomeCoverage(){
00939     utype d;
00940     int max = 0;
00941     for(d=0; d<m_vecstrDatasets.size(); d++){
00942         CSeekIntIntMap *mapG = m_vc[d]->GetGeneMap();
00943         if(mapG->GetNumSet()>max){
00944             max = mapG->GetNumSet();
00945         }
00946     }
00947     return max;
00948 }
00949 
00950 
00951 bool CSeekCentral::Common(CSeekCentral::SearchMode &sm,
00952     gsl_rng *rnd, const CSeekQuery::PartitionMode *PART_M,
00953     const utype *FOLD, const float *RATE,
00954     const vector< vector<float> > *providedWeight,
00955     const vector< vector<string> > *newGoldStd){
00956 
00957     utype i, j, d, dd;
00958     int k; //keeps track of genes (for random case)
00959     utype l; //keeps track of random repetition (for random case)
00960     char acBuffer[1024];
00961     
00962     m_Query.resize(m_vecstrAllQuery.size());
00963     m_weight.resize(m_vecstrAllQuery.size());
00964     m_final.resize(m_vecstrAllQuery.size());
00965         
00966     //random-ranking case =========================
00967     vector<vector<float> > vecRandWeight, vecRandScore;
00968     vecRandWeight.resize(m_iNumRandom);
00969     vecRandScore.resize(m_iNumRandom);
00970     for(l=0; l<m_iNumRandom; l++){
00971         CSeekTools::InitVector(vecRandWeight[l], m_iDatasets, (float) 0);
00972         CSeekTools::InitVector(vecRandScore[l], m_iGenes, (float) 0);
00973     }
00974 
00975     //NEED TO MAKE THIS A PARAMETER
00976     bool simulateWeight = m_bSimulateWeight;
00977 
00978     //output weight component (Mar 19)
00979     bool weightComponent = m_bOutputWeightComponent;
00980 
00981     l = 0;
00982     //oct 20, 2012: whether to redo current query with equal weighting
00983     int redoWithEqual = 0; //tri-mode: 0, 1, 2
00984     CSeekCentral::SearchMode current_sm;
00985     CSeekQuery equalWeightGold;
00986 
00987     //backup of scores (Feb 3)
00988     vector<float> backupScore;
00989     CSeekTools::InitVector(backupScore, m_iGenes, (float) 0);
00990 
00991     //fprintf(stderr, "0 %lu\n", CMeta::GetMemoryUsage());
00992     current_sm = sm;
00993 
00994     int maxGCoverage = GetMaxGenomeCoverage();
00995 
00996     //fprintf(stderr, "Min gene required %.2f %d %d\n", m_fPercentGenomeRequired,
00997     //  maxGCoverage, (int)(m_fPercentGenomeRequired*(float) maxGCoverage));
00998 
00999     for(i=0; i<m_vecstrAllQuery.size(); i++){
01000         //simulated weight case ======================
01001         /*if(simulateWeight && redoWithEqual>=1) //1 or 2 
01002             current_sm = EQUAL;
01003         else //0
01004             current_sm = sm;*/
01005         //============================================
01006 
01007         if(m_mapLoadTime.find(i)!=m_mapLoadTime.end()){
01008             if(!m_bRandom || l==0){ //l==0: first random repetition
01009                 CSeekTools::ReadDatabaselets(m_vecDB, m_iGenes, m_iDatasets,
01010                     m_mapLoadTime[i], m_vc, m_mapstrintGene, m_vecDBDataset,
01011                     m_mapstrintDataset, m_iClient, m_bEnableNetwork);
01012             }
01013         }
01014 
01015         const vector<utype> &allRDatasets =
01016             m_searchdsetMap[i]->GetAllReverse();
01017         utype iSearchDatasets = m_searchdsetMap[i]->GetNumSet();
01018 
01019         //fprintf(stderr, "1 %lu\n", CMeta::GetMemoryUsage());
01020 
01021         CSeekQuery &query = m_Query[i];
01022         vector<float> &weight = m_weight[i];
01023         vector<AResultFloat> &final = m_final[i];
01024 
01025         PrepareQuery(m_vecstrAllQuery[i], query);
01026         PrepareOneQuery(query, *(m_searchdsetMap[i]), weight);
01027         utype iQuery = query.GetQuery().size();
01028 
01029         //fprintf(stderr, "1b %lu\n", CMeta::GetMemoryUsage());
01030         
01031         //for CV_CUSTOM
01032         CSeekQuery customGoldStd;
01033         if(current_sm==CV_CUSTOM)
01034             PrepareQuery((*newGoldStd)[i], customGoldStd);
01035 
01036         if(current_sm==CV || current_sm==CV_CUSTOM)
01037             query.CreateCVPartitions(rnd, *PART_M, *FOLD);
01038 
01039         if(current_sm==ORDER_STATISTICS)
01040             m_rank_d = CSeekTools::Init2DArray(iSearchDatasets, m_iGenes,
01041                 (utype) 0);
01042 
01043         //For outputing component weights!
01044         vector<float> wc;
01045         if(weightComponent){
01046             if(current_sm==CV || current_sm==CV_CUSTOM){
01047                 wc.resize((int)query.GetNumFold()*(int)m_iDatasets);
01048             }else{
01049                 wc.resize((int)query.GetQuery().size()*(int)m_iDatasets);
01050             }
01051             fill(wc.begin(), wc.end(), (float)0);
01052         }
01053         //fprintf(stderr, "2 %lu\n", CMeta::GetMemoryUsage());
01054 
01055         #pragma omp parallel for \
01056         shared(customGoldStd) \
01057         private(dd, d, j) \
01058         firstprivate(i, iSearchDatasets, iQuery) \
01059         schedule(dynamic)
01060         for(dd=0; dd<iSearchDatasets; dd++){
01061             d = allRDatasets[dd];
01062             utype tid = omp_get_thread_num();
01063             if(DEBUG) fprintf(stderr, "Dataset %d, %s\n",
01064                 d, m_vecstrDatasets[d].c_str());
01065 
01066             CSeekIntIntMap *mapG = m_vc[d]->GetGeneMap();
01067             CSeekIntIntMap *mapQ = m_vc[d]->GetQueryMap();
01068 
01069             //if dataset contains less than required number of genes, skip
01070             if(mapG->GetNumSet()<(int)(m_fPercentGenomeRequired*
01071             (float) maxGCoverage)){ //10000
01072                 continue;
01073             }
01074 
01075             if(mapQ==NULL ||mapQ->GetNumSet()==0){
01076                 if(DEBUG) fprintf(stderr, "This dataset is skipped\n");
01077                 continue;
01078             }
01079 
01080             vector<utype> this_q;
01081             for(j=0; j<mapQ->GetNumSet(); j++)
01082                 this_q.push_back(mapQ->GetReverse(j));
01083 
01084             if(DEBUG) fprintf(stderr, "Initializing %d\n",
01085                 (int) this_q.size());
01086             m_vc[d]->InitializeDataMatrix(m_rData[tid], m_quant, m_iGenes,
01087                 iQuery, m_bSubtractGeneAvg, m_bNormPlatform, m_bLogit,
01088                 m_eDistMeasure, m_fScoreCutOff, m_bRandom, m_randRandom);
01089 
01090             float w = -1;
01091             float report_w = -1; //for showing weight of dataset
01092 
01093             if(current_sm==CV || current_sm==CV_CUSTOM){
01094                 if(DEBUG) fprintf(stderr, "Weighting dataset\n");
01095                 if(current_sm==CV)
01096                     CSeekWeighter::CVWeighting(query, *m_vc[d], *RATE,
01097                         m_fPercentQueryAfterScoreCutOff, m_bSquareZ,
01098                         &m_rank_threads[tid]);
01099                 else
01100                     CSeekWeighter::CVWeighting(query, *m_vc[d], *RATE,
01101                         m_fPercentQueryAfterScoreCutOff, m_bSquareZ,
01102                         &m_rank_threads[tid], &customGoldStd);
01103 
01104                 if( (w = m_vc[d]->GetDatasetSumWeight())==-1){
01105                     if(DEBUG) fprintf(stderr, "Bad weight\n");
01106                     continue;
01107                 }
01108 
01109                 if(weightComponent && current_sm==CV){
01110                     utype numFold = query.GetNumFold();
01111                     float ww;
01112                     for(j=0; j<numFold; j++){
01113                         //if((ww=m_vc[d]->GetCVWeight(j))==-1) continue;
01114                         wc[(int)d*(int)numFold+(int)j] = m_vc[d]->GetCVWeight(j);
01115                     }
01116                 }
01117             }
01118             else if(current_sm==AVERAGE_Z){
01119                 CSeekWeighter::AverageWeighting(query, *m_vc[d],
01120                     m_fPercentQueryAfterScoreCutOff, m_bSquareZ, w);
01121                 if(w==-1) continue;
01122             }
01123             else if(current_sm==EQUAL && redoWithEqual==0){
01124                 w = 1.0;
01125             }
01126             else if(current_sm==USE_WEIGHT){
01127                 w = (*providedWeight)[i][d];
01128             }
01129             //simulated weight case ======================
01130             else if(current_sm==EQUAL && redoWithEqual==1){
01131                 w = 1.0;
01132                 if(DEBUG) fprintf(stderr, "Before doing one gene weighting\n");
01133                 //calculate reported weight here!
01134                 CSeekWeighter::OneGeneWeighting(query, *m_vc[d], 0.95,
01135                     m_fPercentQueryAfterScoreCutOff, m_bSquareZ,
01136                     &m_rank_threads[tid], &equalWeightGold);
01137                 report_w = m_vc[d]->GetDatasetSumWeight();
01138             }
01139             //============================================
01140 
01141             if(DEBUG) fprintf(stderr, "Doing linear combination\n");
01142 
01143             const utype MIN_REQUIRED = max((utype) 1, (utype) (
01144                 m_fPercentQueryAfterScoreCutOff * this_q.size()));
01145             CSeekWeighter::LinearCombine(m_rank_normal_threads[tid], this_q,
01146                 *m_vc[d], MIN_REQUIRED, m_bSquareZ);
01147 
01148             if(DEBUG) fprintf(stderr,
01149                 "Adding contribution of dataset %d to master ranking: %.5f\n", d, w);
01150 
01151             utype iGeneSet = mapG->GetNumSet();
01152             const vector<utype> &allRGenes = mapG->GetAllReverse();
01153             vector<utype>::const_iterator iterR = allRGenes.begin();
01154             vector<utype>::const_iterator endR = allRGenes.begin() + iGeneSet;
01155             vector<utype> &Rank_Normal = m_rank_normal_threads[tid];
01156             float* Master_Rank = &m_master_rank_threads[tid][0];
01157             float* Sum_Weight = &m_sum_weight_threads[tid][0];
01158             float* Sum_Sq_Weight = &m_sum_sq_weight_threads[tid][0];
01159             utype* Counts = &m_counts_threads[tid][0];
01160 
01161             if(current_sm==ORDER_STATISTICS)
01162                 for(; iterR!=endR; iterR++){
01163                     //if(Rank_Normal[*iterR]==0) continue;
01164                     m_rank_d[dd][*iterR] = Rank_Normal[*iterR];
01165                     Counts[*iterR]++;
01166                 }
01167             else
01168                 for(; iterR!=endR; iterR++){
01169                     //if(Rank_Normal[*iterR]==0) continue;
01170                     Master_Rank[*iterR] += (float) Rank_Normal[*iterR] * w;
01171                     Sum_Weight[*iterR] += w;
01172                     Sum_Sq_Weight[*iterR] += w * w;
01173                     Counts[*iterR]++;
01174                 }
01175 
01176             //simulated weight case ======================
01177             if(current_sm==EQUAL && redoWithEqual==1){
01178                 weight[d] = report_w;
01179             }
01180             //============================================
01181             else if((current_sm==EQUAL || current_sm==ORDER_STATISTICS) 
01182             && redoWithEqual==0){
01183                 weight[d] = 0;
01184             }
01185             else{
01186                 weight[d] = w;
01187             }
01188 
01189         }
01190         //omp finishes
01191         //fprintf(stderr, "3 %lu\n", CMeta::GetMemoryUsage());
01192         for(j=0; j<iSearchDatasets; j++)
01193             m_vc[allRDatasets[j]]->DeleteQuery();
01194 
01195         assert(m_rData!=NULL);
01196         for(j=0; j<m_numThreads; j++)
01197             CSeekTools::Free2DArray(m_rData[j]);
01198         delete[] m_rData;
01199         m_rData = NULL;
01200 
01201         AggregateThreads();
01202 
01203         if(current_sm!=ORDER_STATISTICS){
01204             FilterResults(iSearchDatasets);
01205         }else{
01206             CSeekWeighter::OrderStatisticsRankAggregation(iSearchDatasets,
01207                 m_iGenes, m_rank_d, m_counts, m_master_rank, m_numThreads);
01208             CSeekTools::Free2DArray(m_rank_d);
01209             m_rank_d = NULL;
01210         }
01211 
01212         //Display(query, final);
01213         //fprintf(stderr, "4 %lu\n", CMeta::GetMemoryUsage());
01214         SetQueryScoreNull(query);
01215         Sort(final);
01216         int ret; //for system calls
01217 
01218         if(m_bRandom){
01219             /*utype z, cz;
01220             for(cz=0, z=0; z<m_iDatasets; z++)
01221                 if(m_weight[i][z]!=0) 
01222                     cz++;
01223             fprintf(stderr, "Number of weighted dataset: %d\n", cz);
01224             */
01225         }
01226         else if(simulateWeight){
01227             if((current_sm==EQUAL || current_sm==ORDER_STATISTICS) && !CheckWeight(i)){
01228                 fprintf(stderr, "Calculate dataset ordering\n"); 
01229                 ret = system("date +%s%N 1>&2");
01230                 if(m_bEnableNetwork && CSeekNetwork::Send(m_iClient, 
01231                     "Calculate dataset ordering")==-1){
01232                     fprintf(stderr, "Error sending message to client\n");
01233                 }
01234                 CopyTopGenes(equalWeightGold, final, 100);
01235                 redoWithEqual = 1;
01236                 if(current_sm==ORDER_STATISTICS){
01237                     current_sm = EQUAL;
01238                     //backup genes to old
01239                     copy(m_master_rank.begin(), m_master_rank.end(), backupScore.begin());  
01240                 }
01241                 i--;
01242                 continue;
01243             }
01244             else if(current_sm==CV && !CheckWeight(i)){
01245                 fprintf(stderr, "Redo with equal weighting\n"); 
01246                 ret = system("date +%s%N 1>&2");
01247                 if(m_bEnableNetwork && CSeekNetwork::Send(m_iClient, 
01248                     "Redo with equal weighting")==-1){
01249                     fprintf(stderr, "Error sending message to client\n");
01250                 }
01251                 current_sm = EQUAL;
01252                 i--;
01253                 continue;
01254             }
01255             else if(current_sm==EQUAL && CheckWeight(i)){
01256                 redoWithEqual = 0;
01257                 current_sm = sm;
01258                 //copy genes from old to new
01259                 if(sm==ORDER_STATISTICS){
01260                     copy(backupScore.begin(), backupScore.end(), m_master_rank.begin());    
01261                 }
01262             }
01263         }
01264 
01265         fprintf(stderr, "Done search\n"); 
01266         ret = system("date +%s%N 1>&2");
01267 
01268         if(m_bEnableNetwork && CSeekNetwork::Send(m_iClient, "Done Search")==-1){
01269             fprintf(stderr, "Error sending message to client\n");
01270         }
01271     
01272         //random-ranking case =========================
01273         if(m_bRandom){
01274             sort(m_master_rank.begin(), m_master_rank.end(), greater<float>());
01275             sort(weight.begin(), weight.end(), greater<float>());
01276             copy(m_master_rank.begin(), m_master_rank.end(), vecRandScore[l].begin());
01277             copy(weight.begin(), weight.end(), vecRandWeight[l].begin());
01278             l++;
01279 
01280             if(l==m_iNumRandom){ //last repetition
01281                 float confidence = 0.50;
01282                 int n = (int) (confidence * (float) m_iNumRandom); 
01283                 vector<float> new_weight, new_score;
01284                 CSeekTools::InitVector(new_weight, m_iDatasets, (float) 0);
01285                 CSeekTools::InitVector(new_score, m_iGenes, (float) 0);
01286                 for(k=0; k<m_iGenes; k++){
01287                     vector<float> all_s;
01288                     all_s.resize(m_iNumRandom);
01289                     for(l=0; l<m_iNumRandom; l++)
01290                         all_s[l] = vecRandScore[l][k];
01291                     std::nth_element(all_s.begin(), all_s.begin()+n, all_s.end());
01292                     new_score[k] = all_s[n];
01293                 }
01294                 /*for(k=0; k<m_iGenes; k++){
01295                     fprintf(stderr, "%d %.5f\n", k, new_score[k]);
01296                 }
01297                 getchar();*/
01298                 for(k=0; k<m_iDatasets; k++){
01299                     vector<float> all_w;
01300                     all_w.resize(m_iNumRandom);
01301                     for(l=0; l<m_iNumRandom; l++)
01302                         all_w[l] = vecRandWeight[l][k]; 
01303                     std::nth_element(all_w.begin(), all_w.begin()+n, all_w.end());
01304                     new_weight[k] = all_w[n];
01305                 }
01306                 sprintf(acBuffer, "%s/%d.rand.dweight", m_output_dir.c_str(), i);
01307                 CSeekTools::WriteArray(acBuffer, new_weight);
01308                 sprintf(acBuffer, "%s/%d.rand.gscore", m_output_dir.c_str(), i);
01309                 CSeekTools::WriteArray(acBuffer, new_score);                
01310                 l = 0;
01311             }else{
01312                 i--;
01313             }
01314         }
01315 
01316         if(!m_bRandom){
01317             //if m_bRandom, write at the very end when all repetitions are done
01318             Write(i);
01319             if(weightComponent){
01320                 sprintf(acBuffer, "%s/%d.dweight_comp", m_output_dir.c_str(), i);
01321                 CSeekTools::WriteArray(acBuffer, wc);
01322                 vector<vector<string> > vecParts;
01323                 vecParts.resize(query.GetNumFold());
01324                 utype kk;
01325                 string strParts = "";
01326                 for(j=0; j<query.GetNumFold(); j++){
01327                     vecParts[j] = vector<string>();
01328                     const vector<utype> &vu = query.GetCVQuery(j);
01329                     string s = "";
01330                     for(kk=0; kk<vu.size(); kk++){
01331                         vecParts[j].push_back(m_vecstrGenes[vu[kk]]);
01332                         s+=m_vecstrGenes[vu[kk]];
01333                         if(kk!=vu.size()-1)
01334                             s+=" ";
01335                     }
01336                     strParts+=s;
01337                     if(j!=query.GetNumFold()-1)
01338                         strParts+="|";
01339                 }
01340                 sprintf(acBuffer, "%s/%d.query_cvpart", m_output_dir.c_str(), i);
01341                 CSeekTools::Write2DArrayText(acBuffer, vecParts);
01342                 //send data to client
01343                 if(m_bEnableNetwork){
01344                     if(CSeekNetwork::Send(m_iClient, wc)==-1){
01345                         fprintf(stderr, "Error sending message to client\n");
01346                         return false;
01347                     }
01348                     if(CSeekNetwork::Send(m_iClient, strParts)==-1){
01349                         fprintf(stderr, "Error sending message to client\n");
01350                         return false;
01351                     }
01352                 }
01353             }
01354         }
01355     }
01356 
01357     //fprintf(stderr, "4b %lu\n", CMeta::GetMemoryUsage());
01358 
01359     return true;
01360 }
01361 
01362 bool CSeekCentral::CheckWeight(const utype &i){
01363     utype j = 0;
01364     bool valid = false;
01365     for(j=0; j<m_iDatasets; j++){
01366         if(m_weight[i][j]!=0){
01367             valid = true;
01368             break;
01369         }
01370     }
01371     return valid;
01372 }
01373 
01374 bool CSeekCentral::CopyTopGenes(CSeekQuery &csq, 
01375     const vector<AResultFloat> &src, const utype top){
01376     utype i, j;
01377     vector<utype> topGenes;
01378     for(i=0; i<top; i++){
01379         if(src[i].f==-320) continue;
01380         topGenes.push_back(src[i].i);
01381     }
01382     if(topGenes.size()==0){
01383         fprintf(stderr, "Error in CopyTopGenes!\n");
01384         return false;
01385     }
01386     csq.InitializeQuery(topGenes, m_iGenes);
01387     return true;
01388 }
01389 
01390 bool CSeekCentral::SetQueryScoreNull(const CSeekQuery &csq){
01391     utype j;
01392     const vector<utype> &query = csq.GetQuery();
01393     for(j=0; j<query.size(); j++){
01394         m_master_rank[query[j]] = -320;
01395     }
01396     return true;    
01397 }
01398 
01399 bool CSeekCentral::EqualWeightSearch(){
01400     CSeekCentral::SearchMode sm = EQUAL;
01401     return CSeekCentral::Common(sm);
01402 }
01403 
01404 bool CSeekCentral::CVSearch(gsl_rng *rnd, const CSeekQuery::PartitionMode &PART_M,
01405     const utype &FOLD, const float &RATE){
01406     CSeekCentral::SearchMode sm = CV;
01407     return CSeekCentral::Common(sm, rnd, &PART_M, &FOLD, &RATE);
01408 }
01409 
01410 /*  perform CVSearch, except that the weighting is based not on co-expression
01411     of query genes, but based on similarity of query genes to some custom gold
01412     standard gene-set */
01413 bool CSeekCentral::CVCustomSearch(const vector< vector<string> > &newGoldStd,
01414     gsl_rng *rnd, const CSeekQuery::PartitionMode &PART_M,
01415     const utype &FOLD, const float &RATE){
01416     CSeekCentral::SearchMode sm = CV_CUSTOM;
01417     return CSeekCentral::Common(sm, rnd, &PART_M, &FOLD, &RATE,
01418         NULL, &newGoldStd);
01419 }
01420 
01421 bool CSeekCentral::WeightSearch(const vector< vector<float> > &weights){
01422     CSeekCentral::SearchMode sm = USE_WEIGHT;
01423     return CSeekCentral::Common(sm, NULL, NULL, NULL, NULL, &weights);
01424 }
01425 
01426 bool CSeekCentral::OrderStatistics(){
01427     CSeekCentral::SearchMode sm = ORDER_STATISTICS;
01428     return CSeekCentral::Common(sm);
01429 }
01430 
01431 bool CSeekCentral::AverageWeightSearch(){
01432     CSeekCentral::SearchMode sm = AVERAGE_Z;
01433     return CSeekCentral::Common(sm, NULL, NULL, NULL, NULL);
01434 }
01435 
01436 bool CSeekCentral::VarianceWeightSearch(){
01437     vector<vector<float> > weights;
01438     weights.resize(m_vecstrAllQuery.size());
01439     utype i, j, k;
01440     for(i=0; i<m_vecstrAllQuery.size(); i++){
01441         weights[i] = vector<float>();
01442         CSeekTools::InitVector(weights[i], m_iDatasets, (float)0);
01443         const vector<utype> &allRDatasets =
01444             m_searchdsetMap[i]->GetAllReverse();
01445         utype iSearchDatasets = m_searchdsetMap[i]->GetNumSet();
01446 
01447         CSeekQuery q;
01448         PrepareQuery(m_vecstrAllQuery[i], q);
01449         const vector<utype> &qGenes = q.GetQuery();
01450 
01451         for(j=0; j<iSearchDatasets; j++){
01452             utype d = allRDatasets[j];
01453             for(k=0; k<qGenes.size(); k++){
01454                 float gv = m_vc[d]->GetGeneVariance(qGenes[k]);
01455                 if(CMeta::IsNaN(gv)) continue;
01456                 weights[i][d] += gv;
01457             }
01458         }
01459     }
01460     return WeightSearch(weights);
01461 }
01462 
01463 bool CSeekCentral::Destruct(){
01464     utype j;
01465     //for(j=0; j<m_iDatasets; j++) m_vc[j]->DeleteQueryBlock();
01466     for(j=0; j<m_iDatasets; j++){
01467         if(m_vc[j]!=NULL) continue;
01468         delete m_vc[j];
01469         m_vc[j] = NULL;
01470     }
01471     return true;
01472 }
01473 
01474 const vector< vector<AResultFloat> >& CSeekCentral::GetAllResult() const{
01475     return m_final;
01476 }
01477 
01478 const vector<CSeekQuery>& CSeekCentral::GetAllQuery() const{
01479     return m_Query;
01480 }
01481 
01482 utype CSeekCentral::GetGene(const string &strGene) const{
01483     if(m_mapstrintGene.find(strGene)==m_mapstrintGene.end())
01484         return CSeekTools::GetNaN();
01485     return m_mapstrintGene.find(strGene)->second;
01486 }
01487 string CSeekCentral::GetGene(const utype &geneID) const{
01488     return m_vecstrGenes[(size_t) geneID];
01489 }
01490 
01491 const vector<vector<float> >& CSeekCentral::GetAllWeight() const{
01492     return m_weight;
01493 }
01494 }
01495