Sleipnir
tools/SeekMiner/SeekMiner.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 
00026 int main( int iArgs, char** aszArgs ) {
00027     static const size_t c_iBuffer   = 1024;
00028 #ifdef WIN32
00029     pthread_win32_process_attach_np( );
00030 #endif // WIN32
00031     gengetopt_args_info sArgs;
00032     const int lineSize = 1024;
00033 
00034     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00035         cmdline_parser_print_help( );
00036         return 1;
00037     }
00038 
00039     string method = sArgs.weighting_method_arg;
00040     string cv = sArgs.CV_partition_arg;
00041     int cv_fold = sArgs.CV_fold_arg;
00042     float rbp_p = sArgs.CV_rbp_p_arg;
00043     string dist_measure = sArgs.dist_measure_arg;
00044 
00045     if(dist_measure=="pearson"){
00046         string sinfo_dir = sArgs.dir_sinfo_arg;
00047         if(sinfo_dir=="NA"){
00048             fprintf(stderr, "Pearson selected. Please give the sinfo directory (-u).\n");
00049             return 1;
00050         }
00051         if(!!sArgs.norm_subavg_flag){
00052             fprintf(stderr, "Warning: -m flag is ignored due to --dist_measure pearson.\n");
00053         }
00054         if(!!sArgs.norm_subavg_plat_flag){
00055             fprintf(stderr, "Warning: -M flag is ignored due to --dist_measure pearson.\n");
00056         }
00057         
00058     }else if(dist_measure=="z_score"){
00059         if(!!sArgs.norm_subavg_plat_flag && !(!!sArgs.norm_subavg_flag)){
00060             fprintf(stderr, "Please enable -m flag. --norm_subavg_plat requires -m flag.\n");
00061             return 1;
00062         }
00063     }
00064 
00065     if(!sArgs.input_arg || !sArgs.quant_arg ||
00066         !sArgs.dset_arg ||
00067         !sArgs.query_arg || !sArgs.dir_platform_arg ||
00068         !sArgs.dir_in_arg || !sArgs.dir_prep_in_arg){
00069         fprintf(stderr, "Arguments missing!\n");
00070         return 1;
00071     }
00072 
00073     bool useNibble = false;
00074     if(sArgs.is_nibble_flag==1){
00075         fprintf(stderr, "Nibble integration is not supported! Please use a non-nibble CDatabase.\n");
00076         useNibble = true;
00077         return 1;
00078     }
00079 
00080     bool bOutputWeightComponent = !!sArgs.output_w_comp_flag;
00081     bool bSimulateWeight = !!sArgs.simulate_w_flag;
00082 
00083     // Random Number Generator Initializations
00084     gsl_rng_env_setup();
00085 
00086     const gsl_rng_type *T;
00087     T = gsl_rng_default;
00088 
00089     gsl_rng *rnd = gsl_rng_alloc(T);
00090 
00091     const gsl_rng_type *T2;
00092     T2 = gsl_rng_default;
00093 
00094     gsl_rng *random_ranking_rnd = gsl_rng_alloc(T2);
00095 
00096     float RATE = rbp_p;
00097     utype FOLD = (utype) cv_fold;
00098     enum CSeekQuery::PartitionMode PART_M;
00099     if(cv=="LOI"){
00100         PART_M = CSeekQuery::LEAVE_ONE_IN;
00101     }else if(cv=="LOO"){
00102         PART_M = CSeekQuery::LEAVE_ONE_OUT;
00103     }else if(cv=="XFOLD"){
00104         PART_M = CSeekQuery::CUSTOM_PARTITION;
00105     }
00106 
00107     utype i,j;
00108     //utype TOP = 1000;
00109     //utype TOP = 0;
00110 
00111 /*
00112     CSeekCentral *func = new CSeekCentral();
00113     if(!func->Initialize(sArgs.input_arg, sArgs.func_quant_arg,
00114         sArgs.func_dset_arg, sArgs.func_dset_arg, sArgs.query_arg,
00115         sArgs.func_prep_arg, sArgs.func_db_arg, sArgs.func_prep_arg,
00116         useNibble, sArgs.func_n_arg, sArgs.buffer_arg,
00117         "fn", false, false, false, false,
00118         sArgs.score_cutoff_arg, sArgs.per_q_required_arg)){
00119         return -1;
00120     }
00121 
00122     func->EqualWeightSearch();
00123     const vector< vector<AResultFloat> > &vfunc = func->GetAllResult();
00124     const vector<CSeekQuery> &vq = func->GetAllQuery();
00125 
00126     vector<vector<string> > origQuery;
00127     vector<vector<string> > newQuery;
00128     newQuery.resize(vfunc.size());
00129     origQuery.resize(vfunc.size());
00130 
00131     for(i=0; i<vfunc.size(); i++){
00132         newQuery[i] = vector<string>();
00133         origQuery[i] = vector<string>();
00134         const vector<utype> &queryGenes = vq[i].GetQuery();
00135         for(j=0; j<queryGenes.size(); j++){
00136             origQuery[i].push_back(func->GetGene(queryGenes[j]));
00137             newQuery[i].push_back(func->GetGene(queryGenes[j]));
00138         }
00139         for(j=0; j<200; j++)
00140             newQuery[i].push_back(func->GetGene(vfunc[i][j].i));
00141     }
00142 
00143     func->Destruct();
00144     delete func;
00145     
00146     CSeekTools::Write2DArrayText("/tmp/ex_query.txt", newQuery);
00147 */
00148 /*  
00149     CSeekCentral *csk = new CSeekCentral();
00150 
00151     if(!csk->Initialize(sArgs.input_arg, sArgs.quant_arg, sArgs.dset_arg,
00152         sArgs.search_dset_arg, sArgs.query_arg,
00153         sArgs.dir_platform_arg,
00154         sArgs.dir_in_arg, sArgs.dir_prep_in_arg, useNibble, sArgs.num_db_arg,
00155         sArgs.buffer_arg, "normal",
00156         !!sArgs.norm_subavg_flag, !!sArgs.norm_platsubavg_flag,
00157         !!sArgs.norm_platstdev_flag, false,
00158         sArgs.score_cutoff_arg, sArgs.per_q_required_arg))
00159             return -1;
00160 
00161 
00162     //csk->CVCustomSearch(newQuery, rnd, PART_M, FOLD, RATE);
00163     csk->CVSearch(rnd, PART_M, FOLD, RATE);
00164     const vector<vector<float> > &csk_weight = csk->GetAllWeight();
00165 
00166     vector<vector<float> > csk_weight_copy;
00167     csk_weight_copy.resize(csk_weight.size());
00168     for(i=0; i<csk_weight.size(); i++){
00169         csk_weight_copy[i] = vector<float>();
00170         for(j=0; j<csk_weight[i].size(); j++)
00171             csk_weight_copy[i].push_back(csk_weight[i][j]);
00172     }
00173 
00174     const vector< vector<AResultFloat> > &vcsk = csk->GetAllResult();
00175     vector< vector<string> > vcNew;
00176     vcNew.resize(vcsk.size());
00177     for(i=0; i<vcsk.size(); i++){
00178         vcNew[i] = vector<string>();
00179         for(j=0; j<TOP; j++){
00180             vcNew[i].push_back(csk->GetGene(vcsk[i][j].i));
00181         }
00182     }
00183     csk->Destruct();
00184     delete csk;
00185 */
00186 /*
00187     vector< vector<string> > vcIntersect;
00188     vcIntersect.resize(vcNew.size());
00189     for(i=0; i<vcNew.size(); i++){
00190         vcIntersect[i] = vector<string>();
00191         vector<string> s1, s2;
00192         vector<string> intersect;
00193         intersect.resize(TOP);
00194         vector<string>::iterator it;
00195 
00196         for(j=0; j<origQuery[i].size(); j++)
00197             vcIntersect[i].push_back(origQuery[i][j]);
00198 
00199         //int G = max((int)1, (int)(origQuery[i].size()*0.3));
00200         //int G = max((int)1, (int)(20 - origQuery[i].size()));
00201 
00202         for(j=0; j<TOP; j++)
00203             s1.push_back(vcNew[i][j]);
00204         for(j=0; j<20; j++)
00205             s2.push_back(newQuery[i][j]);
00206 
00207         sort(s1.begin(), s1.end());
00208         sort(s2.begin(), s2.end());
00209 
00210         //fprintf(stderr, "G: %d\n", G);
00211         //for(j=0; j<TOP; j++){
00212         //  s1.push_back(vcNew[i][j]);
00213         //  s2.push_back(newQuery[i][j]);
00214         //  sort(s1.begin(), s1.end());
00215         //  sort(s2.begin(), s2.end());
00216         //  it = set_intersection(s1.begin(), s1.end(), s2.begin(), s2.end(),
00217         //      intersect.begin());
00218         //  //if((int)(it - intersect.begin()) > G) break;
00219         //}
00220         it = set_intersection(s1.begin(), s1.end(), s2.begin(), s2.end(),
00221             intersect.begin());
00222 
00223         int size2 = (int) (it - intersect.begin());
00224         for(j=0; j<size2; j++) vcIntersect[i].push_back(intersect[j]);
00225     }
00226 
00227     CSeekTools::Write2DArrayText("/tmp/ex_query.txt", vcIntersect);
00228 */
00229 
00230     //vector<vector<string> > newQ;
00231     //CSeekTools::ReadMultipleQueries("/tmp/ex_query2.txt", newQ);
00232 
00233 
00234     enum CSeekDataset::DistanceMeasure eDistMeasure;
00235     if(dist_measure=="pearson"){
00236         eDistMeasure = CSeekDataset::CORRELATION;
00237     }else{
00238         eDistMeasure = CSeekDataset::Z_SCORE;
00239     }
00240 
00241     /*fprintf(stderr, "input: %s quant: %s dset: %s, search_dset: %s\n", sArgs.input_arg, sArgs.quant_arg, sArgs.dset_arg, sArgs.search_dset_arg);
00242     fprintf(stderr, "query: %s dir_plat: %s dir_in: %s, dir_prep: %s\n", sArgs.query_arg, sArgs.dir_platform_arg, sArgs.dir_in_arg, sArgs.dir_prep_in_arg);
00243     fprintf(stderr, "dir_gvar: %s dir_sinfo: %s useNibble: %d, num_db: %s\n", sArgs.dir_gvar_arg, sArgs.dir_sinfo_arg, sArgs.is_nibble_flag, sArgs.num_db_arg);
00244     getchar();*/
00245 
00246     CSeekCentral *csfinal = new CSeekCentral();
00247     CSeekDBSetting *dbSetting = new CSeekDBSetting(sArgs.dir_gvar_arg,
00248         sArgs.dir_sinfo_arg, sArgs.dir_platform_arg, sArgs.dir_prep_in_arg,
00249         sArgs.dir_in_arg, sArgs.input_arg, sArgs.quant_arg, sArgs.dset_arg,
00250         sArgs.num_db_arg);
00251     vector<CSeekDBSetting*> cc;
00252     cc.push_back(dbSetting);
00253 
00254     string add_db = sArgs.additional_db_arg;
00255     if(add_db!="NA"){
00256         ifstream ifsm;
00257         ifsm.open(add_db.c_str());
00258         if(!ifsm.is_open()){
00259             fprintf(stderr, "Error opening file %s\n", add_db.c_str());
00260             return false;
00261         }
00262         char acBuffer[lineSize];
00263         utype c_iBuffer = lineSize;
00264         vector<map<string,string> > parameters; //an array of CDatabase's
00265         while(!ifsm.eof()){
00266             ifsm.getline(acBuffer, c_iBuffer-1);
00267             if(acBuffer[0]==0) break;
00268             acBuffer[c_iBuffer-1]=0;
00269             string strB = acBuffer;
00270             if(strB=="START"){
00271                 map<string,string> p;
00272                 while(!ifsm.eof()){
00273                     ifsm.getline(acBuffer, c_iBuffer-1);
00274                     if(acBuffer[0]==0){
00275                         fprintf(stderr, "Invalid line (empty)\n");
00276                         return 1;
00277                     }
00278                     strB = acBuffer;
00279                     if(strB=="END") break;
00280                     vector<string> tok;
00281                     CMeta::Tokenize(acBuffer, tok); //separator is tab
00282                     p[tok[0]] = tok[1];
00283                 }
00284                 parameters.push_back(p);
00285             }
00286         }
00287         ifsm.close();
00288         if(parameters.size()==0){
00289             fprintf(stderr, "Error, extra_db setting file must begin with START and end with END lines\n");
00290             return 1;
00291         }
00292 
00293         //i=0;
00294         for(i=0; i<parameters.size(); i++){
00295         string sinfo_dir = "NA";
00296         string gvar_dir = "NA";
00297         string platform_dir = "NA";
00298         string prep_dir = "NA";
00299         string db_dir = "NA";
00300         string dset_map_file = "NA";
00301         string gene_map_file = "NA";
00302         string quant_file = "NA";
00303         int num_db = -1;
00304 
00305         if(eDistMeasure==CSeekDataset::CORRELATION){
00306             if(parameters[i].find("SINFO_DIR")==parameters[i].end() ||
00307                 parameters[i].find("SINFO_DIR")->second=="NA"){
00308                 fprintf(stderr, "Please specify an sinfo directory for the extra db\n");
00309                 return false;
00310             }
00311             sinfo_dir = parameters[i].find("SINFO_DIR")->second;
00312         }
00313         if(parameters[i].find("GVAR_DIR")!=parameters[i].end())
00314             gvar_dir = parameters[i].find("GVAR_DIR")->second;
00315         if(parameters[i].find("PREP_DIR")==parameters[i].end() ||
00316             parameters[i].find("PLATFORM_DIR")==parameters[i].end() ||
00317             parameters[i].find("DB_DIR")==parameters[i].end() ||
00318             parameters[i].find("DSET_MAP_FILE")==parameters[i].end() ||
00319             parameters[i].find("GENE_MAP_FILE")==parameters[i].end() ||
00320             parameters[i].find("QUANT_FILE")==parameters[i].end() ||
00321             parameters[i].find("NUMBER_OF_DB")==parameters[i].end()){
00322             fprintf(stderr, "Some arguments are missing. Please make sure the following are provided:\n");
00323             fprintf(stderr, "PREP_DIR, DB_DIR, DSET_MAP_FILE, GENE_MAP_FILE, QUANT_FILE, NUMBER_OF_DB\n");
00324         }
00325 
00326         platform_dir = parameters[i].find("PLATFORM_DIR")->second;
00327         db_dir = parameters[i].find("DB_DIR")->second;
00328         prep_dir = parameters[i].find("PREP_DIR")->second;
00329         dset_map_file = parameters[i].find("DSET_MAP_FILE")->second;
00330         gene_map_file = parameters[i].find("GENE_MAP_FILE")->second;
00331         quant_file = parameters[i].find("QUANT_FILE")->second;
00332         num_db = atoi(parameters[i].find("NUMBER_OF_DB")->second.c_str());
00333 
00334         CSeekDBSetting *dbSetting2 = new CSeekDBSetting(gvar_dir, sinfo_dir,
00335             platform_dir, prep_dir, db_dir, gene_map_file, quant_file, dset_map_file,
00336             num_db);
00337         cc.push_back(dbSetting2);
00338         }
00339     }
00340 
00341     if(!csfinal->Initialize(cc,
00342         sArgs.search_dset_arg, 
00343         //"/tmp/ex_query2.txt", 
00344         sArgs.query_arg,
00345         sArgs.output_dir_arg,
00346         sArgs.buffer_arg, !!sArgs.output_text_flag,
00347         bOutputWeightComponent, bSimulateWeight,
00348         eDistMeasure,
00349         !!sArgs.norm_subavg_flag, !!sArgs.norm_subavg_plat_flag,
00350         false,
00351         sArgs.score_cutoff_arg, 
00352         sArgs.per_q_required_arg, sArgs.per_g_required_arg,
00353         !!sArgs.square_z_flag,
00354         !!sArgs.random_flag, sArgs.num_random_arg, random_ranking_rnd, useNibble, 
00355         sArgs.num_threads_arg))
00356         return -1;
00357 
00358     if(method=="CV"){
00359         csfinal->CVSearch(rnd, PART_M, FOLD, RATE);
00360     }else if(method=="EQUAL"){
00361         csfinal->EqualWeightSearch();
00362     }else if(method=="ORDER_STAT"){
00363         csfinal->OrderStatistics();
00364     }else if(method=="USER"){
00365         string uw = sArgs.user_weight_list_arg;
00366         vector<string> uww;
00367         if(!CSeekTools::ReadListOneColumn(uw.c_str(), uww)){
00368             fprintf(stderr, "Error reading user weight list\n");
00369             return -1;
00370         }
00371         vector<vector<float> > fw;
00372         fw.resize(uww.size());
00373         for(i=0; i<uww.size(); i++){
00374             if(!CSeekTools::ReadArray(uww[i].c_str(), fw[i])){
00375                 return -1;
00376             }
00377         }
00378         csfinal->WeightSearch(fw);
00379     }else if(method=="VAR"){
00380         for(i=0; i<cc.size(); i++){
00381             CSeekDBSetting* pc = cc[i];
00382             if(pc->GetValue("gvar")=="NULL"){
00383                 fprintf(stderr, "Must specify gvar directory!\n");
00384                 return -1;
00385             }
00386         }
00387         if(bSimulateWeight){
00388             fprintf(stderr, "simulate weight option is not supported for variance-based weighting\n");
00389             return -1;
00390         }
00391         csfinal->VarianceWeightSearch();
00392     }else if(method=="AVERAGE_Z"){
00393         csfinal->AverageWeightSearch();
00394     }
00395     //csfinal->WeightSearch(csk_weight_copy);
00396     //csfinal->CVCustomSearch(newQ, rnd, PART_M, FOLD, RATE);
00397     //csfinal->EqualWeightSearch();
00398     //csfinal->CVSearch(rnd, PART_M, FOLD, RATE);
00399     //csfinal->OrderStatistics();
00400     fprintf(stderr, "Destructing...\n");
00401     csfinal->Destruct();
00402     fprintf(stderr, "Deleting...\n");
00403     delete csfinal;
00404 
00405     fprintf(stderr, "Deleting DBSetting...\n");
00406     if(add_db!="NA"){
00407         for(i=0; i<cc.size(); i++){
00408             delete cc[i];
00409         }
00410     }
00411     fprintf(stderr, "Finished deleting DBSetting...\n");
00412 
00413     cc.clear();
00414 
00415 #ifdef WIN32
00416     pthread_win32_process_detach_np( );
00417 #endif // WIN32
00418     return 0; 
00419 }