Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 00026 int main( int iArgs, char** aszArgs ) { 00027 static const size_t c_iBuffer = 1024; 00028 #ifdef WIN32 00029 pthread_win32_process_attach_np( ); 00030 #endif // WIN32 00031 gengetopt_args_info sArgs; 00032 const int lineSize = 1024; 00033 00034 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00035 cmdline_parser_print_help( ); 00036 return 1; 00037 } 00038 00039 string method = sArgs.weighting_method_arg; 00040 string cv = sArgs.CV_partition_arg; 00041 int cv_fold = sArgs.CV_fold_arg; 00042 float rbp_p = sArgs.CV_rbp_p_arg; 00043 string dist_measure = sArgs.dist_measure_arg; 00044 00045 if(dist_measure=="pearson"){ 00046 string sinfo_dir = sArgs.dir_sinfo_arg; 00047 if(sinfo_dir=="NA"){ 00048 fprintf(stderr, "Pearson selected. Please give the sinfo directory (-u).\n"); 00049 return 1; 00050 } 00051 if(!!sArgs.norm_subavg_flag){ 00052 fprintf(stderr, "Warning: -m flag is ignored due to --dist_measure pearson.\n"); 00053 } 00054 if(!!sArgs.norm_subavg_plat_flag){ 00055 fprintf(stderr, "Warning: -M flag is ignored due to --dist_measure pearson.\n"); 00056 } 00057 00058 }else if(dist_measure=="z_score"){ 00059 if(!!sArgs.norm_subavg_plat_flag && !(!!sArgs.norm_subavg_flag)){ 00060 fprintf(stderr, "Please enable -m flag. --norm_subavg_plat requires -m flag.\n"); 00061 return 1; 00062 } 00063 } 00064 00065 if(!sArgs.input_arg || !sArgs.quant_arg || 00066 !sArgs.dset_arg || 00067 !sArgs.query_arg || !sArgs.dir_platform_arg || 00068 !sArgs.dir_in_arg || !sArgs.dir_prep_in_arg){ 00069 fprintf(stderr, "Arguments missing!\n"); 00070 return 1; 00071 } 00072 00073 bool useNibble = false; 00074 if(sArgs.is_nibble_flag==1){ 00075 fprintf(stderr, "Nibble integration is not supported! Please use a non-nibble CDatabase.\n"); 00076 useNibble = true; 00077 return 1; 00078 } 00079 00080 bool bOutputWeightComponent = !!sArgs.output_w_comp_flag; 00081 bool bSimulateWeight = !!sArgs.simulate_w_flag; 00082 00083 // Random Number Generator Initializations 00084 gsl_rng_env_setup(); 00085 00086 const gsl_rng_type *T; 00087 T = gsl_rng_default; 00088 00089 gsl_rng *rnd = gsl_rng_alloc(T); 00090 00091 const gsl_rng_type *T2; 00092 T2 = gsl_rng_default; 00093 00094 gsl_rng *random_ranking_rnd = gsl_rng_alloc(T2); 00095 00096 float RATE = rbp_p; 00097 utype FOLD = (utype) cv_fold; 00098 enum CSeekQuery::PartitionMode PART_M; 00099 if(cv=="LOI"){ 00100 PART_M = CSeekQuery::LEAVE_ONE_IN; 00101 }else if(cv=="LOO"){ 00102 PART_M = CSeekQuery::LEAVE_ONE_OUT; 00103 }else if(cv=="XFOLD"){ 00104 PART_M = CSeekQuery::CUSTOM_PARTITION; 00105 } 00106 00107 utype i,j; 00108 //utype TOP = 1000; 00109 //utype TOP = 0; 00110 00111 /* 00112 CSeekCentral *func = new CSeekCentral(); 00113 if(!func->Initialize(sArgs.input_arg, sArgs.func_quant_arg, 00114 sArgs.func_dset_arg, sArgs.func_dset_arg, sArgs.query_arg, 00115 sArgs.func_prep_arg, sArgs.func_db_arg, sArgs.func_prep_arg, 00116 useNibble, sArgs.func_n_arg, sArgs.buffer_arg, 00117 "fn", false, false, false, false, 00118 sArgs.score_cutoff_arg, sArgs.per_q_required_arg)){ 00119 return -1; 00120 } 00121 00122 func->EqualWeightSearch(); 00123 const vector< vector<AResultFloat> > &vfunc = func->GetAllResult(); 00124 const vector<CSeekQuery> &vq = func->GetAllQuery(); 00125 00126 vector<vector<string> > origQuery; 00127 vector<vector<string> > newQuery; 00128 newQuery.resize(vfunc.size()); 00129 origQuery.resize(vfunc.size()); 00130 00131 for(i=0; i<vfunc.size(); i++){ 00132 newQuery[i] = vector<string>(); 00133 origQuery[i] = vector<string>(); 00134 const vector<utype> &queryGenes = vq[i].GetQuery(); 00135 for(j=0; j<queryGenes.size(); j++){ 00136 origQuery[i].push_back(func->GetGene(queryGenes[j])); 00137 newQuery[i].push_back(func->GetGene(queryGenes[j])); 00138 } 00139 for(j=0; j<200; j++) 00140 newQuery[i].push_back(func->GetGene(vfunc[i][j].i)); 00141 } 00142 00143 func->Destruct(); 00144 delete func; 00145 00146 CSeekTools::Write2DArrayText("/tmp/ex_query.txt", newQuery); 00147 */ 00148 /* 00149 CSeekCentral *csk = new CSeekCentral(); 00150 00151 if(!csk->Initialize(sArgs.input_arg, sArgs.quant_arg, sArgs.dset_arg, 00152 sArgs.search_dset_arg, sArgs.query_arg, 00153 sArgs.dir_platform_arg, 00154 sArgs.dir_in_arg, sArgs.dir_prep_in_arg, useNibble, sArgs.num_db_arg, 00155 sArgs.buffer_arg, "normal", 00156 !!sArgs.norm_subavg_flag, !!sArgs.norm_platsubavg_flag, 00157 !!sArgs.norm_platstdev_flag, false, 00158 sArgs.score_cutoff_arg, sArgs.per_q_required_arg)) 00159 return -1; 00160 00161 00162 //csk->CVCustomSearch(newQuery, rnd, PART_M, FOLD, RATE); 00163 csk->CVSearch(rnd, PART_M, FOLD, RATE); 00164 const vector<vector<float> > &csk_weight = csk->GetAllWeight(); 00165 00166 vector<vector<float> > csk_weight_copy; 00167 csk_weight_copy.resize(csk_weight.size()); 00168 for(i=0; i<csk_weight.size(); i++){ 00169 csk_weight_copy[i] = vector<float>(); 00170 for(j=0; j<csk_weight[i].size(); j++) 00171 csk_weight_copy[i].push_back(csk_weight[i][j]); 00172 } 00173 00174 const vector< vector<AResultFloat> > &vcsk = csk->GetAllResult(); 00175 vector< vector<string> > vcNew; 00176 vcNew.resize(vcsk.size()); 00177 for(i=0; i<vcsk.size(); i++){ 00178 vcNew[i] = vector<string>(); 00179 for(j=0; j<TOP; j++){ 00180 vcNew[i].push_back(csk->GetGene(vcsk[i][j].i)); 00181 } 00182 } 00183 csk->Destruct(); 00184 delete csk; 00185 */ 00186 /* 00187 vector< vector<string> > vcIntersect; 00188 vcIntersect.resize(vcNew.size()); 00189 for(i=0; i<vcNew.size(); i++){ 00190 vcIntersect[i] = vector<string>(); 00191 vector<string> s1, s2; 00192 vector<string> intersect; 00193 intersect.resize(TOP); 00194 vector<string>::iterator it; 00195 00196 for(j=0; j<origQuery[i].size(); j++) 00197 vcIntersect[i].push_back(origQuery[i][j]); 00198 00199 //int G = max((int)1, (int)(origQuery[i].size()*0.3)); 00200 //int G = max((int)1, (int)(20 - origQuery[i].size())); 00201 00202 for(j=0; j<TOP; j++) 00203 s1.push_back(vcNew[i][j]); 00204 for(j=0; j<20; j++) 00205 s2.push_back(newQuery[i][j]); 00206 00207 sort(s1.begin(), s1.end()); 00208 sort(s2.begin(), s2.end()); 00209 00210 //fprintf(stderr, "G: %d\n", G); 00211 //for(j=0; j<TOP; j++){ 00212 // s1.push_back(vcNew[i][j]); 00213 // s2.push_back(newQuery[i][j]); 00214 // sort(s1.begin(), s1.end()); 00215 // sort(s2.begin(), s2.end()); 00216 // it = set_intersection(s1.begin(), s1.end(), s2.begin(), s2.end(), 00217 // intersect.begin()); 00218 // //if((int)(it - intersect.begin()) > G) break; 00219 //} 00220 it = set_intersection(s1.begin(), s1.end(), s2.begin(), s2.end(), 00221 intersect.begin()); 00222 00223 int size2 = (int) (it - intersect.begin()); 00224 for(j=0; j<size2; j++) vcIntersect[i].push_back(intersect[j]); 00225 } 00226 00227 CSeekTools::Write2DArrayText("/tmp/ex_query.txt", vcIntersect); 00228 */ 00229 00230 //vector<vector<string> > newQ; 00231 //CSeekTools::ReadMultipleQueries("/tmp/ex_query2.txt", newQ); 00232 00233 00234 enum CSeekDataset::DistanceMeasure eDistMeasure; 00235 if(dist_measure=="pearson"){ 00236 eDistMeasure = CSeekDataset::CORRELATION; 00237 }else{ 00238 eDistMeasure = CSeekDataset::Z_SCORE; 00239 } 00240 00241 /*fprintf(stderr, "input: %s quant: %s dset: %s, search_dset: %s\n", sArgs.input_arg, sArgs.quant_arg, sArgs.dset_arg, sArgs.search_dset_arg); 00242 fprintf(stderr, "query: %s dir_plat: %s dir_in: %s, dir_prep: %s\n", sArgs.query_arg, sArgs.dir_platform_arg, sArgs.dir_in_arg, sArgs.dir_prep_in_arg); 00243 fprintf(stderr, "dir_gvar: %s dir_sinfo: %s useNibble: %d, num_db: %s\n", sArgs.dir_gvar_arg, sArgs.dir_sinfo_arg, sArgs.is_nibble_flag, sArgs.num_db_arg); 00244 getchar();*/ 00245 00246 CSeekCentral *csfinal = new CSeekCentral(); 00247 CSeekDBSetting *dbSetting = new CSeekDBSetting(sArgs.dir_gvar_arg, 00248 sArgs.dir_sinfo_arg, sArgs.dir_platform_arg, sArgs.dir_prep_in_arg, 00249 sArgs.dir_in_arg, sArgs.input_arg, sArgs.quant_arg, sArgs.dset_arg, 00250 sArgs.num_db_arg); 00251 vector<CSeekDBSetting*> cc; 00252 cc.push_back(dbSetting); 00253 00254 string add_db = sArgs.additional_db_arg; 00255 if(add_db!="NA"){ 00256 ifstream ifsm; 00257 ifsm.open(add_db.c_str()); 00258 if(!ifsm.is_open()){ 00259 fprintf(stderr, "Error opening file %s\n", add_db.c_str()); 00260 return false; 00261 } 00262 char acBuffer[lineSize]; 00263 utype c_iBuffer = lineSize; 00264 vector<map<string,string> > parameters; //an array of CDatabase's 00265 while(!ifsm.eof()){ 00266 ifsm.getline(acBuffer, c_iBuffer-1); 00267 if(acBuffer[0]==0) break; 00268 acBuffer[c_iBuffer-1]=0; 00269 string strB = acBuffer; 00270 if(strB=="START"){ 00271 map<string,string> p; 00272 while(!ifsm.eof()){ 00273 ifsm.getline(acBuffer, c_iBuffer-1); 00274 if(acBuffer[0]==0){ 00275 fprintf(stderr, "Invalid line (empty)\n"); 00276 return 1; 00277 } 00278 strB = acBuffer; 00279 if(strB=="END") break; 00280 vector<string> tok; 00281 CMeta::Tokenize(acBuffer, tok); //separator is tab 00282 p[tok[0]] = tok[1]; 00283 } 00284 parameters.push_back(p); 00285 } 00286 } 00287 ifsm.close(); 00288 if(parameters.size()==0){ 00289 fprintf(stderr, "Error, extra_db setting file must begin with START and end with END lines\n"); 00290 return 1; 00291 } 00292 00293 //i=0; 00294 for(i=0; i<parameters.size(); i++){ 00295 string sinfo_dir = "NA"; 00296 string gvar_dir = "NA"; 00297 string platform_dir = "NA"; 00298 string prep_dir = "NA"; 00299 string db_dir = "NA"; 00300 string dset_map_file = "NA"; 00301 string gene_map_file = "NA"; 00302 string quant_file = "NA"; 00303 int num_db = -1; 00304 00305 if(eDistMeasure==CSeekDataset::CORRELATION){ 00306 if(parameters[i].find("SINFO_DIR")==parameters[i].end() || 00307 parameters[i].find("SINFO_DIR")->second=="NA"){ 00308 fprintf(stderr, "Please specify an sinfo directory for the extra db\n"); 00309 return false; 00310 } 00311 sinfo_dir = parameters[i].find("SINFO_DIR")->second; 00312 } 00313 if(parameters[i].find("GVAR_DIR")!=parameters[i].end()) 00314 gvar_dir = parameters[i].find("GVAR_DIR")->second; 00315 if(parameters[i].find("PREP_DIR")==parameters[i].end() || 00316 parameters[i].find("PLATFORM_DIR")==parameters[i].end() || 00317 parameters[i].find("DB_DIR")==parameters[i].end() || 00318 parameters[i].find("DSET_MAP_FILE")==parameters[i].end() || 00319 parameters[i].find("GENE_MAP_FILE")==parameters[i].end() || 00320 parameters[i].find("QUANT_FILE")==parameters[i].end() || 00321 parameters[i].find("NUMBER_OF_DB")==parameters[i].end()){ 00322 fprintf(stderr, "Some arguments are missing. Please make sure the following are provided:\n"); 00323 fprintf(stderr, "PREP_DIR, DB_DIR, DSET_MAP_FILE, GENE_MAP_FILE, QUANT_FILE, NUMBER_OF_DB\n"); 00324 } 00325 00326 platform_dir = parameters[i].find("PLATFORM_DIR")->second; 00327 db_dir = parameters[i].find("DB_DIR")->second; 00328 prep_dir = parameters[i].find("PREP_DIR")->second; 00329 dset_map_file = parameters[i].find("DSET_MAP_FILE")->second; 00330 gene_map_file = parameters[i].find("GENE_MAP_FILE")->second; 00331 quant_file = parameters[i].find("QUANT_FILE")->second; 00332 num_db = atoi(parameters[i].find("NUMBER_OF_DB")->second.c_str()); 00333 00334 CSeekDBSetting *dbSetting2 = new CSeekDBSetting(gvar_dir, sinfo_dir, 00335 platform_dir, prep_dir, db_dir, gene_map_file, quant_file, dset_map_file, 00336 num_db); 00337 cc.push_back(dbSetting2); 00338 } 00339 } 00340 00341 if(!csfinal->Initialize(cc, 00342 sArgs.search_dset_arg, 00343 //"/tmp/ex_query2.txt", 00344 sArgs.query_arg, 00345 sArgs.output_dir_arg, 00346 sArgs.buffer_arg, !!sArgs.output_text_flag, 00347 bOutputWeightComponent, bSimulateWeight, 00348 eDistMeasure, 00349 !!sArgs.norm_subavg_flag, !!sArgs.norm_subavg_plat_flag, 00350 false, 00351 sArgs.score_cutoff_arg, 00352 sArgs.per_q_required_arg, sArgs.per_g_required_arg, 00353 !!sArgs.square_z_flag, 00354 !!sArgs.random_flag, sArgs.num_random_arg, random_ranking_rnd, useNibble, 00355 sArgs.num_threads_arg)) 00356 return -1; 00357 00358 if(method=="CV"){ 00359 csfinal->CVSearch(rnd, PART_M, FOLD, RATE); 00360 }else if(method=="EQUAL"){ 00361 csfinal->EqualWeightSearch(); 00362 }else if(method=="ORDER_STAT"){ 00363 csfinal->OrderStatistics(); 00364 }else if(method=="USER"){ 00365 string uw = sArgs.user_weight_list_arg; 00366 vector<string> uww; 00367 if(!CSeekTools::ReadListOneColumn(uw.c_str(), uww)){ 00368 fprintf(stderr, "Error reading user weight list\n"); 00369 return -1; 00370 } 00371 vector<vector<float> > fw; 00372 fw.resize(uww.size()); 00373 for(i=0; i<uww.size(); i++){ 00374 if(!CSeekTools::ReadArray(uww[i].c_str(), fw[i])){ 00375 return -1; 00376 } 00377 } 00378 csfinal->WeightSearch(fw); 00379 }else if(method=="VAR"){ 00380 for(i=0; i<cc.size(); i++){ 00381 CSeekDBSetting* pc = cc[i]; 00382 if(pc->GetValue("gvar")=="NULL"){ 00383 fprintf(stderr, "Must specify gvar directory!\n"); 00384 return -1; 00385 } 00386 } 00387 if(bSimulateWeight){ 00388 fprintf(stderr, "simulate weight option is not supported for variance-based weighting\n"); 00389 return -1; 00390 } 00391 csfinal->VarianceWeightSearch(); 00392 }else if(method=="AVERAGE_Z"){ 00393 csfinal->AverageWeightSearch(); 00394 } 00395 //csfinal->WeightSearch(csk_weight_copy); 00396 //csfinal->CVCustomSearch(newQ, rnd, PART_M, FOLD, RATE); 00397 //csfinal->EqualWeightSearch(); 00398 //csfinal->CVSearch(rnd, PART_M, FOLD, RATE); 00399 //csfinal->OrderStatistics(); 00400 fprintf(stderr, "Destructing...\n"); 00401 csfinal->Destruct(); 00402 fprintf(stderr, "Deleting...\n"); 00403 delete csfinal; 00404 00405 fprintf(stderr, "Deleting DBSetting...\n"); 00406 if(add_db!="NA"){ 00407 for(i=0; i<cc.size(); i++){ 00408 delete cc[i]; 00409 } 00410 } 00411 fprintf(stderr, "Finished deleting DBSetting...\n"); 00412 00413 cc.clear(); 00414 00415 #ifdef WIN32 00416 pthread_win32_process_detach_np( ); 00417 #endif // WIN32 00418 return 0; 00419 }