Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 00026 int main( int iArgs, char** aszArgs ) { 00027 static const size_t c_iBuffer = 1024; 00028 #ifdef WIN32 00029 pthread_win32_process_attach_np( ); 00030 #endif // WIN32 00031 gengetopt_args_info sArgs; 00032 ifstream ifsm; 00033 istream* pistm; 00034 vector<string> vecstrLine, vecstrGenes, vecstrDatasets, vecstrUserDatasets; 00035 char acBuffer[ c_iBuffer ]; 00036 size_t i; 00037 00038 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00039 cmdline_parser_print_help( ); 00040 return 1; } 00041 00042 vector<string> vecstrGeneID; 00043 map<string, utype> mapstrintGene; 00044 if(!CSeekTools::ReadListTwoColumns(sArgs.input_arg, vecstrGeneID, vecstrGenes)) 00045 return false; 00046 00047 for(i=0; i<vecstrGenes.size(); i++) 00048 mapstrintGene[vecstrGenes[i]] = i; 00049 00050 if(sArgs.weight2_flag==1){ 00051 vector<string> vecstrDataset; 00052 if(!CSeekTools::ReadListOneColumn(sArgs.dweight_map_arg, vecstrDataset)) 00053 return false; 00054 vector<vector<float> > vec_score, orig_score; 00055 utype i, j; 00056 int num_query = sArgs.dweight_num_arg; //random query 00057 orig_score.resize(num_query); 00058 00059 vec_score.resize(vecstrDataset.size()); 00060 for(j=0; j<vec_score.size(); j++) 00061 vec_score[j].resize(num_query); 00062 00063 char x[256]; 00064 for(i=0; i<num_query; i++){ //i is query id 00065 vector<float> v; 00066 sprintf(x, "%s/%d.dweight", sArgs.dweight_dir_arg, i); 00067 CSeekTools::ReadArray(x, v); 00068 orig_score[i] = v; 00069 for(j=0; j<vec_score.size(); j++) //j is dataset id 00070 vec_score[j][i] = v[j]; 00071 } 00072 00073 for(j=0; j<vec_score.size(); j++) 00074 sort(vec_score[j].begin(), vec_score[j].end()); 00075 00076 int test_num_query = sArgs.dweight_test_num_arg; 00077 for(i=0; i<test_num_query; i++){ //i is query id 00078 fprintf(stderr, "Query %d\n", i); 00079 vector<float> v; 00080 sprintf(x, "%s/%d.dweight", sArgs.dweight_test_dir_arg, i); 00081 CSeekTools::ReadArray(x, v); 00082 for(j=0; j<v.size(); j++){ 00083 utype k = 0; 00084 for(k=0; k<num_query; k++){ 00085 if(v[j]<vec_score[j][k]) 00086 break; 00087 } 00088 fprintf(stderr, "%s\t%.3e\t%d\n", vecstrDataset[j].c_str(), v[j], k); 00089 } 00090 } 00091 fprintf(stderr, "Done!\n"); 00092 return 0; 00093 } 00094 00095 if(sArgs.convert_aracne_flag==1){ 00096 int lineLen = 1024; 00097 char *acBuffer = (char*)malloc(lineLen); 00098 FILE *infile; 00099 if((infile=fopen(sArgs.aracne_file_arg, "r"))==NULL){ 00100 fprintf(stderr, "Error no such file %s\n", sArgs.aracne_file_arg); 00101 return 1; 00102 } 00103 while(fgets(acBuffer, lineLen, infile)!=NULL){ 00104 while(strlen(acBuffer)==lineLen-1){ 00105 int len = strlen(acBuffer); 00106 fseek(infile, -len, SEEK_CUR); 00107 lineLen+=1024; 00108 acBuffer = (char*)realloc(acBuffer, lineLen); 00109 char *ret = fgets(acBuffer, lineLen, infile); 00110 } 00111 } 00112 fclose(infile); 00113 fprintf(stderr, "Max line length: %d\n", lineLen); 00114 free(acBuffer); 00115 00116 acBuffer = (char*)malloc(lineLen); 00117 ifstream ia; 00118 ia.open(sArgs.aracne_file_arg); 00119 if(!ia.is_open()){ 00120 fprintf(stderr, "Error opening file %s\n", sArgs.aracne_file_arg); 00121 return 1; 00122 } 00123 set<string> allGenes; 00124 size_t ci=0; 00125 while(!ia.eof()){ 00126 ia.getline(acBuffer, lineLen-1); 00127 if(acBuffer[0]==0) break; 00128 acBuffer[lineLen-1] = 0; 00129 if(acBuffer[0]=='>') continue; 00130 vector<string> tok; 00131 CMeta::Tokenize(acBuffer, tok); 00132 allGenes.insert(tok[0]); 00133 size_t si; 00134 for(si=1; si<tok.size(); si+=2){ 00135 allGenes.insert(tok[si]); 00136 } 00137 if(ci%100==0){ 00138 fprintf(stderr, "Gene Number %d\n", ci); 00139 } 00140 ci++; 00141 } 00142 ia.close(); 00143 00144 vector<string> vecGenes; 00145 copy(allGenes.begin(), allGenes.end(), back_inserter(vecGenes)); 00146 map<string,size_t> mapiGenes; 00147 for(i=0; i<vecGenes.size(); i++) 00148 mapiGenes[vecGenes[i]] = i; 00149 00150 CDat Dat; 00151 Dat.Open(vecGenes); 00152 ci = 0; 00153 ia.open(sArgs.aracne_file_arg); 00154 while(!ia.eof()){ 00155 ia.getline(acBuffer, lineLen-1); 00156 if(acBuffer[0]==0) break; 00157 acBuffer[lineLen-1] = 0; 00158 if(acBuffer[0]=='>') continue; 00159 vector<string> tok; 00160 CMeta::Tokenize(acBuffer, tok); 00161 size_t tG = mapiGenes[tok[0]]; 00162 size_t si; 00163 for(si=1; si<tok.size(); si+=2){ 00164 size_t oG = mapiGenes[tok[si]]; 00165 float fG = atof(tok[si+1].c_str()); 00166 Dat.Set(tG, oG, fG); 00167 } 00168 if(ci%100==0){ 00169 fprintf(stderr, "Gene Number %d\n", ci); 00170 } 00171 ci++; 00172 } 00173 ia.close(); 00174 free(acBuffer); 00175 Dat.Save(sArgs.output_dab_file_arg); 00176 return 0; 00177 } 00178 00179 if(sArgs.weight_flag==1){ 00180 vector<string> vecstrDataset; 00181 if(!CSeekTools::ReadListOneColumn(sArgs.dweight_map_arg, vecstrDataset)) 00182 return false; 00183 00184 vector<vector<float> > vec_score; 00185 vector<vector<float> > orig_score; 00186 utype i, j; 00187 int num_query = sArgs.dweight_num_arg; //random query 00188 orig_score.resize(num_query); 00189 00190 vec_score.resize(vecstrDataset.size()); 00191 for(j=0; j<vec_score.size(); j++) 00192 vec_score[j].resize(num_query); 00193 00194 char x[256]; 00195 for(i=0; i<num_query; i++){ //i is query id 00196 vector<float> v; 00197 sprintf(x, "%s/%d.dweight", sArgs.dweight_dir_arg, i); 00198 CSeekTools::ReadArray(x, v); 00199 orig_score[i] = v; 00200 for(j=0; j<vec_score.size(); j++) //j is dataset id 00201 vec_score[j][i] = v[j]; 00202 } 00203 00204 vector<float> score_cutoff; 00205 score_cutoff.resize(vec_score.size()); 00206 for(j=0; j<vec_score.size(); j++){ 00207 sort(vec_score[j].begin(), vec_score[j].end()); 00208 score_cutoff[j] = vec_score[j][899]; 00209 //fprintf(stderr, "Dataset %d: %.4e\n", j, vec_score[j][899]); 00210 } 00211 00212 sprintf(x, "/tmp/dataset.cutoff"); 00213 CSeekTools::WriteArray(x, score_cutoff); 00214 00215 vector<int> numGoodDataset; 00216 CSeekTools::InitVector(numGoodDataset, num_query, (int) 0); 00217 for(i=0; i<num_query; i++) //i is query id 00218 for(j=0; j<vecstrDataset.size(); j++) 00219 if(orig_score[i][j]>vec_score[j][899]) 00220 numGoodDataset[i]++; 00221 00222 sort(numGoodDataset.begin(), numGoodDataset.end()); 00223 fprintf(stderr, "10 percentile %d\n", numGoodDataset[99]); 00224 fprintf(stderr, "90 percentile %d\n", numGoodDataset[899]); 00225 00226 int test_num_query = sArgs.dweight_test_num_arg; 00227 for(i=0; i<test_num_query; i++){ //i is query id 00228 vector<float> v; 00229 sprintf(x, "%s/%d.dweight", sArgs.dweight_test_dir_arg, i); 00230 CSeekTools::ReadArray(x, v); 00231 int numGood = 0; 00232 for(j=0; j<v.size(); j++) 00233 if(v[j]>vec_score[j][899]) 00234 numGood++; 00235 fprintf(stderr, "Query %d: ", i); 00236 if(numGood > numGoodDataset[899]) 00237 fprintf(stderr, "Unique (upper)\n"); 00238 else if(numGood < numGoodDataset[99]) 00239 fprintf(stderr, "Unique (lower)\n"); 00240 else 00241 fprintf(stderr, "Not unique\n"); 00242 } 00243 fprintf(stderr, "Done!\n"); 00244 return 0; 00245 } 00246 00247 00248 if(sArgs.convert_dab_flag==1){ 00249 string dab_file_name = sArgs.dab_file_arg; 00250 if(dab_file_name.find(".2.dab")!=string::npos){ 00251 CSeekIntIntMap d1(vecstrGenes.size()); 00252 CSparseFlatMatrix<float> sm (0); 00253 CSeekWriter::ReadSeekSparseMatrix<unsigned short>(sArgs.dab_file_arg, 00254 sm, d1, 1000, 0.99, vecstrGenes); 00255 utype ii = 0; 00256 const vector<utype> &allRGenes = d1.GetAllReverse(); 00257 FILE *pFile; 00258 pFile = fopen(sArgs.output_matrix_arg, "w"); 00259 fprintf(pFile, "Genes\t"); 00260 for(i=0; i<d1.GetNumSet(); i++){ 00261 string g = vecstrGenes[allRGenes[i]]; 00262 fprintf(pFile, "%s", g.c_str()); 00263 if(i<d1.GetNumSet()-1) 00264 fprintf(pFile, "\t"); 00265 } 00266 fprintf(pFile, "\n"); 00267 00268 unsigned int j, t; 00269 for(i=0; i<d1.GetNumSet(); i++){ 00270 string g = vecstrGenes[allRGenes[i]]; 00271 fprintf(pFile, "%s\t", g.c_str()); 00272 vector<float> vf; 00273 vf.resize(vecstrGenes.size()); 00274 for(j=0; j<vecstrGenes.size(); j++) 00275 vf[j] = 0; 00276 utype ii = allRGenes[i]; 00277 vector<CPair<float> >::iterator row_it; 00278 float rv; 00279 for(row_it = sm.RowBegin(ii); row_it!=sm.RowEnd(ii); row_it++){ 00280 j = (size_t) row_it->i; 00281 rv = row_it->v; 00282 vf[j] = rv; 00283 } 00284 00285 for(j=0; j<d1.GetNumSet(); j++){ 00286 t = allRGenes[j]; 00287 fprintf(pFile, "%.3e", vf[t]); 00288 if(j<d1.GetNumSet()-1) 00289 fprintf(pFile, "\t"); 00290 } 00291 fprintf(pFile, "\n"); 00292 } 00293 fclose(pFile); 00294 return 0; 00295 } 00296 00297 CDataPair Dat; 00298 fprintf(stderr, "Opening file...\n"); 00299 if(!Dat.Open(sArgs.dab_file_arg, false, false, 2, false, false)){ 00300 cerr << "error opening file" << endl; 00301 return 1; 00302 } 00303 00304 vector<unsigned int> veciGenes; 00305 veciGenes.resize(vecstrGenes.size()); 00306 for(i=0; i<vecstrGenes.size(); i++) 00307 veciGenes[i] = (unsigned int) Dat.GetGeneIndex(vecstrGenes[i]); 00308 00309 unsigned int s, t, j, ss, tt; 00310 float d; 00311 CSeekIntIntMap m(vecstrGenes.size()); 00312 for(i=0; i<vecstrGenes.size(); i++){ 00313 if((s=veciGenes[i])==(unsigned int)-1) continue; 00314 m.Add(i); 00315 } 00316 00317 const vector<utype> &allRGenes = m.GetAllReverse(); 00318 FILE *pFile; 00319 pFile = fopen(sArgs.output_matrix_arg, "w"); 00320 fprintf(pFile, "Genes\t"); 00321 for(i=0; i<m.GetNumSet(); i++){ 00322 string g = vecstrGenes[allRGenes[i]]; 00323 fprintf(pFile, "%s", g.c_str()); 00324 if(i<m.GetNumSet()-1){ 00325 fprintf(pFile, "\t"); 00326 } 00327 } 00328 fprintf(pFile, "\n"); 00329 00330 for(i=0; i<m.GetNumSet(); i++){ 00331 s = veciGenes[allRGenes[i]]; 00332 string g = vecstrGenes[allRGenes[i]]; 00333 fprintf(pFile, "%s\t", g.c_str()); 00334 for(j=0; j<m.GetNumSet(); j++){ 00335 t = veciGenes[allRGenes[j]]; 00336 if(CMeta::IsNaN(d = Dat.Get(s,t))) 00337 d = 0; 00338 fprintf(pFile, "%.3e", d); 00339 if(j<m.GetNumSet()-1){ 00340 fprintf(pFile, "\t"); 00341 } 00342 } 00343 fprintf(pFile, "\n"); 00344 } 00345 fclose(pFile); 00346 return 0; 00347 } 00348 00349 00350 if(sArgs.limit_hub_flag==1){ 00351 float per = 0.30; 00352 CDataPair Dat; 00353 char outFile[1024]; 00354 fprintf(stderr, "Opening file...\n"); 00355 if(!Dat.Open(sArgs.dabinput_arg, false, false, 2, false, false)){ 00356 cerr << "error opening file" << endl; 00357 return 1; 00358 } 00359 vector<unsigned int> veciGenes; 00360 veciGenes.resize(vecstrGenes.size()); 00361 for(i=0; i<vecstrGenes.size(); i++) 00362 veciGenes[i] = (unsigned int) Dat.GetGeneIndex(vecstrGenes[i]); 00363 00364 unsigned int s, t, j, ss, tt; 00365 float d; 00366 CSeekIntIntMap m(vecstrGenes.size()); 00367 for(i=0; i<vecstrGenes.size(); i++){ 00368 if((s=veciGenes[i])==(unsigned int)-1) continue; 00369 m.Add(i); 00370 } 00371 vector<float> vecSum; 00372 CSeekTools::InitVector(vecSum, vecstrGenes.size(),(float) 0); 00373 const vector<utype> &allRGenes = m.GetAllReverse(); 00374 for(i=0; i<m.GetNumSet(); i++){ 00375 s = veciGenes[allRGenes[i]]; 00376 for(j=i+1; j<m.GetNumSet(); j++){ 00377 t = veciGenes[allRGenes[j]]; 00378 if(CMeta::IsNaN(d = Dat.Get(s,t))) continue; 00379 vecSum[allRGenes[i]]+=pow(abs(d), 9); 00380 vecSum[allRGenes[j]]+=pow(abs(d), 9); 00381 } 00382 } 00383 vector<float> backupSum; 00384 for(i=0; i<vecSum.size(); i++) 00385 backupSum.push_back(vecSum[i]); 00386 sort(vecSum.begin(), vecSum.end(), greater<float>()); 00387 int index = (int) (per * (float) m.GetNumSet()); 00388 float percentile = vecSum[index]; 00389 vector<string> limitedGenes; 00390 for(i=0; i<backupSum.size(); i++){ 00391 if(backupSum[i] > percentile){ 00392 limitedGenes.push_back(vecstrGenes[i]); 00393 } 00394 } 00395 fprintf(stderr, "%d / %d genes to be written!\n", limitedGenes.size(), m.GetNumSet()); 00396 CDat NewDat; 00397 NewDat.Open(limitedGenes); 00398 for(i=0; i<limitedGenes.size(); i++){ 00399 s = (unsigned int) Dat.GetGeneIndex(limitedGenes[i]); 00400 ss = NewDat.GetGeneIndex(limitedGenes[i]); 00401 for(j=i+1; j<limitedGenes.size(); j++){ 00402 t = (unsigned int) Dat.GetGeneIndex(limitedGenes[j]); 00403 tt = NewDat.GetGeneIndex(limitedGenes[j]); 00404 d = Dat.Get(s, t); 00405 NewDat.Set(ss, tt, d); 00406 } 00407 } 00408 NewDat.Save(sArgs.hub_dab_output_arg); 00409 return 0; 00410 } 00411 00412 if(sArgs.comp_ranking_flag==1){ 00413 utype i, j; 00414 int num_query = sArgs.gscore_num1_arg; //random query 00415 char x[256]; 00416 for(i=0; i<num_query; i++){ //i is query id 00417 vector<float> v1, v2; 00418 sprintf(x, "%s/%d.gscore", sArgs.gscore_dir1_arg, i); 00419 CSeekTools::ReadArray(x, v1); 00420 sprintf(x, "%s/%d.gscore", sArgs.gscore_dir2_arg, i); 00421 CSeekTools::ReadArray(x, v2); 00422 vector<CPair<float> > cp1, cp2; 00423 cp1.resize(v1.size()); 00424 cp2.resize(v2.size()); 00425 for(j=0; j<v1.size(); j++){ 00426 cp1[j].i = (utype) j; 00427 cp1[j].v = v1[j]; 00428 cp2[j].i = (utype) j; 00429 cp2[j].v = v2[j]; 00430 } 00431 sort(cp1.begin(), cp1.end(), CDescendingValue<float>()); 00432 sort(cp2.begin(), cp2.end(), CDescendingValue<float>()); 00433 vector<char> presence; 00434 CSeekTools::InitVector(presence, v1.size(), (char) 0); 00435 for(j=0; j<500; j++){ 00436 presence[cp1[j].i]++; 00437 presence[cp2[j].i]++; 00438 } 00439 int count = 0; 00440 for(j=0; j<v1.size(); j++){ 00441 if(presence[j]==2){ 00442 count++; 00443 } 00444 } 00445 fprintf(stderr, "Query %d %d\n", i, count); 00446 } 00447 return 0; 00448 } 00449 00450 if(sArgs.dataset_flag==1){ 00451 string db = sArgs.db_arg; 00452 string dset_list = sArgs.dset_list_arg; 00453 string dir_in = sArgs.dir_in_arg; 00454 string dir_prep = sArgs.dir_prep_in_arg; 00455 if(db=="NA" || dset_list=="NA" || dir_in=="NA" || 00456 dir_prep=="NA"){ 00457 fprintf(stderr, "Requires: -x, -X, -d -p\n"); 00458 return false; 00459 } 00460 00461 vector<string> vecstrDP, vecstrUserDP; 00462 //dataset-platform mapping (required) 00463 if(!CSeekTools::ReadListTwoColumns(sArgs.db_arg, vecstrDatasets, vecstrDP)) 00464 return false; 00465 00466 //dataset filter 00467 if(!CSeekTools::ReadListTwoColumns(sArgs.dset_list_arg, vecstrUserDatasets, vecstrUserDP)) 00468 return false; 00469 00470 map<string, utype> mapstrintDataset; 00471 map<string, string> mapstrstrDatasetPlatform; 00472 utype i; 00473 for(i=0; i<vecstrDatasets.size(); i++){ 00474 mapstrstrDatasetPlatform[vecstrDatasets[i]] = vecstrDP[i]; 00475 mapstrintDataset[vecstrDatasets[i]] = i; 00476 } 00477 00478 CSeekIntIntMap *mapUserDatasets = new CSeekIntIntMap(vecstrDatasets.size()); 00479 for(i=0; i<vecstrUserDatasets.size(); i++){ 00480 mapUserDatasets->Add(mapstrintDataset[vecstrUserDatasets[i]]); 00481 } 00482 00483 //fprintf(stderr, "Finished reading dataset\n"); 00484 00485 size_t iDatasets = vecstrDatasets.size(); 00486 vector<string> vecstrPlatforms; 00487 vector<CSeekPlatform> vp; 00488 map<string, utype> mapstriPlatform; 00489 CSeekTools::ReadPlatforms(sArgs.platform_dir_arg, vp, vecstrPlatforms, 00490 mapstriPlatform); 00491 //fprintf(stderr, "Finished reading platform\n"); 00492 00493 vector<CSeekDataset*> vc; 00494 vc.resize(iDatasets); 00495 string strPrepInputDirectory = sArgs.dir_prep_in_arg; 00496 string strGvarInputDirectory = sArgs.dir_gvar_in_arg; 00497 string strSinfoInputDirectory = sArgs.dir_sinfo_in_arg; 00498 00499 for(i=0; i<iDatasets; i++){ 00500 vc[i] = new CSeekDataset(); 00501 string strFileStem = vecstrDatasets[i]; 00502 string strAvgPath = strPrepInputDirectory + "/" + 00503 strFileStem + ".gavg"; 00504 string strPresencePath = strPrepInputDirectory + "/" + 00505 strFileStem + ".gpres"; 00506 00507 if(strGvarInputDirectory!="NA"){ 00508 string strGvarPath = strGvarInputDirectory + "/" + 00509 strFileStem + ".gexpvar"; 00510 vc[i]->ReadGeneVariance(strGvarPath); 00511 } 00512 if(strSinfoInputDirectory!="NA"){ 00513 string strSinfoPath = strSinfoInputDirectory + "/" + 00514 strFileStem + ".sinfo"; 00515 vc[i]->ReadDatasetAverageStdev(strSinfoPath); 00516 } 00517 00518 vc[i]->ReadGeneAverage(strAvgPath); 00519 vc[i]->ReadGenePresence(strPresencePath); 00520 string strPlatform = 00521 mapstrstrDatasetPlatform.find(strFileStem)->second; 00522 utype platform_id = mapstriPlatform.find(strPlatform)->second; 00523 vc[i]->SetPlatform(vp[platform_id]); 00524 } 00525 00526 //fprintf(stderr, "Finished reading prep\n"); 00527 00528 for(i=0; i<iDatasets; i++) vc[i]->InitializeGeneMap(); 00529 00530 for(i=0; i<vecstrGenes.size(); i++){ 00531 utype ii = mapstrintGene[vecstrGenes[i]]; 00532 fprintf(stderr, "Gene %s ", vecstrGenes[i].c_str()); 00533 utype j = 0; 00534 vector<float> va; 00535 for(j=0; j<iDatasets; j++){ 00536 CSeekIntIntMap *gm = vc[j]->GetGeneMap(); 00537 utype ij = gm->GetForward(ii); 00538 if(CSeekTools::IsNaN(ij)){ 00539 continue; 00540 } 00541 float a = vc[j]->GetGeneAverage(ii); 00542 va.push_back(a); 00543 //fprintf(stderr, "%.2f ", a); 00544 } 00545 sort(va.begin(), va.end()); 00546 utype g = 0; 00547 for(g=0; g<20; g++){ 00548 utype ik = (utype) ((float)0.05*(float)(g+1)*(float)va.size() - 1.0); 00549 fprintf(stderr, "%.2f ", va[ik]); 00550 } 00551 fprintf(stderr, "\n"); 00552 } 00553 00554 fprintf(stderr, "Done\n"); 00555 return false; 00556 00557 vector<vector<string> > vecstrQueries; 00558 string multiQuery = sArgs.multi_query_arg; 00559 00560 if(multiQuery=="NA"){ 00561 vecstrQueries.resize(1); 00562 if(!CSeekTools::ReadMultiGeneOneLine(sArgs.single_query_arg, vecstrQueries[0])) 00563 return false; 00564 }else{ 00565 if(!CSeekTools::ReadMultipleQueries(multiQuery, vecstrQueries)) 00566 return false; 00567 } 00568 00569 bool toOutput = false; 00570 string output_file = sArgs.output_file_arg; 00571 FILE *out = NULL; 00572 00573 if(output_file=="NA"){ 00574 toOutput = false; 00575 }else{ 00576 toOutput = true; 00577 out = fopen(output_file.c_str(), "w"); 00578 } 00579 00580 utype ii=0; 00581 float cutoff = sArgs.gvar_cutoff_arg; 00582 bool toCutoff = false; 00583 if(cutoff<0){ 00584 toCutoff = false; 00585 }else{ 00586 toCutoff = true; 00587 } 00588 00589 if(sArgs.order_stat_single_gene_query_flag==1){ 00590 if(strGvarInputDirectory=="NA"){ 00591 fprintf(stderr, "Order statistics mode, but need to provide gvar!\n"); 00592 return false; 00593 } 00594 if(cutoff<0){ 00595 fprintf(stderr, "Need to provide positive Gvar cutoff\n"); 00596 return false; 00597 } 00598 } 00599 00600 for(ii=0; ii<vecstrQueries.size(); ii++){ 00601 00602 vector<string> &vecstrQuery = vecstrQueries[ii]; 00603 00604 //fprintf(stderr, "Finished reading query\n"); 00605 bool isFirst = true; 00606 vector<int> count; 00607 utype j; 00608 CSeekTools::InitVector(count, vecstrQuery.size(), (int) 0); 00609 00610 //only analyze and report on user datasets 00611 const vector<utype> &allRDatasets = mapUserDatasets->GetAllReverse(); 00612 utype iUserDatasets = mapUserDatasets->GetNumSet(); 00613 utype dd; 00614 for(dd=0; dd<iUserDatasets; dd++){ 00615 i = allRDatasets[dd]; 00616 //fprintf(stdout, "%d %d %s\n", i, dd, vecstrDatasets[i].c_str()); 00617 CSeekIntIntMap *si = vc[i]->GetGeneMap(); 00618 00619 if(toCutoff && sArgs.order_stat_single_gene_query_flag==1){ 00620 if(mapstrintGene.find(vecstrQuery[0])==mapstrintGene.end()) 00621 continue; 00622 if(CSeekTools::IsNaN(si->GetForward( 00623 mapstrintGene[vecstrQuery[0]]))) continue; 00624 float gene_var = vc[i]->GetGeneVariance(mapstrintGene[vecstrQuery[0]]); 00625 if(gene_var < cutoff) continue; 00626 if(isFirst){ 00627 isFirst = false; 00628 if(toOutput){ 00629 fprintf(out, "%s", vecstrDatasets[i].c_str()); 00630 }else{ 00631 fprintf(stdout, "%s", vecstrDatasets[i].c_str()); 00632 } 00633 }else{ 00634 if(toOutput){ 00635 fprintf(out, " %s", vecstrDatasets[i].c_str()); 00636 }else{ 00637 fprintf(stdout, " %s", vecstrDatasets[i].c_str()); 00638 } 00639 } 00640 00641 }else{ 00642 utype present = 0; 00643 for(j=0, present=0; j<vecstrQuery.size(); j++){ 00644 if(mapstrintGene.find(vecstrQuery[j])==mapstrintGene.end()) 00645 continue; 00646 if(CSeekTools::IsNaN(si->GetForward( 00647 mapstrintGene[vecstrQuery[j]]))) continue; 00648 count[j]++; 00649 present++; 00650 } 00651 if(present==vecstrQuery.size()){ 00652 if(isFirst){ 00653 isFirst = false; 00654 if(toOutput){ 00655 fprintf(out, "%s", vecstrDatasets[i].c_str()); 00656 }else{ 00657 fprintf(stdout, "%s", vecstrDatasets[i].c_str()); 00658 } 00659 }else{ 00660 if(toOutput){ 00661 fprintf(out, " %s", vecstrDatasets[i].c_str()); 00662 }else{ 00663 fprintf(stdout, " %s", vecstrDatasets[i].c_str()); 00664 } 00665 } 00666 //fprintf(stderr, "%s\t%s\t%d\t%d\n", 00667 // vecstrDatasets[i].c_str(), vecstrDP[i].c_str(), 00668 // present, si->GetNumSet()); 00669 } 00670 } 00671 } 00672 00673 for(j=0; j<vecstrQuery.size(); j++) 00674 fprintf(stderr, "Gene %s: %d\n", vecstrQuery[j].c_str(), count[j]); 00675 00676 if(toOutput){ 00677 fprintf(out, "\n"); 00678 }else{ 00679 fprintf(stdout, "\n"); 00680 } 00681 00682 } 00683 00684 if(toOutput){ 00685 fclose(out); 00686 } 00687 return 0; 00688 } 00689 00690 if(sArgs.databaselet_flag==1){ 00691 string db = sArgs.db_arg; 00692 string dset_list = sArgs.dset_list_arg; 00693 string dir_in = sArgs.dir_in_arg; 00694 string dir_prep = sArgs.dir_prep_in_arg; 00695 string single_query = sArgs.single_query_arg; 00696 00697 if(db=="NA" || dset_list=="NA" || dir_in=="NA" || 00698 dir_prep=="NA" || single_query=="NA"){ 00699 fprintf(stderr, "Requires: -x, -X, -d -p -q\n"); 00700 return false; 00701 } 00702 00703 bool useNibble = false; 00704 if(sArgs.is_nibble_flag==1) useNibble = true; 00705 CDatabase DB(useNibble); 00706 vector<string> vecstrDP; 00707 vector<string> vecstrQuery; 00708 if(!CSeekTools::ReadListTwoColumns(sArgs.db_arg, vecstrDatasets, vecstrDP)) 00709 return false; 00710 if(!CSeekTools::ReadMultiGeneOneLine(sArgs.single_query_arg, vecstrQuery)) 00711 return false; 00712 00713 string strInputDirectory = sArgs.dir_in_arg; 00714 DB.Open(strInputDirectory); 00715 00716 size_t iDatasets = DB.GetDatasets(); 00717 size_t iGenes = DB.GetGenes(); 00718 00719 size_t j,k; 00720 vector<CSeekDataset*> vc; 00721 vc.clear(); 00722 vc.resize(iDatasets); 00723 for(i=0; i<iDatasets; i++){ 00724 vc[i] = new CSeekDataset(); 00725 string strPrepInputDirectory = sArgs.dir_prep_in_arg; 00726 string strFileStem = vecstrDatasets[i]; 00727 string strAvgPath = strPrepInputDirectory + "/" + 00728 strFileStem + ".gavg"; 00729 string strPresencePath = strPrepInputDirectory + "/" + 00730 strFileStem + ".gpres"; 00731 vc[i]->ReadGeneAverage(strAvgPath); 00732 vc[i]->ReadGenePresence(strPresencePath); 00733 } 00734 00735 vector<char> cQuery; 00736 vector<utype> allQ; 00737 CSeekTools::InitVector(cQuery, iGenes, (char) 0); 00738 00739 for(i=0; i<vecstrQuery.size(); i++){ 00740 k = DB.GetGene(vecstrQuery[i]); 00741 if(k==-1) continue; 00742 cQuery[k] = 1; 00743 } 00744 00745 for(k=0; k<iGenes; k++) 00746 if(cQuery[k]==1) 00747 allQ.push_back(k); 00748 00749 for(i=0; i<iDatasets; i++){ 00750 vc[i]->InitializeGeneMap(); 00751 vc[i]->InitializeQueryBlock(allQ); 00752 } 00753 00754 vector<unsigned char> *Q = 00755 new vector<unsigned char>[vecstrQuery.size()]; 00756 00757 for(i=0; i<vecstrQuery.size(); i++) 00758 if(!DB.GetGene(vecstrQuery[i], Q[i])) 00759 cerr << "Gene does not exist" << endl; 00760 00761 //printf("Before"); getchar(); 00762 for(i=0; i<vecstrQuery.size(); i++){ 00763 if(DB.GetGene(vecstrQuery[i])==-1) continue; 00764 size_t m = DB.GetGene(vecstrQuery[i]); 00765 size_t l = 0; 00766 for(j=0; j<iDatasets; j++){ 00767 CSeekIntIntMap *qu = vc[j]->GetDBMap(); 00768 if(qu==NULL) continue; 00769 unsigned char **r = vc[j]->GetMatrix(); 00770 utype query = qu->GetForward(m); 00771 if(CSeekTools::IsNaN(query)) continue; 00772 for(k=0; k<iGenes; k++){ 00773 unsigned char c = Q[i][k*iDatasets + j]; 00774 r[query][k] = c; 00775 } 00776 } 00777 } 00778 00779 for(i=0; i<iDatasets; i++){ 00780 printf("Dataset %ld\n", i); 00781 unsigned char **r = vc[i]->GetMatrix(); 00782 CSeekIntIntMap *mapG = vc[i]->GetGeneMap(); 00783 CSeekIntIntMap *mapDB = vc[i]->GetDBMap(); 00784 if(mapDB==NULL) continue; 00785 for(j=0; j<mapDB->GetNumSet(); j++){ 00786 if(vecstrDatasets[i]=="GSE19470.GPL5175.pcl"){ 00787 printf("Row %ld\n", j); 00788 for(k=0; k<mapG->GetNumSet(); k++){ 00789 printf("%d ", r[j][mapG->GetReverse(k)]); 00790 } 00791 printf("\n"); 00792 } 00793 } 00794 } 00795 00796 } 00797 00798 #ifdef WIN32 00799 pthread_win32_process_detach_np( ); 00800 #endif // WIN32 00801 return 0; }