Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "seekreader.h" 00023 00024 namespace Sleipnir { 00025 00026 string CSeekTools::ConvertInt(const int &number){ 00027 stringstream ss;//create a stringstream 00028 ss << number;//add number to the stream 00029 return ss.str();//return a string with the contents of the stream 00030 } 00031 00032 bool CSeekTools::IsNaN(const utype &v){ 00033 if(v==MAX_UTYPE) return true; 00034 return false; 00035 } 00036 00037 utype CSeekTools::GetNaN(){ 00038 return MAX_UTYPE; 00039 } 00040 00041 bool CSeekTools::ReadDatabaselets(const vector<CDatabase*> &DB, 00042 const size_t &iGenes, const size_t &iDatasets, 00043 const vector< vector<string> > &vecstrAllQuery, 00044 vector<CSeekDataset*> &vc, const map<string,utype> &mapstriGenes, 00045 const vector<vector<string> > &dbDatasets, 00046 const map<string,utype> &mapstriDatasets, 00047 //network mode (data sent to client) 00048 const int &iClient, const bool &bNetwork){ 00049 00050 //requires LoadDatabase to be called beforehand 00051 size_t i, j, k; 00052 vector<char> cAllQuery; 00053 00054 CSeekTools::InitVector(cAllQuery, iGenes, (char) 0); 00055 00056 for(i=0; i<vecstrAllQuery.size(); i++){ 00057 for(j=0; j<vecstrAllQuery[i].size(); j++){ 00058 if(mapstriGenes.find(vecstrAllQuery[i][j])==mapstriGenes.end()) continue; 00059 utype k = mapstriGenes.find(vecstrAllQuery[i][j])->second; 00060 cAllQuery[k] = 1; 00061 } 00062 } 00063 00064 vector<utype> allQ; 00065 for(i=0; i<cAllQuery.size(); i++) if(cAllQuery[i]==1) allQ.push_back(i); 00066 allQ.resize(allQ.size()); 00067 00068 //for now 00069 for(i=0; i<iDatasets; i++){ 00070 if(vc[i]->GetDBMap()!=NULL){ 00071 vc[i]->DeleteQueryBlock(); 00072 } 00073 } 00074 00075 int ret; //system call return 00076 00077 fprintf(stderr, "Initializing query map\n"); 00078 //ret = system("date +%s%N 1>&2"); 00079 /*if(bNetwork && CSeekNetwork::Send(iClient, "Initializing query map")==-1){ 00080 fprintf(stderr, "Error sending client message\n"); 00081 return false; 00082 }*/ 00083 00084 #pragma omp parallel for \ 00085 shared(allQ) private(i) schedule(dynamic) 00086 for(i=0; i<iDatasets; i++){ 00087 vc[i]->InitializeQueryBlock(allQ); 00088 } 00089 00090 fprintf(stderr, "Done initializing query map\n"); 00091 //ret = system("date +%s%N 1>&2"); 00092 /*if(bNetwork && CSeekNetwork::Send(iClient, 00093 "Done initializing query map")==-1){ 00094 fprintf(stderr, "Error sending client message\n"); 00095 return false; 00096 }*/ 00097 00098 fprintf(stderr, "Reading %lu query genes' correlations\n", 00099 allQ.size()); 00100 ret = system("date +%s%N 1>&2"); 00101 if(bNetwork && CSeekNetwork::Send(iClient, "Reading " + 00102 CSeekTools::ConvertInt(allQ.size()) + 00103 " query genes' correlations")==-1){ 00104 fprintf(stderr, "Error sending client message\n"); 00105 return false; 00106 } 00107 00108 size_t m, d; 00109 00110 for(i=0; i<allQ.size(); i++){ 00111 m = allQ[i]; 00112 for(d=0; d<DB.size(); d++){ //number of CDatabase collections 00113 vector<unsigned char> Qi; 00114 if(!DB[d]->GetGene(m, Qi)){ 00115 cerr << "Gene does not exist" << endl; 00116 continue; 00117 } 00118 utype db; 00119 CSeekIntIntMap *qu = NULL; 00120 unsigned char **r = NULL; 00121 vector<utype> vecDatasetID; 00122 for(j=0; j<dbDatasets[d].size(); j++){ 00123 utype qq = mapstriDatasets.find(dbDatasets[d][j])->second; 00124 vecDatasetID.push_back(qq); 00125 } 00126 #pragma omp parallel for \ 00127 shared(Qi) private(j, k) \ 00128 firstprivate(m, qu, r, db) schedule(dynamic) 00129 for(j=0; j<vecDatasetID.size(); j++){ 00130 if((qu=vc[vecDatasetID[j]]->GetDBMap())==NULL) continue; 00131 if(CSeekTools::IsNaN(db = (qu->GetForward(m)))) continue; 00132 for(r = vc[vecDatasetID[j]]->GetMatrix(), k=0; k<iGenes; k++) 00133 r[db][k] = Qi[k*vecDatasetID.size()+j]; 00134 } 00135 Qi.clear(); 00136 } 00137 } 00138 00139 fprintf(stderr, "Finished reading query genes' correlations\n"); 00140 ret = system("date +%s%N 1>&2"); 00141 if(bNetwork && CSeekNetwork::Send(iClient, 00142 "Finished reading query genes' correlations")==-1){ 00143 fprintf(stderr, "Error sending client message\n"); 00144 return false; 00145 } 00146 00147 return true; 00148 } 00149 00150 bool CSeekTools::ReadQuantFile(const string &strFile, vector<float> &quant, 00151 const int lineSize){ 00152 return CSeekTools::ReadQuantFile(strFile.c_str(), quant, lineSize); 00153 } 00154 00155 bool CSeekTools::ReadQuantFile(const char *file, vector<float> &quant, const int lineSize){ 00156 ifstream ifsm; 00157 ifsm.open(file); 00158 char acBuffer[lineSize]; 00159 utype c_iBuffer = lineSize; 00160 vector<string> vecstrLine; 00161 00162 ifsm.getline(acBuffer, c_iBuffer -1); 00163 //fprintf(stderr, "%s\n", acBuffer); 00164 CMeta::Tokenize( acBuffer, vecstrLine, " ", false); 00165 quant.clear(); 00166 utype i; 00167 for(i=0; i<vecstrLine.size(); i++){ 00168 quant.push_back(atof(vecstrLine[i].c_str())); 00169 //fprintf(stderr, "%.5f\n", atof(vecstrLine[i].c_str())); 00170 } 00171 quant.resize(quant.size()); 00172 ifsm.close(); 00173 return true; 00174 } 00175 00176 bool CSeekTools::LoadDatabase(const vector<CDatabase*> &DB, 00177 const size_t &iGenes, const size_t &iDatasets, 00178 vector<CSeekDataset*> &vc, const vector<CSeekDataset*> &vc_src, 00179 vector<CSeekPlatform> &vp, const vector<CSeekPlatform> &vp_src, 00180 const vector<string> &vecstrDatasets, 00181 const map<string, string> &mapstrstrDatasetPlatform, 00182 const map<string, utype> &mapstriPlatform){ 00183 00184 size_t i, j, k; 00185 00186 vc.clear(); 00187 vc.resize(iDatasets); 00188 vp.clear(); 00189 vp.resize(vp_src.size()); 00190 00191 for(i=0; i<vp.size(); i++) 00192 vp[i].Copy(vp_src[i]); 00193 00194 int ret; //system call returns 00195 00196 fprintf(stderr, "Initializing gene map\n"); 00197 //ret = system("date +%s%N 1>&2"); 00198 #pragma omp parallel for \ 00199 private(i) schedule(dynamic) 00200 for(i=0; i<iDatasets; i++){ 00201 vc[i] = new CSeekDataset(); 00202 vc[i]->Copy(vc_src[i]); 00203 string strFileStem = vecstrDatasets[i]; 00204 string strPlatform = 00205 mapstrstrDatasetPlatform.find(strFileStem)->second; 00206 utype platform_id = mapstriPlatform.find(strPlatform)->second; 00207 vc[i]->SetPlatform(vp[platform_id]); 00208 } 00209 00210 fprintf(stderr, "Done initializing gene map\n"); 00211 //ret = system("date +%s%N 1>&2"); 00212 return true; 00213 } 00214 00215 bool CSeekTools::LoadDatabase(const vector<CDatabase*> &DB, 00216 const size_t &iGenes, const size_t &iDatasets, 00217 const vector<CSeekDBSetting*> &DBSetting, 00218 const vector<string> &vecstrDatasets, 00219 const map<string, string> &mapstrstrDatasetPlatform, 00220 const map<string, utype> &mapstriPlatform, vector<CSeekPlatform> &vp, 00221 vector<CSeekDataset*> &vc, const vector<vector<string> > &dbDataset, 00222 const map<string,utype> &mapstriDataset, 00223 const bool bVariance, const bool bCorrelation){ 00224 00225 size_t i, j, k; 00226 vc.clear(); 00227 vc.resize(iDatasets); 00228 00229 if(bCorrelation){ 00230 for(i=0; i<DB.size(); i++){ 00231 if(DBSetting[i]->GetValue("sinfo")=="NA"){ 00232 fprintf(stderr, "sinfo parameter must be given.\n"); 00233 return false; 00234 } 00235 } 00236 } 00237 00238 if(bVariance){ 00239 for(i=0; i<DB.size(); i++){ 00240 if(DBSetting[i]->GetValue("gvar")=="NA"){ 00241 fprintf(stderr, "gene variance parameter must be given.\n"); 00242 return false; 00243 } 00244 } 00245 } 00246 00247 int ret; //system call return 00248 00249 fprintf(stderr, "Start reading average and presence files\n"); 00250 ret = system("date +%s%N 1>&2"); 00251 for(i=0; i<DB.size(); i++){ 00252 const vector<string> &dset = dbDataset[i]; 00253 string strPrepInputDirectory = DBSetting[i]->GetValue("prep"); 00254 string strGvarInputDirectory = DBSetting[i]->GetValue("gvar"); 00255 string strSinfoInputDirectory = DBSetting[i]->GetValue("sinfo"); 00256 00257 for(j=0; j<dset.size(); j++){ 00258 utype d = mapstriDataset.find(dset[j])->second; 00259 vc[d] = new CSeekDataset(); 00260 string strFileStem = dset[j]; 00261 string strAvgPath = strPrepInputDirectory + "/" + 00262 strFileStem + ".gavg"; 00263 string strPresencePath = strPrepInputDirectory + "/" + 00264 strFileStem + ".gpres"; 00265 vc[d]->ReadGeneAverage(strAvgPath); 00266 vc[d]->ReadGenePresence(strPresencePath); 00267 if(bVariance){ 00268 string strVariancePath = strGvarInputDirectory + "/" + 00269 strFileStem + ".gexpvar"; 00270 vc[d]->ReadGeneVariance(strVariancePath); 00271 } 00272 if(bCorrelation){ 00273 string strSinfoPath = strSinfoInputDirectory + "/" + 00274 strFileStem + ".sinfo"; 00275 vc[d]->ReadDatasetAverageStdev(strSinfoPath); 00276 } 00277 string strPlatform = 00278 mapstrstrDatasetPlatform.find(strFileStem)->second; 00279 utype platform_id = mapstriPlatform.find(strPlatform)->second; 00280 vc[d]->SetPlatform(vp[platform_id]); 00281 } 00282 } 00283 00284 fprintf(stderr, "Done reading average and presence files\n"); 00285 ret = system("date +%s%N 1>&2"); 00286 00287 fprintf(stderr, "Initializing gene map\n"); ret = system("date +%s%N 1>&2"); 00288 #pragma omp parallel for \ 00289 private(i) schedule(dynamic) 00290 for(i=0; i<iDatasets; i++) vc[i]->InitializeGeneMap(); 00291 00292 fprintf(stderr, "Done initializing gene map\n"); ret = system("date +%s%N 1>&2"); 00293 return true; 00294 } 00295 00296 bool CSeekTools::ReadPlatforms(const string &strPlatformDirectory, 00297 vector<CSeekPlatform> &plat, vector<string> &vecstrPlatforms, 00298 map<string, utype> &mapstriPlatforms, const int lineSize){ 00299 return CSeekTools::ReadPlatforms(strPlatformDirectory.c_str(), plat, 00300 vecstrPlatforms, mapstriPlatforms, lineSize); 00301 } 00302 00303 bool CSeekTools::ReadPlatforms(const char *plat_dir, 00304 vector<CSeekPlatform> &plat, vector<string> &vecstrPlatforms, 00305 map<string, utype> &mapstriPlatforms, const int lineSize){ 00306 00307 string strPlatformDirectory = plat_dir; 00308 string strAvgFile = strPlatformDirectory + "/" + 00309 "all_platforms.gplatavg"; 00310 string strStdevFile = strPlatformDirectory + "/" + 00311 "all_platforms.gplatstdev"; 00312 string strPlatformOrderFile = strPlatformDirectory + "/" + 00313 "all_platforms.gplatorder"; 00314 00315 CFullMatrix<float> plat_avg; 00316 plat_avg.Open(strAvgFile.c_str()); 00317 CFullMatrix<float> plat_stdev; 00318 plat_stdev.Open(strStdevFile.c_str()); 00319 plat.clear(); 00320 plat.resize(plat_avg.GetRows()); 00321 utype i, j; 00322 00323 vecstrPlatforms.clear(); 00324 mapstriPlatforms.clear(); 00325 ifstream ifsm; 00326 ifsm.open(strPlatformOrderFile.c_str()); 00327 char acBuffer[lineSize]; 00328 utype c_iBuffer = lineSize; 00329 i = 0; 00330 while(!ifsm.eof()){ 00331 ifsm.getline(acBuffer, c_iBuffer -1); 00332 if(acBuffer[0]==0) break; 00333 acBuffer[c_iBuffer-1] = 0; 00334 vecstrPlatforms.push_back(acBuffer); 00335 mapstriPlatforms[acBuffer] = i; 00336 i++; 00337 } 00338 vecstrPlatforms.resize(vecstrPlatforms.size()); 00339 ifsm.close(); 00340 00341 for(i=0; i<plat_avg.GetRows(); i++){ 00342 plat[i].InitializePlatform(plat_avg.GetColumns(), vecstrPlatforms[i]); 00343 for(j=0; j<plat_avg.GetColumns(); j++){ 00344 plat[i].SetPlatformAvg(j, plat_avg.Get(i, j)); 00345 plat[i].SetPlatformStdev(j, plat_stdev.Get(i, j)); 00346 } 00347 } 00348 00349 return true; 00350 } 00351 00352 bool CSeekTools::ReadListTwoColumns(const string &strFile, 00353 vector<string> &vecstrList1, vector<string> &vecstrList2, const int lineSize){ 00354 return CSeekTools::ReadListTwoColumns(strFile.c_str(), 00355 vecstrList1, vecstrList2, lineSize); 00356 } 00357 00358 bool CSeekTools::ReadListTwoColumns(const char *file, 00359 vector<string> &vecstrList1, vector<string> &vecstrList2, 00360 const int lineSize){ 00361 ifstream ifsm; 00362 ifsm.open(file); 00363 if(!ifsm.is_open()){ 00364 fprintf(stderr, "Error opening file %s\n", file); 00365 return false; 00366 } 00367 char acBuffer[lineSize]; 00368 utype c_iBuffer = lineSize; 00369 vecstrList1.clear(); 00370 vecstrList2.clear(); 00371 00372 while(!ifsm.eof()){ 00373 ifsm.getline(acBuffer, c_iBuffer -1); 00374 if(acBuffer[0]==0) break; 00375 acBuffer[c_iBuffer-1] = 0; 00376 vector<string> tok; 00377 CMeta::Tokenize(acBuffer, tok); 00378 vecstrList1.push_back(tok[0]); 00379 vecstrList2.push_back(tok[1]); 00380 } 00381 vecstrList1.resize(vecstrList1.size()); 00382 vecstrList2.resize(vecstrList2.size()); 00383 ifsm.close(); 00384 return true; 00385 } 00386 00387 bool CSeekTools::ReadListOneColumn(const string &strFile, 00388 vector<string> &vecstrList, CSeekStrIntMap &mapstriList, const int lineSize){ 00389 return CSeekTools::ReadListOneColumn(strFile.c_str(), 00390 vecstrList, mapstriList, lineSize); 00391 } 00392 00393 00394 bool CSeekTools::ReadListOneColumn(const char *file, 00395 vector<string> &vecstrList, CSeekStrIntMap &mapstriList, const int lineSize){ 00396 ifstream ifsm; 00397 ifsm.open(file); 00398 if(!ifsm.is_open()){ 00399 fprintf(stderr, "Error opening file %s\n", file); 00400 return false; 00401 } 00402 00403 char acBuffer[lineSize]; 00404 utype c_iBuffer = lineSize; 00405 vecstrList.clear(); 00406 00407 int i = 0; 00408 while(!ifsm.eof()){ 00409 ifsm.getline(acBuffer, c_iBuffer -1); 00410 if(acBuffer[0]==0) break; 00411 acBuffer[c_iBuffer-1] = 0; 00412 string line = acBuffer; 00413 vecstrList.push_back(line); 00414 mapstriList.Set(line, i); 00415 i++; 00416 } 00417 vecstrList.resize(vecstrList.size()); 00418 ifsm.close(); 00419 return true; 00420 } 00421 00422 bool CSeekTools::ReadMultipleQueries(const string &strFile, 00423 vector< vector<string> > &qList, const int lineSize){ 00424 return CSeekTools::ReadMultipleQueries(strFile.c_str(), qList, lineSize); 00425 } 00426 00427 bool CSeekTools::ReadMultipleQueries(const char *file, 00428 vector< vector<string> > &qList, const int lineSize){ 00429 qList.clear(); 00430 FILE *infile; 00431 if((infile=fopen(file, "r"))==NULL){ 00432 fprintf(stderr, "Error opening file %s\n", file); 00433 return false; 00434 } 00435 00436 char *acBuffer; 00437 int MAX_CHAR_PER_LINE = lineSize; 00438 int lineLen = MAX_CHAR_PER_LINE; 00439 acBuffer = (char*)malloc(lineLen); 00440 while(fgets(acBuffer, lineLen, infile)!=NULL){ 00441 while(strlen(acBuffer)==lineLen-1){ 00442 int len = strlen(acBuffer); 00443 fseek(infile, -len, SEEK_CUR); 00444 lineLen+=MAX_CHAR_PER_LINE; 00445 acBuffer = (char*)realloc(acBuffer, lineLen); 00446 char *ret = fgets(acBuffer, lineLen, infile); 00447 } 00448 } 00449 rewind(infile); 00450 00451 while(fgets(acBuffer, lineLen, infile)!=NULL){ 00452 char *p = strtok(acBuffer, "\n"); 00453 vector<string> tok; 00454 CMeta::Tokenize(p, tok, " "); 00455 qList.push_back(tok); 00456 } 00457 qList.resize(qList.size()); 00458 free(acBuffer); 00459 00460 fclose(infile); 00461 return true; 00462 } 00463 00464 bool CSeekTools::ReadMultipleNotQueries(const char *file, 00465 vector<vector<vector<string> > > &qList, const int lineSize){ 00466 qList.clear(); 00467 FILE *infile; 00468 if((infile=fopen(file, "r"))==NULL){ 00469 fprintf(stderr, "Error opening file %s\n", file); 00470 return false; 00471 } 00472 00473 char *acBuffer; 00474 int MAX_CHAR_PER_LINE = lineSize; 00475 int lineLen = MAX_CHAR_PER_LINE; 00476 acBuffer = (char*)malloc(lineLen); 00477 while(fgets(acBuffer, lineLen, infile)!=NULL){ 00478 while(strlen(acBuffer)==lineLen-1){ 00479 int len = strlen(acBuffer); 00480 fseek(infile, -len, SEEK_CUR); 00481 lineLen+=MAX_CHAR_PER_LINE; 00482 acBuffer = (char*)realloc(acBuffer, lineLen); 00483 char *ret = fgets(acBuffer, lineLen, infile); 00484 } 00485 } 00486 rewind(infile); 00487 00488 while(fgets(acBuffer, lineLen, infile)!=NULL){ 00489 char *p = strtok(acBuffer, "\n"); 00490 vector<vector<string> > aQ; 00491 vector<string> tok; 00492 CMeta::Tokenize(p, tok, "|"); 00493 aQ.resize(tok.size()); 00494 int i; 00495 for(i=0; i<tok.size(); i++){ 00496 vector<string> tmp; 00497 CMeta::Tokenize(tok[i].c_str(), tmp, " "); 00498 aQ[i] = tmp; 00499 } 00500 qList.push_back(aQ); 00501 } 00502 qList.resize(qList.size()); 00503 free(acBuffer); 00504 fclose(infile); 00505 return true; 00506 } 00507 00508 bool CSeekTools::ReadMultiGeneOneLine(const string &strFile, 00509 vector<string> &list, const int lineSize){ 00510 return CSeekTools::ReadMultiGeneOneLine(strFile.c_str(), list, lineSize); 00511 } 00512 00513 bool CSeekTools::ReadMultiGeneOneLine(const char *file, 00514 vector<string> &list, const int lineSize){ 00515 list.clear(); 00516 ifstream ifsm; 00517 ifsm.open(file); 00518 if(!ifsm.is_open()){ 00519 fprintf(stderr, "Error opening file %s\n", file); 00520 return false; 00521 } 00522 00523 char *acBuffer= (char*)malloc(lineSize); 00524 int c_iBuffer = lineSize; 00525 int i = 0; 00526 00527 ifsm.getline(acBuffer, c_iBuffer -1); 00528 acBuffer[c_iBuffer-1] = 0; 00529 vector<string> tok; 00530 CMeta::Tokenize(acBuffer, tok, " "); 00531 for(i = 0; i<tok.size(); i++){ 00532 list.push_back(tok[i]); 00533 } 00534 00535 list.resize(list.size()); 00536 ifsm.close(); 00537 free(acBuffer); 00538 return true; 00539 } 00540 00541 bool CSeekTools::ReadListOneColumn(const string &strFile, 00542 vector<string> &vecstrList, const int lineSize){ 00543 return CSeekTools::ReadListOneColumn(strFile.c_str(), vecstrList, lineSize); 00544 } 00545 00546 bool CSeekTools::ReadListOneColumn(const char *file, 00547 vector<string> &vecstrList, const int lineSize){ 00548 ifstream ifsm; 00549 ifsm.open(file); 00550 00551 if(!ifsm.is_open()){ 00552 fprintf(stderr, "Error opening file %s\n", file); 00553 return false; 00554 } 00555 char acBuffer[lineSize]; 00556 utype c_iBuffer = lineSize; 00557 vecstrList.clear(); 00558 int i = 0; 00559 while(!ifsm.eof()){ 00560 ifsm.getline(acBuffer, c_iBuffer -1); 00561 if(acBuffer[0]==0) break; 00562 acBuffer[c_iBuffer-1] = 0; 00563 string line = acBuffer; 00564 vecstrList.push_back(line); 00565 } 00566 vecstrList.resize(vecstrList.size()); 00567 ifsm.close(); 00568 return true; 00569 } 00570 00571 }