Sleipnir
src/seekreader.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "seekreader.h"
00023 
00024 namespace Sleipnir {
00025 
00026 string CSeekTools::ConvertInt(const int &number){
00027     stringstream ss;//create a stringstream
00028     ss << number;//add number to the stream
00029     return ss.str();//return a string with the contents of the stream
00030 }
00031 
00032 bool CSeekTools::IsNaN(const utype &v){
00033     if(v==MAX_UTYPE) return true;
00034     return false;
00035 }
00036 
00037 utype CSeekTools::GetNaN(){
00038     return MAX_UTYPE;
00039 }
00040 
00041 bool CSeekTools::ReadDatabaselets(const vector<CDatabase*> &DB,
00042     const size_t &iGenes, const size_t &iDatasets,
00043     const vector< vector<string> > &vecstrAllQuery,
00044     vector<CSeekDataset*> &vc, const map<string,utype> &mapstriGenes,
00045     const vector<vector<string> > &dbDatasets,
00046     const map<string,utype> &mapstriDatasets,
00047     //network mode (data sent to client)
00048     const int &iClient, const bool &bNetwork){
00049 
00050     //requires LoadDatabase to be called beforehand
00051     size_t i, j, k;
00052     vector<char> cAllQuery;
00053 
00054     CSeekTools::InitVector(cAllQuery, iGenes, (char) 0);
00055 
00056     for(i=0; i<vecstrAllQuery.size(); i++){
00057         for(j=0; j<vecstrAllQuery[i].size(); j++){
00058             if(mapstriGenes.find(vecstrAllQuery[i][j])==mapstriGenes.end()) continue;
00059             utype k = mapstriGenes.find(vecstrAllQuery[i][j])->second;
00060             cAllQuery[k] = 1;
00061         }
00062     }
00063 
00064     vector<utype> allQ;
00065     for(i=0; i<cAllQuery.size(); i++) if(cAllQuery[i]==1) allQ.push_back(i);
00066     allQ.resize(allQ.size());
00067 
00068     //for now
00069     for(i=0; i<iDatasets; i++){
00070         if(vc[i]->GetDBMap()!=NULL){
00071             vc[i]->DeleteQueryBlock();
00072         }
00073     }
00074 
00075     int ret; //system call return
00076 
00077     fprintf(stderr, "Initializing query map\n"); 
00078     //ret = system("date +%s%N 1>&2");
00079     /*if(bNetwork && CSeekNetwork::Send(iClient, "Initializing query map")==-1){
00080         fprintf(stderr, "Error sending client message\n");
00081         return false;
00082     }*/
00083     
00084     #pragma omp parallel for \
00085     shared(allQ) private(i) schedule(dynamic)
00086     for(i=0; i<iDatasets; i++){
00087         vc[i]->InitializeQueryBlock(allQ);
00088     }
00089 
00090     fprintf(stderr, "Done initializing query map\n");
00091     //ret = system("date +%s%N 1>&2");
00092     /*if(bNetwork && CSeekNetwork::Send(iClient, 
00093         "Done initializing query map")==-1){
00094         fprintf(stderr, "Error sending client message\n");
00095         return false;
00096     }*/
00097 
00098     fprintf(stderr, "Reading %lu query genes' correlations\n",
00099         allQ.size());
00100     ret = system("date +%s%N 1>&2");
00101     if(bNetwork && CSeekNetwork::Send(iClient, "Reading " + 
00102         CSeekTools::ConvertInt(allQ.size()) + 
00103         " query genes' correlations")==-1){
00104         fprintf(stderr, "Error sending client message\n");
00105         return false;
00106     }
00107 
00108     size_t m, d;
00109 
00110     for(i=0; i<allQ.size(); i++){
00111         m = allQ[i];
00112         for(d=0; d<DB.size(); d++){ //number of CDatabase collections
00113             vector<unsigned char> Qi;
00114             if(!DB[d]->GetGene(m, Qi)){
00115                 cerr << "Gene does not exist" << endl;
00116                 continue;
00117             }
00118             utype db;
00119             CSeekIntIntMap *qu = NULL;
00120             unsigned char **r = NULL;
00121             vector<utype> vecDatasetID;
00122             for(j=0; j<dbDatasets[d].size(); j++){
00123                 utype qq = mapstriDatasets.find(dbDatasets[d][j])->second;
00124                 vecDatasetID.push_back(qq);
00125             }
00126             #pragma omp parallel for \
00127             shared(Qi) private(j, k) \
00128             firstprivate(m, qu, r, db) schedule(dynamic)
00129             for(j=0; j<vecDatasetID.size(); j++){
00130                 if((qu=vc[vecDatasetID[j]]->GetDBMap())==NULL) continue;
00131                 if(CSeekTools::IsNaN(db = (qu->GetForward(m)))) continue;
00132                 for(r = vc[vecDatasetID[j]]->GetMatrix(), k=0; k<iGenes; k++)
00133                     r[db][k] = Qi[k*vecDatasetID.size()+j];
00134             }
00135             Qi.clear();
00136         }
00137     }
00138 
00139     fprintf(stderr, "Finished reading query genes' correlations\n");
00140     ret = system("date +%s%N 1>&2");
00141     if(bNetwork && CSeekNetwork::Send(iClient, 
00142         "Finished reading query genes' correlations")==-1){
00143         fprintf(stderr, "Error sending client message\n");
00144         return false;
00145     }
00146 
00147     return true;
00148 }
00149     
00150 bool CSeekTools::ReadQuantFile(const string &strFile, vector<float> &quant,
00151     const int lineSize){
00152     return CSeekTools::ReadQuantFile(strFile.c_str(), quant, lineSize);
00153 }
00154 
00155 bool CSeekTools::ReadQuantFile(const char *file, vector<float> &quant, const int lineSize){
00156     ifstream ifsm;
00157     ifsm.open(file);
00158     char acBuffer[lineSize];
00159     utype c_iBuffer = lineSize;
00160     vector<string> vecstrLine;
00161 
00162     ifsm.getline(acBuffer, c_iBuffer -1);
00163     //fprintf(stderr, "%s\n", acBuffer);
00164     CMeta::Tokenize( acBuffer, vecstrLine, " ", false);
00165     quant.clear();
00166     utype i;
00167     for(i=0; i<vecstrLine.size(); i++){
00168         quant.push_back(atof(vecstrLine[i].c_str()));
00169         //fprintf(stderr, "%.5f\n", atof(vecstrLine[i].c_str()));
00170     }
00171     quant.resize(quant.size());
00172     ifsm.close();
00173     return true;
00174 }
00175 
00176 bool CSeekTools::LoadDatabase(const vector<CDatabase*> &DB,
00177     const size_t &iGenes, const size_t &iDatasets,
00178     vector<CSeekDataset*> &vc, const vector<CSeekDataset*> &vc_src, 
00179     vector<CSeekPlatform> &vp, const vector<CSeekPlatform> &vp_src, 
00180     const vector<string> &vecstrDatasets,
00181     const map<string, string> &mapstrstrDatasetPlatform, 
00182     const map<string, utype> &mapstriPlatform){
00183 
00184     size_t i, j, k;
00185 
00186     vc.clear();
00187     vc.resize(iDatasets);
00188     vp.clear();
00189     vp.resize(vp_src.size());
00190 
00191     for(i=0; i<vp.size(); i++)
00192         vp[i].Copy(vp_src[i]);
00193 
00194     int ret; //system call returns
00195 
00196     fprintf(stderr, "Initializing gene map\n");
00197     //ret = system("date +%s%N 1>&2");
00198     #pragma omp parallel for \
00199     private(i) schedule(dynamic)
00200     for(i=0; i<iDatasets; i++){
00201         vc[i] = new CSeekDataset();
00202         vc[i]->Copy(vc_src[i]);
00203         string strFileStem = vecstrDatasets[i];
00204         string strPlatform =
00205             mapstrstrDatasetPlatform.find(strFileStem)->second;
00206         utype platform_id = mapstriPlatform.find(strPlatform)->second;
00207         vc[i]->SetPlatform(vp[platform_id]);
00208     }
00209 
00210     fprintf(stderr, "Done initializing gene map\n"); 
00211     //ret = system("date +%s%N 1>&2");
00212     return true;
00213 }
00214 
00215 bool CSeekTools::LoadDatabase(const vector<CDatabase*> &DB,
00216     const size_t &iGenes, const size_t &iDatasets,
00217     const vector<CSeekDBSetting*> &DBSetting,
00218     const vector<string> &vecstrDatasets,
00219     const map<string, string> &mapstrstrDatasetPlatform,
00220     const map<string, utype> &mapstriPlatform, vector<CSeekPlatform> &vp,
00221     vector<CSeekDataset*> &vc, const vector<vector<string> > &dbDataset,
00222     const map<string,utype> &mapstriDataset,
00223     const bool bVariance, const bool bCorrelation){
00224 
00225     size_t i, j, k;
00226     vc.clear();
00227     vc.resize(iDatasets);
00228 
00229     if(bCorrelation){
00230         for(i=0; i<DB.size(); i++){
00231             if(DBSetting[i]->GetValue("sinfo")=="NA"){
00232                 fprintf(stderr, "sinfo parameter must be given.\n");
00233                 return false;
00234             }
00235         }
00236     }
00237 
00238     if(bVariance){
00239         for(i=0; i<DB.size(); i++){
00240             if(DBSetting[i]->GetValue("gvar")=="NA"){
00241                 fprintf(stderr, "gene variance parameter must be given.\n");
00242                 return false;
00243             }
00244         }
00245     }
00246 
00247     int ret; //system call return
00248 
00249     fprintf(stderr, "Start reading average and presence files\n");
00250     ret = system("date +%s%N 1>&2");
00251     for(i=0; i<DB.size(); i++){
00252         const vector<string> &dset = dbDataset[i];
00253         string strPrepInputDirectory = DBSetting[i]->GetValue("prep");
00254         string strGvarInputDirectory = DBSetting[i]->GetValue("gvar");
00255         string strSinfoInputDirectory = DBSetting[i]->GetValue("sinfo");
00256 
00257         for(j=0; j<dset.size(); j++){
00258             utype d = mapstriDataset.find(dset[j])->second;
00259             vc[d] = new CSeekDataset();
00260             string strFileStem = dset[j];
00261             string strAvgPath = strPrepInputDirectory + "/" +
00262                 strFileStem + ".gavg";
00263             string strPresencePath = strPrepInputDirectory + "/" +
00264                 strFileStem + ".gpres";
00265             vc[d]->ReadGeneAverage(strAvgPath);
00266             vc[d]->ReadGenePresence(strPresencePath);
00267             if(bVariance){
00268                 string strVariancePath = strGvarInputDirectory + "/" +
00269                     strFileStem + ".gexpvar";
00270                 vc[d]->ReadGeneVariance(strVariancePath);
00271             }
00272             if(bCorrelation){
00273                 string strSinfoPath = strSinfoInputDirectory + "/" +
00274                     strFileStem + ".sinfo";
00275                 vc[d]->ReadDatasetAverageStdev(strSinfoPath);
00276             }
00277             string strPlatform =
00278                 mapstrstrDatasetPlatform.find(strFileStem)->second;
00279             utype platform_id = mapstriPlatform.find(strPlatform)->second;
00280             vc[d]->SetPlatform(vp[platform_id]);
00281         }
00282     }
00283 
00284     fprintf(stderr, "Done reading average and presence files\n");
00285     ret = system("date +%s%N 1>&2");
00286 
00287     fprintf(stderr, "Initializing gene map\n"); ret = system("date +%s%N 1>&2");
00288     #pragma omp parallel for \
00289     private(i) schedule(dynamic)
00290     for(i=0; i<iDatasets; i++) vc[i]->InitializeGeneMap();
00291 
00292     fprintf(stderr, "Done initializing gene map\n"); ret = system("date +%s%N 1>&2");
00293     return true;
00294 }
00295 
00296 bool CSeekTools::ReadPlatforms(const string &strPlatformDirectory,
00297         vector<CSeekPlatform> &plat, vector<string> &vecstrPlatforms,
00298         map<string, utype> &mapstriPlatforms, const int lineSize){
00299     return CSeekTools::ReadPlatforms(strPlatformDirectory.c_str(), plat,
00300         vecstrPlatforms, mapstriPlatforms, lineSize);
00301 }
00302 
00303 bool CSeekTools::ReadPlatforms(const char *plat_dir,
00304         vector<CSeekPlatform> &plat, vector<string> &vecstrPlatforms,
00305         map<string, utype> &mapstriPlatforms, const int lineSize){
00306 
00307     string strPlatformDirectory = plat_dir;
00308     string strAvgFile = strPlatformDirectory + "/" +
00309         "all_platforms.gplatavg";
00310     string strStdevFile = strPlatformDirectory + "/" +
00311         "all_platforms.gplatstdev";
00312     string strPlatformOrderFile = strPlatformDirectory + "/" +
00313         "all_platforms.gplatorder";
00314 
00315     CFullMatrix<float> plat_avg;
00316     plat_avg.Open(strAvgFile.c_str());
00317     CFullMatrix<float> plat_stdev;
00318     plat_stdev.Open(strStdevFile.c_str());
00319     plat.clear();
00320     plat.resize(plat_avg.GetRows());
00321     utype i, j;
00322 
00323     vecstrPlatforms.clear();
00324     mapstriPlatforms.clear();
00325     ifstream ifsm;
00326     ifsm.open(strPlatformOrderFile.c_str());
00327     char acBuffer[lineSize];
00328     utype c_iBuffer = lineSize;
00329     i = 0;
00330     while(!ifsm.eof()){
00331         ifsm.getline(acBuffer, c_iBuffer -1);
00332         if(acBuffer[0]==0) break;
00333         acBuffer[c_iBuffer-1] = 0;
00334         vecstrPlatforms.push_back(acBuffer);
00335         mapstriPlatforms[acBuffer] = i;
00336         i++;
00337     }
00338     vecstrPlatforms.resize(vecstrPlatforms.size());
00339     ifsm.close();
00340 
00341     for(i=0; i<plat_avg.GetRows(); i++){
00342         plat[i].InitializePlatform(plat_avg.GetColumns(), vecstrPlatforms[i]);
00343         for(j=0; j<plat_avg.GetColumns(); j++){
00344             plat[i].SetPlatformAvg(j, plat_avg.Get(i, j));
00345             plat[i].SetPlatformStdev(j, plat_stdev.Get(i, j));
00346         }
00347     }
00348 
00349     return true;
00350 }
00351 
00352 bool CSeekTools::ReadListTwoColumns(const string &strFile,
00353         vector<string> &vecstrList1, vector<string> &vecstrList2, const int lineSize){
00354     return CSeekTools::ReadListTwoColumns(strFile.c_str(),
00355         vecstrList1, vecstrList2, lineSize);
00356 }
00357 
00358 bool CSeekTools::ReadListTwoColumns(const char *file,
00359         vector<string> &vecstrList1, vector<string> &vecstrList2,
00360         const int lineSize){
00361     ifstream ifsm;
00362     ifsm.open(file);
00363     if(!ifsm.is_open()){
00364         fprintf(stderr, "Error opening file %s\n", file);
00365         return false;
00366     }
00367     char acBuffer[lineSize];
00368     utype c_iBuffer = lineSize;
00369     vecstrList1.clear();
00370     vecstrList2.clear();
00371 
00372     while(!ifsm.eof()){
00373         ifsm.getline(acBuffer, c_iBuffer -1);
00374         if(acBuffer[0]==0) break;
00375         acBuffer[c_iBuffer-1] = 0;
00376         vector<string> tok;
00377         CMeta::Tokenize(acBuffer, tok);
00378         vecstrList1.push_back(tok[0]);
00379         vecstrList2.push_back(tok[1]);
00380     }
00381     vecstrList1.resize(vecstrList1.size());
00382     vecstrList2.resize(vecstrList2.size());
00383     ifsm.close();
00384     return true;
00385 }
00386 
00387 bool CSeekTools::ReadListOneColumn(const string &strFile,
00388     vector<string> &vecstrList, CSeekStrIntMap &mapstriList, const int lineSize){
00389     return CSeekTools::ReadListOneColumn(strFile.c_str(),
00390         vecstrList, mapstriList, lineSize);
00391 }
00392 
00393 
00394 bool CSeekTools::ReadListOneColumn(const char *file,
00395     vector<string> &vecstrList, CSeekStrIntMap &mapstriList, const int lineSize){
00396     ifstream ifsm;
00397     ifsm.open(file);
00398     if(!ifsm.is_open()){
00399         fprintf(stderr, "Error opening file %s\n", file);
00400         return false;
00401     }
00402 
00403     char acBuffer[lineSize];
00404     utype c_iBuffer = lineSize;
00405     vecstrList.clear();
00406 
00407     int i = 0;
00408     while(!ifsm.eof()){
00409         ifsm.getline(acBuffer, c_iBuffer -1);
00410         if(acBuffer[0]==0) break;
00411         acBuffer[c_iBuffer-1] = 0;
00412         string line = acBuffer;
00413         vecstrList.push_back(line);
00414         mapstriList.Set(line, i);
00415         i++;
00416     }
00417     vecstrList.resize(vecstrList.size());
00418     ifsm.close();
00419     return true;
00420 }
00421 
00422 bool CSeekTools::ReadMultipleQueries(const string &strFile,
00423     vector< vector<string> > &qList, const int lineSize){
00424     return CSeekTools::ReadMultipleQueries(strFile.c_str(), qList, lineSize);
00425 }
00426 
00427 bool CSeekTools::ReadMultipleQueries(const char *file,
00428     vector< vector<string> > &qList, const int lineSize){
00429     qList.clear();
00430     FILE *infile;
00431     if((infile=fopen(file, "r"))==NULL){
00432         fprintf(stderr, "Error opening file %s\n", file);
00433         return false;
00434     }
00435 
00436     char *acBuffer;
00437     int MAX_CHAR_PER_LINE = lineSize;
00438     int lineLen = MAX_CHAR_PER_LINE;
00439     acBuffer = (char*)malloc(lineLen);
00440     while(fgets(acBuffer, lineLen, infile)!=NULL){
00441         while(strlen(acBuffer)==lineLen-1){
00442             int len = strlen(acBuffer);
00443             fseek(infile, -len, SEEK_CUR);
00444             lineLen+=MAX_CHAR_PER_LINE;
00445             acBuffer = (char*)realloc(acBuffer, lineLen);
00446             char *ret = fgets(acBuffer, lineLen, infile);
00447         }
00448     }
00449     rewind(infile);
00450 
00451     while(fgets(acBuffer, lineLen, infile)!=NULL){
00452         char *p = strtok(acBuffer, "\n");
00453         vector<string> tok;
00454         CMeta::Tokenize(p, tok, " ");
00455         qList.push_back(tok);
00456     }
00457     qList.resize(qList.size());
00458     free(acBuffer);
00459 
00460     fclose(infile);
00461     return true;
00462 }
00463 
00464 bool CSeekTools::ReadMultipleNotQueries(const char *file,
00465     vector<vector<vector<string> > > &qList, const int lineSize){
00466     qList.clear();
00467     FILE *infile;
00468     if((infile=fopen(file, "r"))==NULL){
00469         fprintf(stderr, "Error opening file %s\n", file);
00470         return false;
00471     }
00472 
00473     char *acBuffer;
00474     int MAX_CHAR_PER_LINE = lineSize;
00475     int lineLen = MAX_CHAR_PER_LINE;
00476     acBuffer = (char*)malloc(lineLen);
00477     while(fgets(acBuffer, lineLen, infile)!=NULL){
00478         while(strlen(acBuffer)==lineLen-1){
00479             int len = strlen(acBuffer);
00480             fseek(infile, -len, SEEK_CUR);
00481             lineLen+=MAX_CHAR_PER_LINE;
00482             acBuffer = (char*)realloc(acBuffer, lineLen);
00483             char *ret = fgets(acBuffer, lineLen, infile);
00484         }
00485     }
00486     rewind(infile);
00487 
00488     while(fgets(acBuffer, lineLen, infile)!=NULL){
00489         char *p = strtok(acBuffer, "\n");
00490         vector<vector<string> > aQ;
00491         vector<string> tok;
00492         CMeta::Tokenize(p, tok, "|");
00493         aQ.resize(tok.size());
00494         int i;
00495         for(i=0; i<tok.size(); i++){
00496             vector<string> tmp;
00497             CMeta::Tokenize(tok[i].c_str(), tmp, " ");
00498             aQ[i] = tmp;
00499         }
00500         qList.push_back(aQ);
00501     }
00502     qList.resize(qList.size());
00503     free(acBuffer);
00504     fclose(infile);
00505     return true;
00506 }
00507 
00508 bool CSeekTools::ReadMultiGeneOneLine(const string &strFile,
00509     vector<string> &list, const int lineSize){
00510     return CSeekTools::ReadMultiGeneOneLine(strFile.c_str(), list, lineSize);
00511 }
00512 
00513 bool CSeekTools::ReadMultiGeneOneLine(const char *file,
00514     vector<string> &list, const int lineSize){
00515     list.clear();
00516     ifstream ifsm;
00517     ifsm.open(file);
00518     if(!ifsm.is_open()){
00519         fprintf(stderr, "Error opening file %s\n", file);
00520         return false;
00521     }
00522 
00523     char *acBuffer= (char*)malloc(lineSize);
00524     int c_iBuffer = lineSize;
00525     int i = 0;
00526 
00527     ifsm.getline(acBuffer, c_iBuffer -1);
00528     acBuffer[c_iBuffer-1] = 0;
00529     vector<string> tok;
00530     CMeta::Tokenize(acBuffer, tok, " ");
00531     for(i = 0; i<tok.size(); i++){
00532         list.push_back(tok[i]);
00533     }
00534 
00535     list.resize(list.size());
00536     ifsm.close();
00537     free(acBuffer);
00538     return true;
00539 }
00540 
00541 bool CSeekTools::ReadListOneColumn(const string &strFile,
00542     vector<string> &vecstrList, const int lineSize){
00543     return CSeekTools::ReadListOneColumn(strFile.c_str(), vecstrList, lineSize);
00544 }
00545 
00546 bool CSeekTools::ReadListOneColumn(const char *file,
00547     vector<string> &vecstrList, const int lineSize){
00548     ifstream ifsm;
00549     ifsm.open(file);
00550 
00551     if(!ifsm.is_open()){
00552         fprintf(stderr, "Error opening file %s\n", file);
00553         return false;
00554     }
00555     char acBuffer[lineSize];
00556     utype c_iBuffer = lineSize;
00557     vecstrList.clear();
00558     int i = 0;
00559     while(!ifsm.eof()){
00560         ifsm.getline(acBuffer, c_iBuffer -1);
00561         if(acBuffer[0]==0) break;
00562         acBuffer[c_iBuffer-1] = 0;
00563         string line = acBuffer;
00564         vecstrList.push_back(line);
00565     }
00566     vecstrList.resize(vecstrList.size());
00567     ifsm.close();
00568     return true;
00569 }
00570 
00571 }