Sleipnir
tools/SeekReader/SeekReader.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 
00026 int main( int iArgs, char** aszArgs ) {
00027     static const size_t c_iBuffer   = 1024;
00028 #ifdef WIN32
00029     pthread_win32_process_attach_np( );
00030 #endif // WIN32
00031     gengetopt_args_info sArgs;
00032     ifstream            ifsm;
00033     istream*            pistm;
00034     vector<string>      vecstrLine, vecstrGenes, vecstrDatasets, vecstrUserDatasets;
00035     char                acBuffer[ c_iBuffer ];
00036     size_t              i;
00037 
00038     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00039         cmdline_parser_print_help( );
00040         return 1; }
00041 
00042     vector<string> vecstrGeneID;
00043     map<string, utype> mapstrintGene;
00044     if(!CSeekTools::ReadListTwoColumns(sArgs.input_arg, vecstrGeneID, vecstrGenes))
00045         return false;
00046 
00047     for(i=0; i<vecstrGenes.size(); i++)
00048         mapstrintGene[vecstrGenes[i]] = i;
00049 
00050     if(sArgs.weight2_flag==1){
00051         vector<string> vecstrDataset;
00052         if(!CSeekTools::ReadListOneColumn(sArgs.dweight_map_arg, vecstrDataset))
00053             return false;
00054         vector<vector<float> > vec_score, orig_score;
00055         utype i, j;
00056         int num_query = sArgs.dweight_num_arg; //random query
00057         orig_score.resize(num_query);
00058 
00059         vec_score.resize(vecstrDataset.size());
00060         for(j=0; j<vec_score.size(); j++)
00061             vec_score[j].resize(num_query);
00062 
00063         char x[256];
00064         for(i=0; i<num_query; i++){ //i is query id
00065             vector<float> v;
00066             sprintf(x, "%s/%d.dweight", sArgs.dweight_dir_arg, i);
00067             CSeekTools::ReadArray(x, v);
00068             orig_score[i] = v;
00069             for(j=0; j<vec_score.size(); j++) //j is dataset id
00070                 vec_score[j][i] = v[j];
00071         }
00072 
00073         for(j=0; j<vec_score.size(); j++)
00074             sort(vec_score[j].begin(), vec_score[j].end());
00075 
00076         int test_num_query = sArgs.dweight_test_num_arg;
00077         for(i=0; i<test_num_query; i++){ //i is query id
00078             fprintf(stderr, "Query %d\n", i);
00079             vector<float> v;
00080             sprintf(x, "%s/%d.dweight", sArgs.dweight_test_dir_arg, i);
00081             CSeekTools::ReadArray(x, v);
00082             for(j=0; j<v.size(); j++){
00083                 utype k = 0;
00084                 for(k=0; k<num_query; k++){
00085                     if(v[j]<vec_score[j][k])
00086                         break;
00087                 }
00088                 fprintf(stderr, "%s\t%.3e\t%d\n", vecstrDataset[j].c_str(), v[j], k);
00089             }
00090         }
00091         fprintf(stderr, "Done!\n");
00092         return 0;
00093     }
00094 
00095     if(sArgs.convert_aracne_flag==1){
00096         int lineLen = 1024;
00097         char *acBuffer = (char*)malloc(lineLen);
00098         FILE *infile;
00099         if((infile=fopen(sArgs.aracne_file_arg, "r"))==NULL){
00100             fprintf(stderr, "Error no such file %s\n", sArgs.aracne_file_arg);
00101             return 1;
00102         }
00103         while(fgets(acBuffer, lineLen, infile)!=NULL){
00104             while(strlen(acBuffer)==lineLen-1){
00105                 int len = strlen(acBuffer);
00106                 fseek(infile, -len, SEEK_CUR);
00107                 lineLen+=1024;
00108                 acBuffer = (char*)realloc(acBuffer, lineLen);
00109                 char *ret = fgets(acBuffer, lineLen, infile);
00110             }
00111         }
00112         fclose(infile);
00113         fprintf(stderr, "Max line length: %d\n", lineLen);
00114         free(acBuffer);
00115 
00116         acBuffer = (char*)malloc(lineLen);
00117         ifstream ia;
00118         ia.open(sArgs.aracne_file_arg);
00119         if(!ia.is_open()){
00120             fprintf(stderr, "Error opening file %s\n", sArgs.aracne_file_arg);
00121             return 1;
00122         }
00123         set<string> allGenes;
00124         size_t ci=0;
00125         while(!ia.eof()){
00126             ia.getline(acBuffer, lineLen-1);
00127             if(acBuffer[0]==0) break;
00128             acBuffer[lineLen-1] = 0;
00129             if(acBuffer[0]=='>') continue;
00130             vector<string> tok;
00131             CMeta::Tokenize(acBuffer, tok);
00132             allGenes.insert(tok[0]);
00133             size_t si;
00134             for(si=1; si<tok.size(); si+=2){
00135                 allGenes.insert(tok[si]);
00136             }
00137             if(ci%100==0){
00138                 fprintf(stderr, "Gene Number %d\n", ci);
00139             }
00140             ci++;
00141         }
00142         ia.close();
00143 
00144         vector<string> vecGenes;
00145         copy(allGenes.begin(), allGenes.end(), back_inserter(vecGenes));
00146         map<string,size_t> mapiGenes;
00147         for(i=0; i<vecGenes.size(); i++)
00148             mapiGenes[vecGenes[i]] = i;
00149 
00150         CDat Dat;
00151         Dat.Open(vecGenes);
00152         ci = 0;
00153         ia.open(sArgs.aracne_file_arg);
00154         while(!ia.eof()){
00155             ia.getline(acBuffer, lineLen-1);
00156             if(acBuffer[0]==0) break;
00157             acBuffer[lineLen-1] = 0;
00158             if(acBuffer[0]=='>') continue;
00159             vector<string> tok;
00160             CMeta::Tokenize(acBuffer, tok);
00161             size_t tG = mapiGenes[tok[0]];
00162             size_t si;
00163             for(si=1; si<tok.size(); si+=2){
00164                 size_t oG = mapiGenes[tok[si]];
00165                 float fG = atof(tok[si+1].c_str());
00166                 Dat.Set(tG, oG, fG);
00167             }
00168             if(ci%100==0){
00169                 fprintf(stderr, "Gene Number %d\n", ci);
00170             }
00171             ci++;
00172         }
00173         ia.close();
00174         free(acBuffer);
00175         Dat.Save(sArgs.output_dab_file_arg);
00176         return 0;
00177     }
00178 
00179     if(sArgs.weight_flag==1){
00180         vector<string> vecstrDataset;
00181         if(!CSeekTools::ReadListOneColumn(sArgs.dweight_map_arg, vecstrDataset))
00182             return false;
00183 
00184         vector<vector<float> > vec_score;
00185         vector<vector<float> > orig_score;
00186         utype i, j;
00187         int num_query = sArgs.dweight_num_arg; //random query
00188         orig_score.resize(num_query);
00189 
00190         vec_score.resize(vecstrDataset.size());
00191         for(j=0; j<vec_score.size(); j++)
00192             vec_score[j].resize(num_query);
00193 
00194         char x[256];
00195         for(i=0; i<num_query; i++){ //i is query id
00196             vector<float> v;
00197             sprintf(x, "%s/%d.dweight", sArgs.dweight_dir_arg, i);
00198             CSeekTools::ReadArray(x, v);
00199             orig_score[i] = v;
00200             for(j=0; j<vec_score.size(); j++) //j is dataset id
00201                 vec_score[j][i] = v[j];
00202         }
00203 
00204         vector<float> score_cutoff;
00205         score_cutoff.resize(vec_score.size());
00206         for(j=0; j<vec_score.size(); j++){
00207             sort(vec_score[j].begin(), vec_score[j].end());
00208             score_cutoff[j] = vec_score[j][899];
00209             //fprintf(stderr, "Dataset %d: %.4e\n", j, vec_score[j][899]);
00210         }
00211 
00212         sprintf(x, "/tmp/dataset.cutoff");
00213         CSeekTools::WriteArray(x, score_cutoff);
00214 
00215         vector<int> numGoodDataset;
00216         CSeekTools::InitVector(numGoodDataset, num_query, (int) 0);
00217         for(i=0; i<num_query; i++) //i is query id
00218             for(j=0; j<vecstrDataset.size(); j++)
00219                 if(orig_score[i][j]>vec_score[j][899])
00220                     numGoodDataset[i]++;
00221 
00222         sort(numGoodDataset.begin(), numGoodDataset.end());
00223         fprintf(stderr, "10 percentile %d\n", numGoodDataset[99]);
00224         fprintf(stderr, "90 percentile %d\n", numGoodDataset[899]);
00225 
00226         int test_num_query = sArgs.dweight_test_num_arg;
00227         for(i=0; i<test_num_query; i++){ //i is query id
00228             vector<float> v;
00229             sprintf(x, "%s/%d.dweight", sArgs.dweight_test_dir_arg, i);
00230             CSeekTools::ReadArray(x, v);
00231             int numGood = 0;
00232             for(j=0; j<v.size(); j++)
00233                 if(v[j]>vec_score[j][899])
00234                     numGood++;
00235             fprintf(stderr, "Query %d: ", i);
00236             if(numGood > numGoodDataset[899])
00237                 fprintf(stderr, "Unique (upper)\n");
00238             else if(numGood < numGoodDataset[99])
00239                 fprintf(stderr, "Unique (lower)\n");
00240             else
00241                 fprintf(stderr, "Not unique\n");
00242         }
00243         fprintf(stderr, "Done!\n");
00244         return 0;
00245     }
00246 
00247 
00248     if(sArgs.convert_dab_flag==1){
00249         string dab_file_name = sArgs.dab_file_arg;
00250         if(dab_file_name.find(".2.dab")!=string::npos){
00251             CSeekIntIntMap d1(vecstrGenes.size());
00252             CSparseFlatMatrix<float> sm (0);
00253             CSeekWriter::ReadSeekSparseMatrix<unsigned short>(sArgs.dab_file_arg,
00254                 sm, d1, 1000, 0.99, vecstrGenes);   
00255             utype ii = 0;
00256             const vector<utype> &allRGenes = d1.GetAllReverse();
00257             FILE *pFile;
00258             pFile = fopen(sArgs.output_matrix_arg, "w");
00259             fprintf(pFile, "Genes\t");
00260             for(i=0; i<d1.GetNumSet(); i++){    
00261                 string g = vecstrGenes[allRGenes[i]];
00262                 fprintf(pFile, "%s", g.c_str());
00263                 if(i<d1.GetNumSet()-1)
00264                     fprintf(pFile, "\t");
00265             }
00266             fprintf(pFile, "\n");
00267         
00268             unsigned int j, t;  
00269             for(i=0; i<d1.GetNumSet(); i++){    
00270                 string g = vecstrGenes[allRGenes[i]];
00271                 fprintf(pFile, "%s\t", g.c_str());
00272                 vector<float> vf;
00273                 vf.resize(vecstrGenes.size());
00274                 for(j=0; j<vecstrGenes.size(); j++)
00275                     vf[j] = 0;
00276                 utype ii = allRGenes[i];
00277                 vector<CPair<float> >::iterator row_it;
00278                 float rv;
00279                 for(row_it = sm.RowBegin(ii); row_it!=sm.RowEnd(ii); row_it++){
00280                     j = (size_t) row_it->i;
00281                     rv = row_it->v;
00282                     vf[j] = rv;
00283                 }
00284             
00285                 for(j=0; j<d1.GetNumSet(); j++){
00286                     t = allRGenes[j];
00287                     fprintf(pFile, "%.3e", vf[t]);
00288                     if(j<d1.GetNumSet()-1)
00289                         fprintf(pFile, "\t");
00290                 }
00291                 fprintf(pFile, "\n");
00292             }
00293             fclose(pFile);
00294             return 0;
00295         }
00296 
00297         CDataPair Dat;
00298         fprintf(stderr, "Opening file...\n");
00299         if(!Dat.Open(sArgs.dab_file_arg, false, false, 2, false, false)){
00300             cerr << "error opening file" << endl;
00301             return 1;
00302         }
00303 
00304         vector<unsigned int> veciGenes;
00305         veciGenes.resize(vecstrGenes.size());
00306         for(i=0; i<vecstrGenes.size(); i++)
00307             veciGenes[i] = (unsigned int) Dat.GetGeneIndex(vecstrGenes[i]);
00308 
00309         unsigned int s, t, j, ss, tt;
00310         float d;
00311         CSeekIntIntMap m(vecstrGenes.size());
00312         for(i=0; i<vecstrGenes.size(); i++){
00313             if((s=veciGenes[i])==(unsigned int)-1) continue;
00314             m.Add(i);
00315         }
00316 
00317         const vector<utype> &allRGenes = m.GetAllReverse();
00318         FILE *pFile;
00319         pFile = fopen(sArgs.output_matrix_arg, "w");
00320         fprintf(pFile, "Genes\t");
00321         for(i=0; i<m.GetNumSet(); i++){ 
00322             string g = vecstrGenes[allRGenes[i]];
00323             fprintf(pFile, "%s", g.c_str());
00324             if(i<m.GetNumSet()-1){
00325                 fprintf(pFile, "\t");
00326             }
00327         }
00328         fprintf(pFile, "\n");
00329             
00330         for(i=0; i<m.GetNumSet(); i++){ 
00331             s = veciGenes[allRGenes[i]];
00332             string g = vecstrGenes[allRGenes[i]];
00333             fprintf(pFile, "%s\t", g.c_str());
00334             for(j=0; j<m.GetNumSet(); j++){
00335                 t = veciGenes[allRGenes[j]];
00336                 if(CMeta::IsNaN(d = Dat.Get(s,t)))
00337                     d = 0;
00338                 fprintf(pFile, "%.3e", d);
00339                 if(j<m.GetNumSet()-1){
00340                     fprintf(pFile, "\t");
00341                 }
00342             }
00343             fprintf(pFile, "\n");
00344         }
00345         fclose(pFile);
00346         return 0;
00347     }
00348 
00349 
00350     if(sArgs.limit_hub_flag==1){
00351         float per = 0.30;
00352         CDataPair Dat;
00353         char outFile[1024];
00354         fprintf(stderr, "Opening file...\n");
00355         if(!Dat.Open(sArgs.dabinput_arg, false, false, 2, false, false)){
00356             cerr << "error opening file" << endl;
00357             return 1;
00358         }
00359         vector<unsigned int> veciGenes;
00360         veciGenes.resize(vecstrGenes.size());
00361         for(i=0; i<vecstrGenes.size(); i++)
00362             veciGenes[i] = (unsigned int) Dat.GetGeneIndex(vecstrGenes[i]);
00363 
00364         unsigned int s, t, j, ss, tt;
00365         float d;
00366         CSeekIntIntMap m(vecstrGenes.size());
00367         for(i=0; i<vecstrGenes.size(); i++){
00368             if((s=veciGenes[i])==(unsigned int)-1) continue;
00369             m.Add(i);
00370         }
00371         vector<float> vecSum;
00372         CSeekTools::InitVector(vecSum, vecstrGenes.size(),(float) 0);
00373         const vector<utype> &allRGenes = m.GetAllReverse();
00374         for(i=0; i<m.GetNumSet(); i++){ 
00375             s = veciGenes[allRGenes[i]];
00376             for(j=i+1; j<m.GetNumSet(); j++){
00377                 t = veciGenes[allRGenes[j]];
00378                 if(CMeta::IsNaN(d = Dat.Get(s,t))) continue;
00379                 vecSum[allRGenes[i]]+=pow(abs(d), 9);
00380                 vecSum[allRGenes[j]]+=pow(abs(d), 9);
00381             }
00382         }
00383         vector<float> backupSum;
00384         for(i=0; i<vecSum.size(); i++)
00385             backupSum.push_back(vecSum[i]);
00386         sort(vecSum.begin(), vecSum.end(), greater<float>());
00387         int index = (int) (per * (float) m.GetNumSet());
00388         float percentile = vecSum[index];
00389         vector<string> limitedGenes;
00390         for(i=0; i<backupSum.size(); i++){
00391             if(backupSum[i] > percentile){
00392                 limitedGenes.push_back(vecstrGenes[i]);
00393             }
00394         }
00395         fprintf(stderr, "%d / %d genes to be written!\n", limitedGenes.size(), m.GetNumSet());
00396         CDat NewDat;
00397         NewDat.Open(limitedGenes);
00398         for(i=0; i<limitedGenes.size(); i++){   
00399             s = (unsigned int) Dat.GetGeneIndex(limitedGenes[i]);
00400             ss = NewDat.GetGeneIndex(limitedGenes[i]);
00401             for(j=i+1; j<limitedGenes.size(); j++){
00402                 t = (unsigned int) Dat.GetGeneIndex(limitedGenes[j]);
00403                 tt = NewDat.GetGeneIndex(limitedGenes[j]);
00404                 d = Dat.Get(s, t);
00405                 NewDat.Set(ss, tt, d);
00406             }
00407         }
00408         NewDat.Save(sArgs.hub_dab_output_arg);
00409         return 0;
00410     }
00411 
00412     if(sArgs.comp_ranking_flag==1){
00413         utype i, j;
00414         int num_query = sArgs.gscore_num1_arg; //random query
00415         char x[256];
00416         for(i=0; i<num_query; i++){ //i is query id
00417             vector<float> v1, v2;
00418             sprintf(x, "%s/%d.gscore", sArgs.gscore_dir1_arg, i);
00419             CSeekTools::ReadArray(x, v1);
00420             sprintf(x, "%s/%d.gscore", sArgs.gscore_dir2_arg, i);
00421             CSeekTools::ReadArray(x, v2);
00422             vector<CPair<float> > cp1, cp2;
00423             cp1.resize(v1.size());
00424             cp2.resize(v2.size());
00425             for(j=0; j<v1.size(); j++){
00426                 cp1[j].i = (utype) j;
00427                 cp1[j].v = v1[j];
00428                 cp2[j].i = (utype) j;
00429                 cp2[j].v = v2[j];
00430             }
00431             sort(cp1.begin(), cp1.end(), CDescendingValue<float>());
00432             sort(cp2.begin(), cp2.end(), CDescendingValue<float>());
00433             vector<char> presence;
00434             CSeekTools::InitVector(presence, v1.size(), (char) 0);
00435             for(j=0; j<500; j++){
00436                 presence[cp1[j].i]++;
00437                 presence[cp2[j].i]++;
00438             }
00439             int count = 0;
00440             for(j=0; j<v1.size(); j++){
00441                 if(presence[j]==2){
00442                     count++;
00443                 }
00444             }
00445             fprintf(stderr, "Query %d %d\n", i, count);
00446         }
00447         return 0;
00448     }
00449 
00450     if(sArgs.dataset_flag==1){
00451         string db = sArgs.db_arg;
00452         string dset_list = sArgs.dset_list_arg;
00453         string dir_in = sArgs.dir_in_arg;
00454         string dir_prep = sArgs.dir_prep_in_arg;
00455         if(db=="NA" || dset_list=="NA" || dir_in=="NA" ||
00456         dir_prep=="NA"){
00457             fprintf(stderr, "Requires: -x, -X, -d -p\n");
00458             return false;
00459         }
00460 
00461         vector<string> vecstrDP, vecstrUserDP;
00462         //dataset-platform mapping (required)
00463         if(!CSeekTools::ReadListTwoColumns(sArgs.db_arg, vecstrDatasets, vecstrDP))
00464             return false;
00465 
00466         //dataset filter
00467         if(!CSeekTools::ReadListTwoColumns(sArgs.dset_list_arg, vecstrUserDatasets, vecstrUserDP))
00468             return false;
00469 
00470         map<string, utype> mapstrintDataset;
00471         map<string, string> mapstrstrDatasetPlatform;
00472         utype i;
00473         for(i=0; i<vecstrDatasets.size(); i++){
00474             mapstrstrDatasetPlatform[vecstrDatasets[i]] = vecstrDP[i];
00475             mapstrintDataset[vecstrDatasets[i]] = i;
00476         }
00477 
00478         CSeekIntIntMap *mapUserDatasets = new CSeekIntIntMap(vecstrDatasets.size());
00479         for(i=0; i<vecstrUserDatasets.size(); i++){
00480             mapUserDatasets->Add(mapstrintDataset[vecstrUserDatasets[i]]);
00481         }
00482         
00483         //fprintf(stderr, "Finished reading dataset\n");
00484 
00485         size_t iDatasets = vecstrDatasets.size();
00486         vector<string> vecstrPlatforms;
00487         vector<CSeekPlatform> vp;
00488         map<string, utype> mapstriPlatform;
00489         CSeekTools::ReadPlatforms(sArgs.platform_dir_arg, vp, vecstrPlatforms,
00490             mapstriPlatform);
00491         //fprintf(stderr, "Finished reading platform\n");
00492 
00493         vector<CSeekDataset*> vc;
00494         vc.resize(iDatasets);
00495         string strPrepInputDirectory = sArgs.dir_prep_in_arg;
00496         string strGvarInputDirectory = sArgs.dir_gvar_in_arg;
00497         string strSinfoInputDirectory = sArgs.dir_sinfo_in_arg;
00498 
00499         for(i=0; i<iDatasets; i++){
00500             vc[i] = new CSeekDataset();
00501             string strFileStem = vecstrDatasets[i];
00502             string strAvgPath = strPrepInputDirectory + "/" +
00503                 strFileStem + ".gavg";
00504             string strPresencePath = strPrepInputDirectory + "/" +
00505                 strFileStem + ".gpres";
00506 
00507             if(strGvarInputDirectory!="NA"){
00508                 string strGvarPath = strGvarInputDirectory + "/" + 
00509                     strFileStem + ".gexpvar";
00510                 vc[i]->ReadGeneVariance(strGvarPath);
00511             }
00512             if(strSinfoInputDirectory!="NA"){
00513                 string strSinfoPath = strSinfoInputDirectory + "/" + 
00514                     strFileStem + ".sinfo";
00515                 vc[i]->ReadDatasetAverageStdev(strSinfoPath);
00516             }
00517 
00518             vc[i]->ReadGeneAverage(strAvgPath);
00519             vc[i]->ReadGenePresence(strPresencePath);
00520             string strPlatform =
00521                 mapstrstrDatasetPlatform.find(strFileStem)->second;
00522             utype platform_id = mapstriPlatform.find(strPlatform)->second;
00523             vc[i]->SetPlatform(vp[platform_id]);
00524         }
00525 
00526         //fprintf(stderr, "Finished reading prep\n");
00527 
00528         for(i=0; i<iDatasets; i++) vc[i]->InitializeGeneMap();
00529 
00530         for(i=0; i<vecstrGenes.size(); i++){
00531             utype ii = mapstrintGene[vecstrGenes[i]];
00532             fprintf(stderr, "Gene %s ", vecstrGenes[i].c_str());
00533             utype j = 0;
00534             vector<float> va;
00535             for(j=0; j<iDatasets; j++){
00536                 CSeekIntIntMap *gm = vc[j]->GetGeneMap();
00537                 utype ij = gm->GetForward(ii);
00538                 if(CSeekTools::IsNaN(ij)){
00539                     continue;
00540                 }
00541                 float a = vc[j]->GetGeneAverage(ii);
00542                 va.push_back(a);
00543                 //fprintf(stderr, "%.2f ", a);
00544             }
00545             sort(va.begin(), va.end());
00546             utype g = 0;
00547             for(g=0; g<20; g++){
00548                 utype ik = (utype) ((float)0.05*(float)(g+1)*(float)va.size() - 1.0);
00549                 fprintf(stderr, "%.2f ", va[ik]);
00550             }
00551             fprintf(stderr, "\n");
00552         }
00553 
00554         fprintf(stderr, "Done\n");
00555         return false;
00556 
00557         vector<vector<string> > vecstrQueries;
00558         string multiQuery = sArgs.multi_query_arg;
00559 
00560         if(multiQuery=="NA"){
00561             vecstrQueries.resize(1);
00562             if(!CSeekTools::ReadMultiGeneOneLine(sArgs.single_query_arg, vecstrQueries[0]))
00563                 return false;
00564         }else{  
00565             if(!CSeekTools::ReadMultipleQueries(multiQuery, vecstrQueries))
00566                 return false;
00567         }
00568 
00569         bool toOutput = false;
00570         string output_file = sArgs.output_file_arg;
00571         FILE *out = NULL;
00572 
00573         if(output_file=="NA"){
00574             toOutput = false;
00575         }else{
00576             toOutput = true;
00577             out = fopen(output_file.c_str(), "w");
00578         }
00579 
00580         utype ii=0;
00581         float cutoff = sArgs.gvar_cutoff_arg;
00582         bool toCutoff = false;
00583         if(cutoff<0){
00584             toCutoff = false;
00585         }else{
00586             toCutoff = true;
00587         }
00588 
00589         if(sArgs.order_stat_single_gene_query_flag==1){
00590             if(strGvarInputDirectory=="NA"){
00591                 fprintf(stderr, "Order statistics mode, but need to provide gvar!\n");
00592                 return false;
00593             }
00594             if(cutoff<0){
00595                 fprintf(stderr, "Need to provide positive Gvar cutoff\n");
00596                 return false;
00597             }
00598         }
00599 
00600         for(ii=0; ii<vecstrQueries.size(); ii++){
00601 
00602         vector<string> &vecstrQuery = vecstrQueries[ii];
00603 
00604         //fprintf(stderr, "Finished reading query\n");
00605         bool isFirst = true;
00606         vector<int> count;
00607         utype j;
00608         CSeekTools::InitVector(count, vecstrQuery.size(), (int) 0);
00609 
00610         //only analyze and report on user datasets
00611         const vector<utype> &allRDatasets = mapUserDatasets->GetAllReverse();
00612         utype iUserDatasets = mapUserDatasets->GetNumSet();
00613         utype dd;
00614         for(dd=0; dd<iUserDatasets; dd++){
00615             i = allRDatasets[dd];
00616             //fprintf(stdout, "%d %d %s\n", i, dd, vecstrDatasets[i].c_str());
00617             CSeekIntIntMap *si = vc[i]->GetGeneMap();
00618 
00619             if(toCutoff && sArgs.order_stat_single_gene_query_flag==1){
00620                 if(mapstrintGene.find(vecstrQuery[0])==mapstrintGene.end())
00621                     continue;   
00622                 if(CSeekTools::IsNaN(si->GetForward(
00623                     mapstrintGene[vecstrQuery[0]]))) continue;
00624                 float gene_var = vc[i]->GetGeneVariance(mapstrintGene[vecstrQuery[0]]);
00625                 if(gene_var < cutoff) continue;
00626                 if(isFirst){
00627                     isFirst = false;
00628                     if(toOutput){
00629                         fprintf(out, "%s", vecstrDatasets[i].c_str());
00630                     }else{
00631                         fprintf(stdout, "%s", vecstrDatasets[i].c_str());
00632                     }
00633                 }else{
00634                     if(toOutput){
00635                         fprintf(out, " %s", vecstrDatasets[i].c_str());
00636                     }else{
00637                         fprintf(stdout, " %s", vecstrDatasets[i].c_str());
00638                     }
00639                 }
00640 
00641             }else{
00642                 utype present = 0;
00643                 for(j=0, present=0; j<vecstrQuery.size(); j++){
00644                     if(mapstrintGene.find(vecstrQuery[j])==mapstrintGene.end())
00645                         continue;
00646                     if(CSeekTools::IsNaN(si->GetForward(
00647                         mapstrintGene[vecstrQuery[j]]))) continue;
00648                     count[j]++;
00649                     present++;
00650                 }
00651                 if(present==vecstrQuery.size()){
00652                     if(isFirst){
00653                         isFirst = false;
00654                         if(toOutput){
00655                             fprintf(out, "%s", vecstrDatasets[i].c_str());
00656                         }else{
00657                             fprintf(stdout, "%s", vecstrDatasets[i].c_str());
00658                         }
00659                     }else{
00660                         if(toOutput){
00661                             fprintf(out, " %s", vecstrDatasets[i].c_str());
00662                         }else{
00663                             fprintf(stdout, " %s", vecstrDatasets[i].c_str());
00664                         }
00665                     }
00666                     //fprintf(stderr, "%s\t%s\t%d\t%d\n", 
00667                     //  vecstrDatasets[i].c_str(), vecstrDP[i].c_str(), 
00668                     //  present, si->GetNumSet());
00669                 }
00670             }
00671         }
00672 
00673         for(j=0; j<vecstrQuery.size(); j++)
00674             fprintf(stderr, "Gene %s: %d\n", vecstrQuery[j].c_str(), count[j]);
00675 
00676         if(toOutput){
00677             fprintf(out, "\n");
00678         }else{
00679             fprintf(stdout, "\n");
00680         }
00681 
00682         }
00683 
00684         if(toOutput){
00685             fclose(out);
00686         }       
00687         return 0;
00688     }
00689 
00690     if(sArgs.databaselet_flag==1){
00691         string db = sArgs.db_arg;
00692         string dset_list = sArgs.dset_list_arg;
00693         string dir_in = sArgs.dir_in_arg;
00694         string dir_prep = sArgs.dir_prep_in_arg;
00695         string single_query = sArgs.single_query_arg;
00696 
00697         if(db=="NA" || dset_list=="NA" || dir_in=="NA" ||
00698         dir_prep=="NA" || single_query=="NA"){
00699             fprintf(stderr, "Requires: -x, -X, -d -p -q\n");
00700             return false;
00701         }
00702 
00703         bool useNibble = false;
00704         if(sArgs.is_nibble_flag==1) useNibble = true;
00705         CDatabase DB(useNibble);
00706         vector<string> vecstrDP;
00707         vector<string> vecstrQuery;
00708         if(!CSeekTools::ReadListTwoColumns(sArgs.db_arg, vecstrDatasets, vecstrDP))
00709             return false;
00710         if(!CSeekTools::ReadMultiGeneOneLine(sArgs.single_query_arg, vecstrQuery))
00711             return false;
00712 
00713         string strInputDirectory = sArgs.dir_in_arg;
00714         DB.Open(strInputDirectory);
00715 
00716         size_t iDatasets = DB.GetDatasets();
00717         size_t iGenes = DB.GetGenes();
00718 
00719         size_t j,k;
00720         vector<CSeekDataset*> vc;
00721         vc.clear();
00722         vc.resize(iDatasets);
00723         for(i=0; i<iDatasets; i++){
00724             vc[i] = new CSeekDataset();
00725             string strPrepInputDirectory = sArgs.dir_prep_in_arg;
00726             string strFileStem = vecstrDatasets[i];
00727             string strAvgPath = strPrepInputDirectory + "/" +
00728                 strFileStem + ".gavg";
00729             string strPresencePath = strPrepInputDirectory + "/" +
00730                 strFileStem + ".gpres";
00731             vc[i]->ReadGeneAverage(strAvgPath);
00732             vc[i]->ReadGenePresence(strPresencePath);
00733         }
00734 
00735         vector<char> cQuery;
00736         vector<utype> allQ;
00737         CSeekTools::InitVector(cQuery, iGenes, (char) 0);
00738 
00739         for(i=0; i<vecstrQuery.size(); i++){
00740             k = DB.GetGene(vecstrQuery[i]);
00741             if(k==-1) continue;
00742             cQuery[k] = 1;
00743         }
00744 
00745         for(k=0; k<iGenes; k++)
00746             if(cQuery[k]==1)
00747                 allQ.push_back(k);
00748 
00749         for(i=0; i<iDatasets; i++){
00750             vc[i]->InitializeGeneMap();
00751             vc[i]->InitializeQueryBlock(allQ);
00752         }
00753 
00754         vector<unsigned char> *Q =
00755             new vector<unsigned char>[vecstrQuery.size()];
00756 
00757         for(i=0; i<vecstrQuery.size(); i++)
00758             if(!DB.GetGene(vecstrQuery[i], Q[i]))
00759                 cerr << "Gene does not exist" << endl;
00760 
00761         //printf("Before"); getchar();
00762         for(i=0; i<vecstrQuery.size(); i++){
00763             if(DB.GetGene(vecstrQuery[i])==-1) continue;
00764             size_t m = DB.GetGene(vecstrQuery[i]);
00765             size_t l = 0;
00766             for(j=0; j<iDatasets; j++){
00767                 CSeekIntIntMap *qu = vc[j]->GetDBMap();
00768                 if(qu==NULL) continue;
00769                 unsigned char **r = vc[j]->GetMatrix();
00770                 utype query = qu->GetForward(m);
00771                 if(CSeekTools::IsNaN(query)) continue;
00772                 for(k=0; k<iGenes; k++){
00773                     unsigned char c = Q[i][k*iDatasets + j];
00774                     r[query][k] = c;
00775                 }
00776             }
00777         }
00778 
00779         for(i=0; i<iDatasets; i++){
00780             printf("Dataset %ld\n", i);
00781             unsigned char **r = vc[i]->GetMatrix();
00782             CSeekIntIntMap *mapG = vc[i]->GetGeneMap();
00783             CSeekIntIntMap *mapDB = vc[i]->GetDBMap();
00784             if(mapDB==NULL) continue;
00785             for(j=0; j<mapDB->GetNumSet(); j++){
00786                 if(vecstrDatasets[i]=="GSE19470.GPL5175.pcl"){
00787                 printf("Row %ld\n", j);
00788                 for(k=0; k<mapG->GetNumSet(); k++){
00789                     printf("%d ", r[j][mapG->GetReverse(k)]);
00790                 }
00791                 printf("\n");
00792                 }
00793             }
00794         }
00795 
00796     }
00797 
00798 #ifdef WIN32
00799     pthread_win32_process_detach_np( );
00800 #endif // WIN32
00801     return 0; }