Sleipnir
tools/SeekIterative/SeekIterative.cpp
00001 #include "stdafx.h"
00002 #include "cmdline.h"
00003 
00004 bool transfer(CDat &Dat, vector<vector<float> > &mat, CSeekIntIntMap *geneMap,
00005     vector<string> &vecstrGenes){
00006 
00007     int i, j;
00008     vector<utype> veciGenes;
00009     veciGenes.clear();
00010     veciGenes.resize(vecstrGenes.size());
00011     for( i = 0; i < vecstrGenes.size( ); ++i )
00012         veciGenes[ i ] = Dat.GetGene( vecstrGenes[i] );
00013     
00014     mat.resize(vecstrGenes.size());
00015     for(i=0; i<vecstrGenes.size(); i++)
00016         CSeekTools::InitVector(mat[i], vecstrGenes.size(), CMeta::GetNaN());
00017 
00018     for(i=0; i<vecstrGenes.size(); i++){
00019         utype s = veciGenes[i];
00020         if(CSeekTools::IsNaN(s)) continue;
00021         geneMap->Add(i);
00022         float *v = Dat.GetFullRow(s);
00023         for(j=0; j<vecstrGenes.size(); j++){
00024             utype t = veciGenes[j];
00025             if(CSeekTools::IsNaN(t)) continue;
00026             if(CMeta::IsNaN(v[t])) continue;
00027             mat[i][j] = Dat.Get(s, t);
00028         }
00029         free(v);
00030     }
00031     return true;
00032 }
00033 
00034 //Add scores from two vectors and integrate them using alpha
00035 bool integrate(vector<float> &d1, vector<float> &d2, 
00036     vector<float> &dest, CSeekIntIntMap *geneMap, int k, float alpha){
00037     utype i;
00038     CSeekTools::InitVector(dest, geneMap->GetSize(), (float) CMeta::GetNaN());
00039     const vector<utype> &allGenes = geneMap->GetAllReverse();
00040     for(i=0; i<geneMap->GetNumSet(); i++){
00041         utype gi = allGenes[i];
00042         dest[gi] = (1.0 - alpha) * d1[gi] + alpha / k * d2[gi];
00043     }
00044     return true;
00045 }
00046 
00047 //initialize score vector
00048 bool init_score(vector<float> &dest, CSeekIntIntMap *geneMap){
00049     CSeekTools::InitVector(dest, geneMap->GetSize(), (float) CMeta::GetNaN());
00050     utype i;    
00051     const vector<utype> &allGenes = geneMap->GetAllReverse();
00052     for(i=0; i<geneMap->GetNumSet(); i++)
00053         dest[allGenes[i]] = 0;
00054     return true;
00055 }
00056 
00057 bool add_score(vector<float> &src, vector<float> &dest, CSeekIntIntMap *geneMap){
00058     const vector<utype> &allGenes = geneMap->GetAllReverse();
00059     utype i, j;
00060     for(i=0; i<geneMap->GetNumSet(); i++){
00061         utype gi = allGenes[i];
00062         dest[gi] += src[gi];
00063     }
00064     return true;
00065 }
00066 
00067 bool search_one_dab(vector<float> &gene_score, 
00068     CDat &mat, const size_t numGenes,
00069     CSeekIntIntMap &d1,
00070     vector<utype> &indexConvReverse,
00071     vector<float> &q_weight, 
00072     vector<vector<float> > &nq_weight
00073 ){ 
00074     //q_weight is query presence - 1.0 if gene at the index i is a query gene
00075 
00076     vector<float> gene_count;
00077     CSeekTools::InitVector(gene_score, numGenes, (float)CMeta::GetNaN());
00078     CSeekTools::InitVector(gene_count, numGenes, (float)0);
00079     utype qqi;
00080     utype kk, k;
00081 
00082     const vector<utype> &allGenes = d1.GetAllReverse(); 
00083     for(kk=0; kk<d1.GetNumSet(); kk++){
00084         utype k = allGenes[kk];
00085         utype gi = indexConvReverse[k];
00086         gene_score[gi] = 0;
00087     }
00088 
00089     for(qqi=0; qqi<d1.GetNumSet(); qqi++){
00090         utype qi = allGenes[qqi];
00091         utype qq = indexConvReverse[qi];
00092         if(q_weight[qq]==0) //not a query gene
00093             continue;
00094         //now a query gene
00095         float *vc = mat.GetFullRow(qi);
00096         for(kk=0; kk<d1.GetNumSet(); kk++){
00097             utype k = allGenes[kk];
00098             utype gi = indexConvReverse[k];
00099             float fl = vc[k];
00100             gene_score[gi] += fl;
00101             gene_count[gi] += 1.0;
00102         }
00103         delete[] vc;
00104     }
00105 
00106     for(kk=0; kk<d1.GetNumSet(); kk++){
00107         utype k = allGenes[kk];
00108         utype gi = indexConvReverse[k];
00109         gene_score[gi] /= gene_count[gi];
00110     }
00111 
00112     utype ind;
00113     for(ind=0; ind<nq_weight.size(); ind++){
00114         vector<float> ngene_count, ngene_score;
00115         CSeekTools::InitVector(ngene_score, numGenes, (float)CMeta::GetNaN());
00116         CSeekTools::InitVector(ngene_count, numGenes, (float)0);
00117 
00118         for(kk=0; kk<d1.GetNumSet(); kk++)
00119             ngene_score[(utype)indexConvReverse[(utype)allGenes[kk]]] = 0;
00120 
00121         for(qqi=0; qqi<d1.GetNumSet(); qqi++){
00122             utype qi = allGenes[qqi];
00123             utype qq = indexConvReverse[qi];
00124             if(nq_weight[ind][qq]==0) //not a NOT query gene
00125                 continue;
00126             //now a NOT query gene
00127             float *vc = mat.GetFullRow(qi);
00128             for(kk=0; kk<d1.GetNumSet(); kk++){
00129                 utype k = allGenes[kk];
00130                 utype gi = indexConvReverse[k];
00131                 float fl = vc[k];
00132                 ngene_score[gi] += fl;
00133                 ngene_count[gi] += 1.0;
00134             }
00135             delete[] vc;
00136         }
00137         for(kk=0; kk<d1.GetNumSet(); kk++){
00138             utype gi = indexConvReverse[(utype)allGenes[kk]];
00139             ngene_score[gi] /= ngene_count[gi];
00140             gene_score[gi] -= ngene_score[gi]; //subtract the correlation to NOT genes
00141         }
00142     }
00143 
00144     return true;
00145 }
00146 
00147 bool get_score(vector<float> &gene_score, 
00148     CSparseFlatMatrix<float> &mat,
00149     CSeekIntIntMap *geneMap, vector<float> &q_weight){
00150     vector<float> gene_count;
00151     int numGenes = geneMap->GetSize();
00152     CSeekTools::InitVector(gene_score, numGenes, (float)CMeta::GetNaN());
00153     CSeekTools::InitVector(gene_count, numGenes, (float)0);
00154     int qi=0;
00155     const vector<utype> &allGenes = geneMap->GetAllReverse();
00156     utype kk, k;
00157     
00158     for(kk=0; kk<geneMap->GetNumSet(); kk++){
00159         utype gi = allGenes[kk];
00160         gene_score[gi] = 0;
00161     }
00162     for(qi=0; qi<geneMap->GetNumSet(); qi++){
00163         utype qq = allGenes[qi];
00164         if(q_weight[qq]==0) 
00165             continue;
00166         const vector<CPair<float> > &vc = mat.GetRow(qq);
00167         for(kk=0; kk<vc.size(); kk++){
00168             float fl = vc[kk].v;
00169             utype gi = vc[kk].i;
00170             gene_score[gi] += fl * q_weight[qq];
00171         }
00172         for(kk=0; kk<geneMap->GetNumSet(); kk++){
00173             utype gi = allGenes[kk];
00174             gene_count[gi] += q_weight[qq];
00175         }
00176         /*for(kk=0; kk<geneMap->GetNumSet(); kk++){
00177             utype gi = allGenes[kk];
00178             map<utype,float>::iterator it;
00179             float fl = 0;
00180             if((it=mat[qq].find(gi))==mat[qq].end()){
00181                 fl = 0;
00182             }else{
00183                 fl = it->second;
00184             }
00185             //float fl = mat[qq][gi];
00186             gene_score[gi] += fl * q_weight[qq];
00187             gene_count[gi] += q_weight[qq];
00188         }*/
00189     }
00190 
00191     for(k=0; k<geneMap->GetNumSet(); k++){
00192         utype gi = allGenes[k];
00193         gene_score[gi] /= gene_count[gi];
00194         if(gene_count[gi]==0){
00195             gene_score[gi] = 0;
00196         }
00197     }
00198     return true;
00199 }
00200 
00201 bool get_score(vector<float> &gene_score, CFullMatrix<float> &mat,
00202 CSeekIntIntMap *geneMap, vector<float> &q_weight){
00203     vector<float> gene_count;
00204     int numGenes = geneMap->GetSize();
00205     CSeekTools::InitVector(gene_score, numGenes, (float)CMeta::GetNaN());
00206     CSeekTools::InitVector(gene_count, numGenes, (float)0);
00207     int qi=0;
00208     const vector<utype> &allGenes = geneMap->GetAllReverse();
00209     utype kk, k;
00210     for(kk=0; kk<geneMap->GetNumSet(); kk++){
00211         utype gi = allGenes[kk];
00212         gene_score[gi] = 0;
00213     }
00214     for(qi=0; qi<geneMap->GetNumSet(); qi++){
00215         utype qq = allGenes[qi];
00216         if(q_weight[qq]==0) 
00217             continue;
00218         for(kk=0; kk<geneMap->GetNumSet(); kk++){
00219             utype gi = allGenes[kk];
00220             float fl = mat.Get(qq, gi);
00221             gene_score[gi] += fl * q_weight[qq];
00222         }
00223         for(kk=0; kk<geneMap->GetNumSet(); kk++){
00224             utype gi = allGenes[kk];
00225             gene_count[gi] += q_weight[qq];
00226         }
00227     }
00228     for(k=0; k<geneMap->GetNumSet(); k++){
00229         utype gi = allGenes[k];
00230         gene_score[gi] /= gene_count[gi];
00231         if(gene_count[gi] == 0)
00232             gene_score[gi] = 0;
00233     }
00234     return true;
00235 }
00236 
00237 /*bool iterate(vector<float> &src, vector<float> &dest, 
00238     vector<vector<float> > &tr, CSeekIntIntMap *geneMap,
00239     float alpha, int numIterations){
00240     
00241     const vector<utype> &allGenes = geneMap->GetAllReverse();
00242     dest.resize(src.size());
00243     vector<float> backup;
00244     CSeekTools::InitVector(backup, src.size(), CMeta::GetNaN());
00245     int i, j, k;
00246     for(i=0; i<dest.size(); i++)
00247         dest[i] = src[i];   
00248         
00249     for(i=0; i<numIterations; i++){
00250         for(j=0; j<geneMap->GetNumSet(); j++){
00251             utype gi = allGenes[j];
00252             backup[gi] = dest[gi];
00253         }
00254         get_score(dest, tr, geneMap, backup);
00255         for(j=0; j<geneMap->GetNumSet(); j++){
00256             utype gi = allGenes[j];
00257             dest[gi] = (1.0 - alpha) * src[j] + alpha * dest[gi];
00258         }
00259     }
00260     return true;
00261 }*/
00262 
00263 //is_query is like a gold-standard gene list
00264 bool weight_fast(vector<char> &is_query, vector<float> &d1,
00265 CSeekIntIntMap *geneMap, float &w){
00266     utype i;
00267     w = 0;
00268     const vector<utype> &allGenes = geneMap->GetAllReverse();
00269     for(i=0; i<geneMap->GetNumSet(); i++){
00270         utype gi = allGenes[i];
00271         if(is_query[gi]==1){
00272             //if(CMeta::IsNaN(d1[gi])) continue;
00273             w += d1[gi];
00274         }
00275     }
00276     return true;    
00277 }
00278 
00279 bool weight(vector<char> &is_query, vector<float> &d1,
00280 CSeekIntIntMap *geneMap, float rbp_p, float &w){
00281     vector<AResultFloat> ar;
00282     ar.resize(geneMap->GetNumSet());
00283     utype i;
00284     const vector<utype> &allGenes = geneMap->GetAllReverse();
00285     for(i=0; i<geneMap->GetNumSet(); i++){
00286         utype gi = allGenes[i];
00287         ar[i].i = gi;
00288         ar[i].f = d1[gi];
00289     }
00290     int MAX = 1000;
00291     nth_element(ar.begin(), ar.begin()+MAX, ar.end());
00292     sort(ar.begin(), ar.begin()+MAX);
00293     w = 0;
00294     for(i=0; i<MAX; i++)
00295         if(is_query[ar[i].i]==1)
00296             w += pow(rbp_p, i);
00297     w *= (1.0 - rbp_p); 
00298     return true;
00299 }
00300 
00301 bool cv_weight_LOO(vector<utype> &query, CSparseFlatMatrix<float> &mat,
00302     CSeekIntIntMap *geneMap, float rbp_p, float &tot_w){
00303     //leave one out cross-validation
00304     utype i, j;
00305     int numGenes = geneMap->GetSize();
00306     tot_w = 0;
00307     
00308     for(i=0; i<query.size(); i++){
00309         vector<char> is_query;
00310         vector<float> q_weight;
00311         float w = 0;
00312         CSeekTools::InitVector(is_query, numGenes, (char) 0);
00313         CSeekTools::InitVector(q_weight, numGenes, (float) 0);
00314         for(j=0; j<query.size(); j++){
00315             if(i==j) continue;
00316             q_weight[query[j]] = 1.0;
00317         }
00318         is_query[query[i]] = 1;
00319         vector<float> gene_score;
00320         get_score(gene_score, mat, geneMap, q_weight);
00321         for(j=0; j<query.size(); j++){
00322             if(i==j) continue;
00323             gene_score[query[j]] = 0;
00324         }
00325         //weight_fast(is_query, gene_score, geneMap, w);
00326         weight(is_query, gene_score, geneMap, rbp_p, w);
00327         tot_w += w;
00328     }
00329     tot_w /= query.size();
00330     return true;
00331 }
00332 
00333 bool cv_weight(vector<utype> &query, CSparseFlatMatrix<float> &mat,
00334     CSeekIntIntMap *geneMap, float rbp_p, float &tot_w){
00335     //leave one in cross-validation
00336     utype i, j;
00337     int numGenes = geneMap->GetSize();
00338     tot_w = 0;
00339     
00340     for(i=0; i<query.size(); i++){
00341         vector<char> is_query;
00342         vector<float> q_weight;
00343         CSeekTools::InitVector(is_query, numGenes, (char) 0);
00344         CSeekTools::InitVector(q_weight, numGenes, (float) 0);
00345         q_weight[query[i]] = 1.0;
00346         float w = 0;
00347         for(j=0; j<query.size(); j++){
00348             if(i==j) continue;
00349             is_query[query[j]] = 1;
00350         }
00351         vector<float> gene_score;
00352         get_score(gene_score, mat, geneMap, q_weight);
00353         gene_score[query[i]] = 0;
00354         weight(is_query, gene_score, geneMap, rbp_p, w);
00355         //fprintf(stderr, "Q%d %.3f\n", i, w);
00356         //weight_fast(is_query, gene_score, geneMap, w);
00357         tot_w += w;
00358     }
00359     tot_w /= query.size();
00360     return true;    
00361 }
00362 
00363 //accepts fullmatrix, leave one in cross-validation
00364 bool cv_weight(vector<utype> &query, CFullMatrix<float> &mat,
00365 CSeekIntIntMap *geneMap, float rbp_p, float &tot_w){
00366     utype i, j;
00367     int numGenes = geneMap->GetSize();
00368     tot_w = 0;
00369     for(i=0; i<query.size(); i++){
00370         vector<char> is_query;
00371         vector<float> q_weight;
00372         CSeekTools::InitVector(is_query, numGenes, (char) 0);
00373         CSeekTools::InitVector(q_weight, numGenes, (float) 0);
00374         q_weight[query[i]] = 1.0;
00375         float w = 0;
00376         for(j=0; j<query.size(); j++){
00377             if(i==j) continue;
00378             is_query[query[j]] = 1;
00379         }
00380         vector<float> gene_score;
00381         get_score(gene_score, mat, geneMap, q_weight);
00382         gene_score[query[i]] = 0;
00383         weight(is_query, gene_score, geneMap, rbp_p, w);
00384         tot_w += w;
00385     }
00386     tot_w /= query.size();
00387     return true;    
00388 }
00389 
00390 int main(int iArgs, char **aszArgs){
00391     static const size_t c_iBuffer   = 1024;
00392     char acBuffer[c_iBuffer];
00393     
00394     gengetopt_args_info sArgs;
00395     ifstream ifsm;
00396     istream *pistm;
00397     vector<string> vecstrLine, vecstrGenes, vecstrDBs, vecstrQuery;
00398     utype i, qi, j, k, l, kk;
00399     
00400     if(cmdline_parser(iArgs, aszArgs, &sArgs)){
00401         cmdline_parser_print_help();
00402         return 1;
00403     }
00404 
00405     if( sArgs.input_arg ) {
00406         ifsm.open( sArgs.input_arg );
00407         pistm = &ifsm; 
00408     }else
00409         pistm = &cin;
00410     
00411     map<string, size_t> mapstriGenes;
00412     while( !pistm->eof( ) ) {
00413         pistm->getline( acBuffer, c_iBuffer - 1 );
00414         acBuffer[ c_iBuffer - 1 ] = 0;
00415         vecstrLine.clear( );
00416         CMeta::Tokenize( acBuffer, vecstrLine );
00417         if( vecstrLine.size( ) < 2 ) {
00418             //cerr << "Ignoring line: " << acBuffer << endl;
00419             continue;
00420         }
00421         if( !( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) ) {
00422             cerr << "Illegal gene ID: " << vecstrLine[ 0 ] << " for "
00423                 << vecstrLine[ 1 ] << endl;
00424             return 1;
00425         }
00426         i--;
00427         if( vecstrGenes.size( ) <= i )
00428             vecstrGenes.resize( i + 1 );
00429         vecstrGenes[ i ] = vecstrLine[ 1 ];
00430         mapstriGenes[vecstrGenes[i]] = i;
00431     }
00432 
00433     fprintf(stderr, "Finished reading gene map\n");
00434 
00435     string dab_dir = sArgs.dab_dir_arg;
00436     string output_dir = sArgs.dir_out_arg;
00437     int numGenes = vecstrGenes.size();
00438     vector< vector<string> > vecstrAllQuery;
00439     fprintf(stderr, "Reading queries\n");
00440     if(!CSeekTools::ReadMultipleQueries(sArgs.query_arg, vecstrAllQuery))
00441         return -1;
00442     fprintf(stderr, "Finished reading queries\n");
00443 
00444     vector<vector<utype> > qu;
00445     qu.resize(vecstrAllQuery.size());
00446     for(i=0; i<vecstrAllQuery.size(); i++){
00447         qu[i] = vector<utype>();
00448         for(j=0; j<vecstrAllQuery[i].size(); j++)
00449             qu[i].push_back(mapstriGenes[vecstrAllQuery[i][j]]);
00450     }
00451 
00452     vector<vector<vector<string> > > vecstrAllNotQuery;
00453     vecstrAllNotQuery.resize(vecstrAllQuery.size());
00454     string not_query = sArgs.not_query_arg;
00455     if(not_query!="NA"){
00456         fprintf(stderr, "Reading NOT queries\n");
00457         if(!CSeekTools::ReadMultipleNotQueries(sArgs.not_query_arg, 
00458         vecstrAllNotQuery))
00459             return -1;
00460     }
00461 
00462     if(sArgs.visualize_flag==1){
00463         string dab_base = sArgs.dab_basename_arg;
00464         string file1 = dab_dir + "/" + dab_base + ".dab";
00465         float cutoff_par = sArgs.cutoff_arg;
00466         string genome = sArgs.genome_arg;
00467         vector<string> s1, s2;
00468         CSeekTools::ReadListTwoColumns(genome.c_str(), s1, s2);
00469 
00470         CGenome g;
00471         g.Open(s1);
00472         for(i=0; i<s2.size(); i++){
00473             CGene &g1 = g.GetGene(g.FindGene(s1[i]));
00474             g.AddSynonym(g1, s2[i]);
00475         }
00476 
00477         CDat CD;
00478         CD.Open(file1.c_str(), false, 2, false, false, false);
00479 
00480         CSeekIntIntMap d1(CD.GetGenes());
00481         vector<utype> indexConvReverse;
00482         CSeekTools::InitVector(indexConvReverse, vecstrGenes.size(), (utype) -1);   
00483         for(i=0; i<CD.GetGenes(); i++){
00484             map<string,size_t>::iterator it = mapstriGenes.find(CD.GetGene(i));
00485             if(it==mapstriGenes.end()) continue;
00486             indexConvReverse[i] = it->second;
00487             d1.Add(i);
00488         }
00489 
00490         //Visualize
00491         for(j=0; j<vecstrAllQuery.size(); j++){
00492             vector<string> vec_s;
00493             vector<utype> vec_g;
00494             for(k=0; k<vecstrAllQuery[j].size(); k++){
00495                 size_t ind = CD.GetGeneIndex(vecstrAllQuery[j][k]);
00496                 if(ind==(size_t)-1) continue;
00497                 vec_g.push_back(CD.GetGeneIndex(vecstrAllQuery[j][k]));
00498                 vec_s.push_back(vecstrAllQuery[j][k]);
00499             }
00500             CDat V;
00501             V.Open(vec_s);
00502             for(k=0; k<vec_s.size(); k++){
00503                 for(l=k+1; l<vec_s.size(); l++){
00504                     V.Set(k, l, CD.Get(vec_g[k], vec_g[l]));
00505                 }
00506             }
00507             fprintf(stderr, "Query %d\n", j);
00508 
00509             if(sArgs.print_distr_flag==1){
00510                 float min = 9999;
00511                 float max = -1;
00512                 for(k=0; k<vec_s.size(); k++){
00513                     for(l=k+1; l<vec_s.size(); l++){
00514                         float v = CD.Get(vec_g[k], vec_g[l]);
00515                         if(CMeta::IsNaN(v)) continue;
00516                         if(v>max) max = v;
00517                         if(v<min) min = v;
00518                     }
00519                 }
00520                 float init = min;
00521                 float step = (max - min)/100.0;
00522                 float upper = max;
00523                 float cutoff = init;
00524                 while(cutoff<upper){
00525                     int count = 0;
00526                     for(k=0; k<vec_s.size(); k++){
00527                         for(l=k+1; l<vec_s.size(); l++){
00528                             if(!CMeta::IsNaN(V.Get(k,l)) && V.Get(k,l)>=cutoff){
00529                                 count++;
00530                             }
00531                         }
00532                     }
00533                     fprintf(stderr, "%.5f\t%d\n", cutoff, count);
00534                     cutoff+=step;
00535                 }
00536             }
00537 
00538             sprintf(acBuffer, "%s/%d.dot", output_dir.c_str(), j);          
00539             ofstream ot(acBuffer);
00540             V.SaveDOT(ot, cutoff_par, &g, false, true, NULL, NULL);
00541         }
00542 
00543     }
00544 
00545     if(sArgs.combined_flag==1){
00546         string dab_base = sArgs.dab_basename_arg;
00547         string file1 = dab_dir + "/" + dab_base + ".dab";
00548 
00549         vector<vector<float> > q_weight;
00550         vector<vector<vector<float> > > nq_weight;
00551 
00552         q_weight.resize(vecstrAllQuery.size());
00553         for(i=0; i<vecstrAllQuery.size(); i++){
00554             CSeekTools::InitVector(q_weight[i], numGenes, (float) 0);
00555             for(j=0; j<vecstrAllQuery[i].size(); j++)
00556                 q_weight[i][mapstriGenes[vecstrAllQuery[i][j]]] = 1;
00557         }
00558 
00559         nq_weight.resize(vecstrAllNotQuery.size());
00560         if(not_query!="NA"){
00561             for(i=0; i<vecstrAllNotQuery.size(); i++){
00562                 nq_weight[i].resize(vecstrAllNotQuery[i].size());
00563                 for(j=0; j<vecstrAllNotQuery[i].size(); j++){
00564                     CSeekTools::InitVector(nq_weight[i][j], numGenes, (float) 0);
00565                     for(k=0; k<vecstrAllNotQuery[i][j].size(); k++)
00566                         nq_weight[i][j][mapstriGenes[vecstrAllNotQuery[i][j][k]]] = 1;
00567                 }
00568             }
00569         }
00570 
00571         fprintf(stderr, "Start reading dab\n");
00572         CDat CD;
00573         CD.Open(file1.c_str(), false, 2, false, false, false);
00574         fprintf(stderr, "Finished reading dab\n");
00575 
00576         CSeekIntIntMap d1(CD.GetGenes());
00577         vector<utype> indexConvReverse;
00578         CSeekTools::InitVector(indexConvReverse, vecstrGenes.size(), (utype) -1);   
00579         for(i=0; i<CD.GetGenes(); i++){
00580             map<string,size_t>::iterator it = mapstriGenes.find(CD.GetGene(i));
00581             if(it==mapstriGenes.end()) continue;
00582             indexConvReverse[i] = it->second;
00583             d1.Add(i);
00584         }
00585 
00586         vector<vector<float> > final_score;
00587         final_score.resize(vecstrAllQuery.size());
00588         for(j=0; j<vecstrAllQuery.size(); j++)
00589             CSeekTools::InitVector(final_score[j], vecstrGenes.size(), 
00590             (float)CMeta::GetNaN());
00591 
00592         const vector<utype> &allGenes = d1.GetAllReverse();         
00593         for(j=0; j<vecstrAllQuery.size(); j++){
00594             vector<float> tmp_score;
00595             search_one_dab(tmp_score, CD, vecstrGenes.size(), d1, indexConvReverse,
00596             q_weight[j], nq_weight[j]);
00597             for(kk=0; kk<d1.GetNumSet(); kk++){
00598                 utype k = allGenes[kk];
00599                 utype gi = indexConvReverse[k];
00600                 final_score[j][gi] = tmp_score[gi];
00601             }
00602         }
00603         fprintf(stderr, "Writing output\n");
00604 
00605         for(j=0; j<vecstrAllQuery.size(); j++){
00606             for(k=0; k<final_score[j].size(); k++){
00607                 if(CMeta::IsNaN(final_score[j][k])){
00608                     final_score[j][k] = -320;
00609                     continue;
00610                 }
00611             }
00612             sprintf(acBuffer, "%s/%d.query", output_dir.c_str(), j);
00613             CSeekTools::WriteArrayText(acBuffer, vecstrAllQuery[j]);
00614             sprintf(acBuffer, "%s/%d.gscore", output_dir.c_str(), j);
00615             CSeekTools::WriteArray(acBuffer, final_score[j]);
00616         }
00617 
00618         fprintf(stderr, "Finished with search\n");
00619         if(sArgs.print_distr_flag==0 && sArgs.generate_dot_flag==0){
00620             return 0;
00621         }
00622 
00623         float cutoff_par = sArgs.cutoff_arg;
00624         string genome = sArgs.genome_arg;
00625         vector<string> s1, s2;
00626         CSeekTools::ReadListTwoColumns(genome.c_str(), s1, s2);
00627 
00628         CGenome g;
00629         g.Open(s1);
00630         for(i=0; i<s2.size(); i++){
00631             CGene &g1 = g.GetGene(g.FindGene(s1[i]));
00632             g.AddSynonym(g1, s2[i]);
00633         }
00634 
00635         //Visualize
00636         for(j=0; j<vecstrAllQuery.size(); j++){
00637             //fprintf(stderr, "Query %d\n", j);
00638             vector<AResultFloat> ar;
00639             ar.resize(final_score[j].size());
00640             for(k=0; k<final_score[j].size(); k++){
00641                 ar[k].i = k;
00642                 ar[k].f = final_score[j][k];
00643             }
00644             sort(ar.begin(), ar.end());
00645             vector<utype> vec_g;
00646             vector<string> vec_s;
00647             vector<float> node_w;
00648             int FIRST = sArgs.top_genes_arg;
00649             for(k=0; k<FIRST; k++){
00650                 if(ar[k].f==-320) break;
00651                 vec_g.push_back(CD.GetGeneIndex(vecstrGenes[ar[k].i]));
00652                 vec_s.push_back(vecstrGenes[ar[k].i]);
00653                 node_w.push_back(0);
00654             }
00655             //if(vec_g.size()!=FIRST) continue;
00656             for(k=0; k<vecstrAllQuery[j].size(); k++){
00657                 size_t ind = CD.GetGeneIndex(vecstrAllQuery[j][k]);
00658                 if(ind==(size_t)-1) continue;
00659                 vec_g.push_back(ind);
00660                 vec_s.push_back(vecstrAllQuery[j][k]);
00661                 node_w.push_back(1.0);
00662             }
00663 
00664             CDat V;
00665             V.Open(vec_s);
00666             for(k=0; k<vec_s.size(); k++){
00667                 for(l=k+1; l<vec_s.size(); l++){
00668                     float v = CD.Get(vec_g[k], vec_g[l]);
00669                     V.Set(k, l, v);
00670                 }
00671             }
00672 
00673             if(sArgs.print_distr_flag==1){
00674                 float min = 9999;
00675                 float max = -1;
00676                 for(k=0; k<vec_s.size(); k++){
00677                     for(l=k+1; l<vec_s.size(); l++){
00678                         float v = CD.Get(vec_g[k], vec_g[l]);
00679                         if(CMeta::IsNaN(v)) continue;
00680                         if(v>max) max = v;
00681                         if(v<min) min = v;
00682                     }
00683                 }
00684 
00685                 float init = min;
00686                 float step = (max - min)/100.0;
00687                 float upper = max;
00688 
00689                 float cutoff = init;
00690                 while(cutoff<upper){
00691                     int count = 0;
00692                     for(k=0; k<vec_s.size(); k++){
00693                         for(l=k+1; l<vec_s.size(); l++){
00694                             if(!CMeta::IsNaN(V.Get(k,l)) && V.Get(k,l)>=cutoff){
00695                                 count++;
00696                             }
00697                         }
00698                     }
00699                     fprintf(stderr, "%.5f\t%d\n", cutoff, count);
00700                     cutoff+=step;
00701                 }
00702             }
00703 
00704             if(sArgs.generate_dot_flag==1){
00705                 sprintf(acBuffer, "%s/%d.dot", output_dir.c_str(), j);          
00706                 ofstream ot(acBuffer);
00707                 V.SaveDOT(ot, cutoff_par, &g, false, true, &node_w, NULL);
00708                 //V.SaveDOT(ot, 0.0001, NULL, true, false, NULL, NULL);
00709             }
00710         }
00711         
00712     }
00713 
00714     if(sArgs.test_flag==1){
00715         string dab_base = sArgs.dab_basename_arg;
00716         string file1 = dab_dir + "/" + dab_base + ".half";
00717 
00718         vector<vector<float> > q_weight;
00719         q_weight.resize(vecstrAllQuery.size());
00720         for(i=0; i<vecstrAllQuery.size(); i++){
00721             CSeekTools::InitVector(q_weight[i], numGenes, (float) 0);
00722             for(j=0; j<vecstrAllQuery[i].size(); j++)
00723                 q_weight[i][mapstriGenes[vecstrAllQuery[i][j]]] = 1;
00724         }
00725 
00726         CHalfMatrix<float> res;
00727         res.Initialize(vecstrGenes.size());
00728         ifstream istm1;
00729         uint32_t dim;
00730         istm1.open(file1.c_str(), ios_base::binary);
00731         istm1.read((char*)(&dim), sizeof(dim));
00732         float *adScores = new float[dim-1];
00733         for(i=0; (i+1)<dim; i++){
00734             istm1.read((char*)adScores, sizeof(*adScores) * (dim - i - 1));
00735             res.Set(i, adScores);
00736         }
00737         delete[] adScores;
00738         istm1.close();
00739     
00740         string s1 = "uc003vor.2";
00741         vector<string> r1;
00742         r1.push_back("uc003vos.2");
00743         r1.push_back("uc003vop.1");
00744         r1.push_back("uc011jwz.1");
00745         r1.push_back("uc002rgw.1");
00746 
00747         for(i=0; i<r1.size(); i++){
00748             size_t i1 = mapstriGenes[s1];
00749             size_t j1 = mapstriGenes[r1[i]];
00750             fprintf(stderr, "%s %s %.5f\n", s1.c_str(), r1[i].c_str(), res.Get(i1, j1));
00751         }
00752     }
00753 
00754     if(sArgs.testcount_flag==1){
00755         string dab_base = sArgs.dab_basename_arg;
00756         string file1 = dab_dir + "/" + dab_base + ".pair_count";
00757 
00758         vector<vector<float> > q_weight;
00759         q_weight.resize(vecstrAllQuery.size());
00760         for(i=0; i<vecstrAllQuery.size(); i++){
00761             CSeekTools::InitVector(q_weight[i], numGenes, (float) 0);
00762             for(j=0; j<vecstrAllQuery[i].size(); j++)
00763                 q_weight[i][mapstriGenes[vecstrAllQuery[i][j]]] = 1;
00764         }
00765 
00766         CHalfMatrix<unsigned short> res;
00767         res.Initialize(vecstrGenes.size());
00768         ifstream istm1;
00769         uint32_t dim;
00770         istm1.open(file1.c_str(), ios_base::binary);
00771         istm1.read((char*)(&dim), sizeof(dim));
00772         unsigned short *adScores = new unsigned short[dim-1];
00773         for(i=0; (i+1)<dim; i++){
00774             istm1.read((char*)adScores, sizeof(*adScores) * (dim - i - 1));
00775             res.Set(i, adScores);
00776         }
00777         delete[] adScores;
00778         istm1.close();
00779     
00780         string s1 = "uc003vor.2";
00781         vector<string> r1;
00782         r1.push_back("uc003vos.2");
00783         r1.push_back("uc003vop.1");
00784         r1.push_back("uc011jwz.1");
00785         r1.push_back("uc002rgw.1");
00786 
00787         for(i=0; i<r1.size(); i++){
00788             size_t i1 = mapstriGenes[s1];
00789             size_t j1 = mapstriGenes[r1[i]];
00790             fprintf(stderr, "%s %s %d\n", s1.c_str(), r1[i].c_str(), res.Get(i1, j1));
00791         }
00792     }
00793 
00794     if(sArgs.testcombined_flag==1){
00795         string dab_base = sArgs.dab_basename_arg;
00796         string file3 = dab_dir + "/" + dab_base + ".gene_count";
00797         string file2 = dab_dir + "/" + dab_base + ".pair_count";
00798         string file1 = dab_dir + "/" + dab_base + ".half";
00799 
00800         vector<utype> gene_count;
00801         CSeekTools::ReadArray(file3.c_str(), gene_count);
00802 
00803         float max_count = *max_element(gene_count.begin(), gene_count.end());
00804         float min_count_required = max_count*0.50;
00805         CSeekIntIntMap d1(vecstrGenes.size());
00806         vector<string> validGenes;
00807         for(i=0; i<vecstrGenes.size(); i++){
00808             if(gene_count[i]<min_count_required) continue;
00809             validGenes.push_back(vecstrGenes[i]);
00810             d1.Add(i);
00811         }
00812         
00813         CDat CD;
00814         CD.Open(validGenes);
00815         for(i=0; i<validGenes.size(); i++){
00816             for(j=i+1; j<validGenes.size(); j++){
00817                 CD.Set(i,j,CMeta::GetNaN());
00818             }
00819         }
00820         //CHalfMatrix<float> res;
00821         //res.Initialize(vecstrGenes.size());
00822         ifstream istm1, istm2;
00823         uint32_t dim;
00824         istm1.open(file1.c_str(), ios_base::binary);
00825         istm1.read((char*)(&dim), sizeof(dim));
00826         istm2.open(file2.c_str(), ios_base::binary);
00827         istm2.read((char*)(&dim), sizeof(dim));
00828         float *adScores = new float[dim-1];
00829         unsigned short *adScores2 = new unsigned short[dim-1];
00830         for(i=0; (i+1)<dim; i++){
00831             istm1.read((char*)adScores, sizeof(*adScores) * (dim - i - 1));
00832             istm2.read((char*)adScores2, sizeof(*adScores2) * (dim - i - 1));
00833             if(i%2000==0)
00834                 fprintf(stderr, "%d Finished\n", i);
00835             utype gi = d1.GetForward(i);
00836             if(CSeekTools::IsNaN(gi)) continue;
00837 
00838             for(j=0; j<dim-i-1; j++){
00839                 utype gj = d1.GetForward(j+i+1);
00840                 if(CSeekTools::IsNaN(gj) || 
00841                     (float) adScores2[j]<min_count_required) continue;
00842                 adScores[j] = adScores[j] / (float) adScores2[j];
00843                 CD.Set(gi, gj, adScores[j]);
00844             }
00845     
00846             //res.Set(i, adScores);
00847         }
00848         delete[] adScores;
00849         delete[] adScores2;
00850         istm1.close();
00851         istm2.close();
00852     
00853         string s1 = "uc003vor.2";
00854         vector<string> r1;
00855         r1.push_back("uc003vos.2");
00856         r1.push_back("uc003vop.1");
00857         r1.push_back("uc011jwz.1");
00858         r1.push_back("uc002rgw.1");
00859 
00860         for(i=0; i<r1.size(); i++){
00861             //size_t i1 = mapstriGenes[s1];
00862             //size_t j1 = mapstriGenes[r1[i]];
00863             size_t i1 = CD.GetGeneIndex(s1);
00864             size_t j1 = CD.GetGeneIndex(r1[i]);
00865             fprintf(stderr, "%s %s %.5f\n", s1.c_str(), r1[i].c_str(), CD.Get(i1, j1));
00866         }
00867     }
00868 
00869     //Traditional DAB mode (.dab file)
00870     if(sArgs.tdab_flag==1){
00871         string search_mode = sArgs.tsearch_mode_arg;
00872         if(search_mode=="NA"){
00873             fprintf(stderr, "Please specify a search mode!\n");
00874             return 1;
00875         }
00876         vector<string> dab_list;
00877         CSeekTools::ReadListOneColumn(sArgs.tdab_list_arg, dab_list);
00878 
00879         fprintf(stderr, "Finished reading dablist\n");
00880         //preparing query
00881         vector<vector<float> > q_weight;
00882         q_weight.resize(vecstrAllQuery.size());
00883         for(i=0; i<vecstrAllQuery.size(); i++){
00884             CSeekTools::InitVector(q_weight[i], numGenes, (float) 0);
00885             for(j=0; j<vecstrAllQuery[i].size(); j++)
00886                 q_weight[i][mapstriGenes[vecstrAllQuery[i][j]]] = 1;
00887         }
00888 
00889         //preparing query2
00890         vector<vector<unsigned int> > qq;
00891         qq.resize(vecstrAllQuery.size());
00892         for(i=0; i<vecstrAllQuery.size(); i++){
00893             qq[i] = vector<unsigned int>();
00894             for(j=0; j<vecstrAllQuery[i].size(); j++)
00895                 qq[i].push_back(mapstriGenes[vecstrAllQuery[i][j]]);
00896         }
00897 
00898         //selected datasets for each query
00899         vector<vector<char> > selectedDataset;
00900         selectedDataset.resize(vecstrAllQuery.size());
00901         for(i=0; i<vecstrAllQuery.size(); i++)
00902             CSeekTools::InitVector(selectedDataset[i], dab_list.size(), (char)0);
00903         
00904         fprintf(stderr, "Reading DAB\n");
00905         vector<vector<float> > final_score, count, dweight;
00906         vector<vector<int> > freq;
00907         final_score.resize(vecstrAllQuery.size());
00908         count.resize(vecstrAllQuery.size());
00909         freq.resize(vecstrAllQuery.size());
00910         dweight.resize(vecstrAllQuery.size());
00911         for(j=0; j<vecstrAllQuery.size(); j++){
00912             CSeekTools::InitVector(final_score[j], vecstrGenes.size(), (float)CMeta::GetNaN());
00913             CSeekTools::InitVector(count[j], vecstrGenes.size(), (float) 0);
00914             CSeekTools::InitVector(freq[j], vecstrGenes.size(), (int) 0);
00915             CSeekTools::InitVector(dweight[j], dab_list.size(), (float) 0);
00916         }
00917 
00918         float threshold_g = sArgs.threshold_g_arg;
00919         float threshold_q = sArgs.threshold_q_arg;
00920         bool DEBUG = !!sArgs.debug_flag;
00921 
00922         size_t l;
00923         for(l=0; l<dab_list.size(); l++){
00924             CDat Dat;
00925             fprintf(stderr, "Reading %d: %s\n", l, dab_list[l].c_str());
00926             string dabfile = dab_dir + "/" + dab_list[l];
00927             Dat.Open(dabfile.c_str(), false, 2, false, false, false);
00928 
00929             fprintf(stderr, "Finished reading DAB\n");
00930             vector<unsigned int> veciGenes;
00931             veciGenes.resize(vecstrGenes.size());
00932             unsigned int ki;
00933             for(ki=0; ki<vecstrGenes.size(); ki++)
00934                 veciGenes[ki] = (unsigned int) Dat.GetGeneIndex(vecstrGenes[ki]);
00935             unsigned int s,t;
00936             float d;
00937             CSeekIntIntMap m(vecstrGenes.size());
00938             for(i=0; i<vecstrGenes.size(); i++){
00939                 if((s=veciGenes[i])==(unsigned int)-1) continue;
00940                 m.Add(i);
00941             }
00942 
00943             //Copy to matrix sm
00944             CFullMatrix<float> sm;
00945             sm.Initialize(vecstrGenes.size(), vecstrGenes.size());
00946             const vector<utype> &allRGenes = m.GetAllReverse();
00947             for(i=0; i<m.GetNumSet(); i++){
00948                 unsigned int si = allRGenes[i];
00949                 s = veciGenes[si];
00950                 for(j=i+1; j<m.GetNumSet(); j++){
00951                     unsigned int tj = allRGenes[j];
00952                     t = veciGenes[allRGenes[j]];
00953                     if(CMeta::IsNaN(d = Dat.Get(s,t))){
00954                         sm.Set(si, tj, 0);
00955                         sm.Set(tj, si, 0);
00956                     }else{
00957                         sm.Set(si, tj, d);
00958                         sm.Set(tj, si, d);
00959                     }
00960                 }
00961                 sm.Set(si, si, 0);
00962             }
00963 
00964             fprintf(stderr, "Finished copying matrix\n");
00965 
00966             for(j=0; j<vecstrAllQuery.size(); j++){
00967                 int numPresent = 0;
00968                 for(k=0; k<qq[j].size(); k++){
00969                     if(m.GetForward(qq[j][k])==(unsigned int)-1) continue;
00970                     numPresent++;
00971                 }
00972                 if(search_mode=="eq" && numPresent==0){
00973                     continue;
00974                 }else if(search_mode=="cv_loi" && numPresent<=1){
00975                     continue;
00976                 }
00977                 int numThreshold = (int) (threshold_q * qq[j].size());
00978                 if(numPresent>numThreshold)
00979                     selectedDataset[j][l] = 1;
00980             }
00981 
00982             for(j=0; j<vecstrAllQuery.size(); j++){
00983                 //not enough query genes present
00984                 if(selectedDataset[j][l]==0) continue;
00985 
00986                 float dw = 1.0;
00987                 if(search_mode=="eq"){
00988                     dw = 1.0;
00989                 }else{ //cv_loi, rbp_p = 0.99
00990                     cv_weight(qu[j], sm, &m, 0.99, dw);
00991                 }
00992                 dweight[j][l] = dw;
00993                 vector<float> tmp_score;
00994                 get_score(tmp_score, sm, &m, q_weight[j]);
00995                 for(k=0; k<m.GetNumSet(); k++){
00996                     utype gi = allRGenes[k];
00997                     if(CMeta::IsNaN(final_score[j][gi]))
00998                         final_score[j][gi] = 0;
00999                     final_score[j][gi] += tmp_score[gi] * dw;
01000                     count[j][gi]+=dw;
01001                     freq[j][gi]++;
01002                 }   
01003             }
01004 
01005         }
01006 
01007         for(j=0; j<vecstrAllQuery.size(); j++){
01008 
01009             int countSelected = 0;
01010             for(k=0; k<selectedDataset[j].size(); k++)
01011                 countSelected+=selectedDataset[j][k];
01012 
01013             //int minRequired = (int) ((float) dab_list.size() * 0.50);
01014             int minRequired = (int) ((float) countSelected * threshold_g);
01015 
01016             if(DEBUG){
01017                 int nG = 0;
01018                 for(k=0; k<final_score[j].size(); k++){
01019                     if(freq[j][k]>=minRequired){
01020                         nG++;
01021                     }
01022                 }
01023                 fprintf(stderr, "Query %d numSelectedDataset %d numGenesIntegrated %d\n", j, countSelected, nG);
01024             }
01025 
01026             for(k=0; k<final_score[j].size(); k++){
01027                 if(freq[j][k]<minRequired){
01028                     final_score[j][k] = -320;
01029                     continue;
01030                 }
01031                 if(CMeta::IsNaN(final_score[j][k])){
01032                     final_score[j][k] = -320;
01033                     continue;
01034                 }
01035                 final_score[j][k] /= count[j][k];
01036             }
01037             sprintf(acBuffer, "%s/%d.query", output_dir.c_str(), j);
01038             CSeekTools::WriteArrayText(acBuffer, vecstrAllQuery[j]);
01039             sprintf(acBuffer, "%s/%d.gscore", output_dir.c_str(), j);
01040             CSeekTools::WriteArray(acBuffer, final_score[j]);
01041             sprintf(acBuffer, "%s/%d.dweight", output_dir.c_str(), j);
01042             CSeekTools::WriteArray(acBuffer, dweight[j]);
01043         }
01044         return 0;
01045     }
01046 
01047 
01048     //DAB mode (sparse DAB: .2.dab)
01049     if(sArgs.dab_flag==1){
01050         float threshold_g = sArgs.threshold_g_arg;
01051         float threshold_q = sArgs.threshold_q_arg;
01052         bool DEBUG = !!sArgs.debug_flag;
01053 
01054         string search_mode = sArgs.tsearch_mode_arg;
01055         if(search_mode=="NA"){
01056             fprintf(stderr, "Please specify a search mode!\n");
01057             return 1;
01058         }
01059 
01060         string norm_mode = sArgs.norm_mode_arg;
01061         if(sArgs.default_type_arg!=0 && sArgs.default_type_arg!=1){
01062             fprintf(stderr, "Error, invalid type!\n");
01063             return -1;
01064         }
01065 
01066         if(norm_mode=="NA"){
01067             fprintf(stderr, "Error, please specify a norm mode!\n");
01068             return -1;
01069         }
01070 
01071         float exp = sArgs.exp_arg;
01072         if(norm_mode=="rank"){
01073             if(sArgs.max_rank_arg==-1){
01074                 fprintf(stderr, "Error, please supply the max rank flag.\n");
01075                 return -1;
01076             }
01077             if(sArgs.rbp_p_arg==-1){
01078                 fprintf(stderr, "Error, please supply the rbp_p flag.\n");
01079                 return -1;
01080             }
01081         }else if(norm_mode=="subtract_z"){
01082             if(exp==-1){
01083                 fprintf(stderr, "Invalid exponent!\n");
01084                 return -1;
01085             }
01086             if(sArgs.rbp_p_arg==-1){
01087                 fprintf(stderr, "Error, please supply the rbp_p flag.\n");
01088                 return -1;
01089             }
01090         }
01091 
01092         vector<float> score_cutoff;
01093         bool bDatasetCutoff = false;
01094         string dset_cutoff_file = sArgs.dset_cutoff_file_arg;
01095         if(dset_cutoff_file!="NA"){
01096             CSeekTools::ReadArray(dset_cutoff_file.c_str(), score_cutoff);
01097             bDatasetCutoff = true;
01098             fprintf(stderr, "Filtered mode is on!\n");
01099         }
01100 
01101         float rbp_p = sArgs.rbp_p_arg;
01102         int max_rank = sArgs.max_rank_arg;
01103         int num_iter = sArgs.num_iter_arg;
01104         vector<string> dab_list;
01105         CSeekTools::ReadListOneColumn(sArgs.dab_list_arg, dab_list);
01106         vector<CSeekIntIntMap*> dm;
01107         dm.resize(dab_list.size());
01108 
01109         //selected datasets for each query
01110         vector<vector<char> > selectedDataset;
01111         selectedDataset.resize(vecstrAllQuery.size());
01112         for(i=0; i<vecstrAllQuery.size(); i++)
01113             CSeekTools::InitVector(selectedDataset[i], dab_list.size(), (char)0);
01114         
01115         //MODE 1 - Normal search:
01116         //preparing query
01117         vector<vector<float> > q_weight;
01118         q_weight.resize(vecstrAllQuery.size());
01119         for(i=0; i<vecstrAllQuery.size(); i++){
01120             CSeekTools::InitVector(q_weight[i], numGenes, (float) 0);
01121             for(j=0; j<vecstrAllQuery[i].size(); j++)
01122                 q_weight[i][mapstriGenes[vecstrAllQuery[i][j]]] = 1;
01123         }
01124 
01125         //preparing query2
01126         vector<vector<unsigned int> > qq;
01127         qq.resize(vecstrAllQuery.size());
01128         for(i=0; i<vecstrAllQuery.size(); i++){
01129             qq[i] = vector<unsigned int>();
01130             for(j=0; j<vecstrAllQuery[i].size(); j++)
01131                 qq[i].push_back(mapstriGenes[vecstrAllQuery[i][j]]);
01132         }
01133         
01134         fprintf(stderr, "Reading sparse DAB\n");
01135         vector<vector<float> > final_score, count;
01136         vector<vector<float> > dweight;
01137         vector<vector<int> > freq;
01138         final_score.resize(vecstrAllQuery.size());
01139         count.resize(vecstrAllQuery.size());
01140         freq.resize(vecstrAllQuery.size());
01141         dweight.resize(vecstrAllQuery.size());
01142         for(j=0; j<vecstrAllQuery.size(); j++){
01143             CSeekTools::InitVector(final_score[j], vecstrGenes.size(), (float)CMeta::GetNaN());
01144             CSeekTools::InitVector(count[j], vecstrGenes.size(), (float) 0);
01145             CSeekTools::InitVector(freq[j], vecstrGenes.size(), (int) 0);
01146             CSeekTools::InitVector(dweight[j], dab_list.size(), (float) 0);
01147         }
01148 
01149         if(bDatasetCutoff)  fprintf(stderr, "Dataset cutoff is on!\n");
01150         else    fprintf(stderr, "Dataset cutoff is off!\n");
01151         
01152 
01153         for(i=0; i<dab_list.size(); i++){
01154             fprintf(stderr, "Reading %d: %s\n", i, dab_list[i].c_str());
01155             CSeekIntIntMap d1(vecstrGenes.size());
01156             string dabfile = dab_dir + "/" + dab_list[i];
01157             CSparseFlatMatrix<float> sm (0);
01158 
01159             if(norm_mode=="rank"){
01160                 if(sArgs.default_type_arg==0) //utype
01161                     CSeekWriter::ReadSeekSparseMatrix<utype>(dabfile.c_str(), sm, d1, 
01162                     max_rank, rbp_p, vecstrGenes);
01163                 else
01164                     CSeekWriter::ReadSeekSparseMatrix<unsigned short>(dabfile.c_str(), 
01165                     sm, d1, max_rank, rbp_p, vecstrGenes);
01166             }else if(norm_mode=="subtract_z"){
01167                 if(sArgs.default_type_arg==0) //utype
01168                     CSeekWriter::ReadSeekSparseMatrix<utype>(dabfile.c_str(), sm, d1, 
01169                     vecstrGenes, 1000, exp);
01170                 else
01171                     CSeekWriter::ReadSeekSparseMatrix<unsigned short>(dabfile.c_str(), sm, d1, 
01172                     vecstrGenes, 1000, exp);
01173             }
01174             const vector<utype> &allGenes = d1.GetAllReverse();
01175 
01176             for(j=0; j<vecstrAllQuery.size(); j++){
01177                 int numPresent = 0;
01178                 for(k=0; k<qq[j].size(); k++){
01179                     if(d1.GetForward(qq[j][k])==(unsigned int)-1) continue;
01180                     numPresent++;
01181                 }
01182                 if(search_mode=="eq" && numPresent==0){
01183                     continue;
01184                 }else if(search_mode=="cv_loi" && numPresent<=1){
01185                     continue;
01186                 }
01187                 int numThreshold = (int) (threshold_q * qq[j].size());
01188                 if(numPresent>numThreshold)
01189                     selectedDataset[j][i] = 1;
01190             }
01191 
01192             #pragma omp parallel for \
01193             shared(qu, sm, d1, dweight, final_score, count, freq, score_cutoff) \
01194             private(j, k) firstprivate(bDatasetCutoff, i) schedule(dynamic)
01195             for(j=0; j<vecstrAllQuery.size(); j++){
01196                 //not enough query genes present
01197                 if(selectedDataset[j][i]==0) continue;
01198 
01199                 float dw = 1.0;
01200                 if(search_mode=="eq"){
01201                     dw = 1.0;
01202                 }else{
01203                     //cv_weight_LOO(qu[j], sm, &d1, rbp_p, dw);
01204                     cv_weight(qu[j], sm, &d1, rbp_p, dw);
01205                 }
01206 
01207                 if(bDatasetCutoff){
01208                     if(score_cutoff[i]>dw)
01209                         dw = 0;
01210                     //fprintf(stderr, "%.3e %.3e\n", score_cutoff[i], dw);
01211                 }
01212                 //fprintf(stderr, "%.3e\n", dw);
01213                 dweight[j][i] = dw;
01214                 //if(dw==0) 
01215                 //  continue;
01216                 vector<float> tmp_score;
01217                 get_score(tmp_score, sm, &d1, q_weight[j]);
01218                 for(k=0; k<d1.GetNumSet(); k++){
01219                     utype gi = allGenes[k];
01220                     if(CMeta::IsNaN(final_score[j][gi]))
01221                         final_score[j][gi] = 0;
01222                     final_score[j][gi] += tmp_score[gi] * dw;
01223                     count[j][gi]+=dw;
01224                     freq[j][gi]++;
01225                 }
01226             }
01227         }
01228 
01229         /*
01230         ofstream ofsm;
01231         ofsm.open("/memex/qzhu/p4/concatenate_tumor_network.half", ios_base::binary);
01232         res.Save(ofsm, true);
01233         */
01234         
01235         for(j=0; j<vecstrAllQuery.size(); j++){
01236             int countSelected = 0;
01237             for(k=0; k<selectedDataset[j].size(); k++)
01238                 countSelected+=selectedDataset[j][k];
01239 
01240             int minRequired = (int) ((float) countSelected * threshold_g);
01241 
01242             if(DEBUG){
01243                 int nG = 0;
01244                 for(k=0; k<final_score[j].size(); k++){
01245                     if(freq[j][k]>=minRequired){
01246                         nG++;
01247                     }
01248                 }
01249                 fprintf(stderr, "Query %d numSelectedDataset %d numGenesIntegrated %d\n", j, countSelected, nG);
01250             }
01251 
01252             for(k=0; k<final_score[j].size(); k++){
01253                 if(freq[j][k]<minRequired){
01254                     final_score[j][k] = -320;
01255                     continue;
01256                 }
01257                 if(CMeta::IsNaN(final_score[j][k])){
01258                     final_score[j][k] = -320;
01259                     continue;
01260                 }
01261                 final_score[j][k] /= count[j][k];
01262             }
01263 
01264             sprintf(acBuffer, "%s/%d.query", output_dir.c_str(), j);
01265             CSeekTools::WriteArrayText(acBuffer, vecstrAllQuery[j]);
01266             sprintf(acBuffer, "%s/%d.gscore", output_dir.c_str(), j);
01267             CSeekTools::WriteArray(acBuffer, final_score[j]);
01268             sprintf(acBuffer, "%s/%d.dweight", output_dir.c_str(), j);
01269             CSeekTools::WriteArray(acBuffer, dweight[j]);
01270         }
01271     
01272         //MODE 2
01273         /*vector<vector<utype> > qu;
01274         qu.resize(vecstrAllQuery.size());
01275         for(i=0; i<vecstrAllQuery.size(); i++){
01276             qu[i] = vector<utype>();
01277             for(j=0; j<vecstrAllQuery[i].size(); j++)
01278                 qu[i].push_back(mapstriGenes[vecstrAllQuery[i][j]]);
01279         }
01280         vector<vector<float> > q_weight;
01281         q_weight.resize(vecstrAllQuery.size());
01282         for(i=0; i<vecstrAllQuery.size(); i++){
01283             CSeekTools::InitVector(q_weight[i], numGenes, (float) 0);
01284             for(j=0; j<vecstrAllQuery[i].size(); j++)
01285                 q_weight[i][mapstriGenes[vecstrAllQuery[i][j]]] = 1;
01286         }
01287 
01288         float alpha1 = 0.1; //inter - propagation
01289         float alpha2 = 0.1; //intra - propagation
01290         //float alpha = 0;      
01291         bool label_propagate = false;
01292 
01293         for(i=0; i<dab_list.size(); i++){
01294             CSeekIntIntMap d1(vecstrGenes.size());
01295             vector<vector<float> > sm1;
01296             vector<utype> l1;
01297             string dabfile1 = dab_dir + "/" + dab_list[i];
01298             CSeekWriter::ReadSparseMatrixAsArray(l1, dabfile1.c_str());
01299             CSeekWriter::ReadSparseMatrix(l1, sm1, d1, 1000, 0.99, vecstrGenes);
01300 
01301             cerr << "1 " << dabfile1 << endl;
01302             vector<vector<float> > final_score;
01303             final_score.resize(vecstrAllQuery.size());
01304             
01305             //vector<vector<float> > tmp_score;
01306             //tmp_score.resize(vecstrAllQuery.size());
01307     
01308             vector<vector<float> > tmp_score_2;
01309             tmp_score_2.resize(vecstrAllQuery.size());
01310 
01311             //this score
01312             //component 1
01313             for(j=0; j<vecstrAllQuery.size(); j++){
01314                 //get_score(tmp_score[j], sm1, &d1, q_weight[j]);
01315                 init_score(tmp_score_2[j], &d1);
01316                 init_score(final_score[j], &d1);
01317             }
01318 
01319             for(j=0; label_propagate && j<dab_list.size(); j++){
01320                 if(i==j) continue;
01321                 CSeekIntIntMap d2(vecstrGenes.size());
01322                 vector<vector<float> > sm2;
01323                 vector<utype> l2;
01324                 string dabfile2 = dab_dir + "/" + dab_list[j];
01325                 cerr << "2 " << dabfile2 << endl;
01326                 CSeekWriter::ReadSparseMatrixAsArray(l2, dabfile2.c_str());
01327                 CSeekWriter::ReadSparseMatrix(l2, sm2, d2, 1000, 0.99, vecstrGenes);
01328 
01329                 //similarity matrix
01330                 vector<vector<float> > sim;
01331                 CSeekWriter::ProductNorm(sm1, sm2, d1, d2, sim);
01332 
01333                 utype kj, kk;
01334                 for(k=0; k<vecstrAllQuery.size(); k++){
01335                     //component 2
01336                     cerr << k << " of " << vecstrAllQuery.size() << endl;
01337                     vector<float> intra;
01338                     vector<float> inter;
01339                     get_score(intra, sm2, &d2, q_weight[k]);
01340                     get_score(inter, sim, &d2, intra);
01341                     //get_score(inter, sim, &d2, q_weight[k]);
01342                     add_score(inter, tmp_score_2[k], &d2);
01343                 }
01344             }
01345 
01346             for(j=0; j<vecstrAllQuery.size(); j++){
01347                 if(label_propagate){
01348                     vector<float> tmp3;
01349                     integrate(q_weight[j], tmp_score_2[j], tmp3, &d1, dab_list.size()-1, alpha1);
01350                     iterate(tmp3, final_score[j], sm1, &d1, alpha2, 1);
01351                 }else{
01352                     //integrate(q_weight[j], tmp_score_2[j], tmp3, &d1, dab_list.size()-1, alpha1);
01353                     iterate(q_weight[j], final_score[j], sm1, &d1, alpha2, 1);
01354                 }
01355 
01356                 for(k=0; k<final_score[j].size(); k++){
01357                     if(CMeta::IsNaN(final_score[j][k])){
01358                         final_score[j][k] = -320;
01359                         continue;
01360                     }
01361                 }
01362 
01363                 for(k=0; k<qu[j].size(); k++){
01364                     final_score[j][qu[j][k]] = -320;
01365                 }
01366 
01367                 sprintf(acBuffer, "%s/%s/%d.query", output_dir.c_str(), dab_list[i].c_str(), j);
01368                 CSeekTools::WriteArrayText(acBuffer, vecstrAllQuery[j]);
01369                 sprintf(acBuffer, "%s/%s/%d.gscore", output_dir.c_str(), dab_list[i].c_str(), j);
01370                 CSeekTools::WriteArray(acBuffer, final_score[j]);
01371             }
01372         }*/
01373         
01374     }
01375     
01376 }