Sleipnir
|
00001 #include "stdafx.h" 00002 #include "cmdline.h" 00003 00004 bool transfer(CDat &Dat, vector<vector<float> > &mat, CSeekIntIntMap *geneMap, 00005 vector<string> &vecstrGenes){ 00006 00007 int i, j; 00008 vector<utype> veciGenes; 00009 veciGenes.clear(); 00010 veciGenes.resize(vecstrGenes.size()); 00011 for( i = 0; i < vecstrGenes.size( ); ++i ) 00012 veciGenes[ i ] = Dat.GetGene( vecstrGenes[i] ); 00013 00014 mat.resize(vecstrGenes.size()); 00015 for(i=0; i<vecstrGenes.size(); i++) 00016 CSeekTools::InitVector(mat[i], vecstrGenes.size(), CMeta::GetNaN()); 00017 00018 for(i=0; i<vecstrGenes.size(); i++){ 00019 utype s = veciGenes[i]; 00020 if(CSeekTools::IsNaN(s)) continue; 00021 geneMap->Add(i); 00022 float *v = Dat.GetFullRow(s); 00023 for(j=0; j<vecstrGenes.size(); j++){ 00024 utype t = veciGenes[j]; 00025 if(CSeekTools::IsNaN(t)) continue; 00026 if(CMeta::IsNaN(v[t])) continue; 00027 mat[i][j] = Dat.Get(s, t); 00028 } 00029 free(v); 00030 } 00031 return true; 00032 } 00033 00034 //Add scores from two vectors and integrate them using alpha 00035 bool integrate(vector<float> &d1, vector<float> &d2, 00036 vector<float> &dest, CSeekIntIntMap *geneMap, int k, float alpha){ 00037 utype i; 00038 CSeekTools::InitVector(dest, geneMap->GetSize(), (float) CMeta::GetNaN()); 00039 const vector<utype> &allGenes = geneMap->GetAllReverse(); 00040 for(i=0; i<geneMap->GetNumSet(); i++){ 00041 utype gi = allGenes[i]; 00042 dest[gi] = (1.0 - alpha) * d1[gi] + alpha / k * d2[gi]; 00043 } 00044 return true; 00045 } 00046 00047 //initialize score vector 00048 bool init_score(vector<float> &dest, CSeekIntIntMap *geneMap){ 00049 CSeekTools::InitVector(dest, geneMap->GetSize(), (float) CMeta::GetNaN()); 00050 utype i; 00051 const vector<utype> &allGenes = geneMap->GetAllReverse(); 00052 for(i=0; i<geneMap->GetNumSet(); i++) 00053 dest[allGenes[i]] = 0; 00054 return true; 00055 } 00056 00057 bool add_score(vector<float> &src, vector<float> &dest, CSeekIntIntMap *geneMap){ 00058 const vector<utype> &allGenes = geneMap->GetAllReverse(); 00059 utype i, j; 00060 for(i=0; i<geneMap->GetNumSet(); i++){ 00061 utype gi = allGenes[i]; 00062 dest[gi] += src[gi]; 00063 } 00064 return true; 00065 } 00066 00067 bool search_one_dab(vector<float> &gene_score, 00068 CDat &mat, const size_t numGenes, 00069 CSeekIntIntMap &d1, 00070 vector<utype> &indexConvReverse, 00071 vector<float> &q_weight, 00072 vector<vector<float> > &nq_weight 00073 ){ 00074 //q_weight is query presence - 1.0 if gene at the index i is a query gene 00075 00076 vector<float> gene_count; 00077 CSeekTools::InitVector(gene_score, numGenes, (float)CMeta::GetNaN()); 00078 CSeekTools::InitVector(gene_count, numGenes, (float)0); 00079 utype qqi; 00080 utype kk, k; 00081 00082 const vector<utype> &allGenes = d1.GetAllReverse(); 00083 for(kk=0; kk<d1.GetNumSet(); kk++){ 00084 utype k = allGenes[kk]; 00085 utype gi = indexConvReverse[k]; 00086 gene_score[gi] = 0; 00087 } 00088 00089 for(qqi=0; qqi<d1.GetNumSet(); qqi++){ 00090 utype qi = allGenes[qqi]; 00091 utype qq = indexConvReverse[qi]; 00092 if(q_weight[qq]==0) //not a query gene 00093 continue; 00094 //now a query gene 00095 float *vc = mat.GetFullRow(qi); 00096 for(kk=0; kk<d1.GetNumSet(); kk++){ 00097 utype k = allGenes[kk]; 00098 utype gi = indexConvReverse[k]; 00099 float fl = vc[k]; 00100 gene_score[gi] += fl; 00101 gene_count[gi] += 1.0; 00102 } 00103 delete[] vc; 00104 } 00105 00106 for(kk=0; kk<d1.GetNumSet(); kk++){ 00107 utype k = allGenes[kk]; 00108 utype gi = indexConvReverse[k]; 00109 gene_score[gi] /= gene_count[gi]; 00110 } 00111 00112 utype ind; 00113 for(ind=0; ind<nq_weight.size(); ind++){ 00114 vector<float> ngene_count, ngene_score; 00115 CSeekTools::InitVector(ngene_score, numGenes, (float)CMeta::GetNaN()); 00116 CSeekTools::InitVector(ngene_count, numGenes, (float)0); 00117 00118 for(kk=0; kk<d1.GetNumSet(); kk++) 00119 ngene_score[(utype)indexConvReverse[(utype)allGenes[kk]]] = 0; 00120 00121 for(qqi=0; qqi<d1.GetNumSet(); qqi++){ 00122 utype qi = allGenes[qqi]; 00123 utype qq = indexConvReverse[qi]; 00124 if(nq_weight[ind][qq]==0) //not a NOT query gene 00125 continue; 00126 //now a NOT query gene 00127 float *vc = mat.GetFullRow(qi); 00128 for(kk=0; kk<d1.GetNumSet(); kk++){ 00129 utype k = allGenes[kk]; 00130 utype gi = indexConvReverse[k]; 00131 float fl = vc[k]; 00132 ngene_score[gi] += fl; 00133 ngene_count[gi] += 1.0; 00134 } 00135 delete[] vc; 00136 } 00137 for(kk=0; kk<d1.GetNumSet(); kk++){ 00138 utype gi = indexConvReverse[(utype)allGenes[kk]]; 00139 ngene_score[gi] /= ngene_count[gi]; 00140 gene_score[gi] -= ngene_score[gi]; //subtract the correlation to NOT genes 00141 } 00142 } 00143 00144 return true; 00145 } 00146 00147 bool get_score(vector<float> &gene_score, 00148 CSparseFlatMatrix<float> &mat, 00149 CSeekIntIntMap *geneMap, vector<float> &q_weight){ 00150 vector<float> gene_count; 00151 int numGenes = geneMap->GetSize(); 00152 CSeekTools::InitVector(gene_score, numGenes, (float)CMeta::GetNaN()); 00153 CSeekTools::InitVector(gene_count, numGenes, (float)0); 00154 int qi=0; 00155 const vector<utype> &allGenes = geneMap->GetAllReverse(); 00156 utype kk, k; 00157 00158 for(kk=0; kk<geneMap->GetNumSet(); kk++){ 00159 utype gi = allGenes[kk]; 00160 gene_score[gi] = 0; 00161 } 00162 for(qi=0; qi<geneMap->GetNumSet(); qi++){ 00163 utype qq = allGenes[qi]; 00164 if(q_weight[qq]==0) 00165 continue; 00166 const vector<CPair<float> > &vc = mat.GetRow(qq); 00167 for(kk=0; kk<vc.size(); kk++){ 00168 float fl = vc[kk].v; 00169 utype gi = vc[kk].i; 00170 gene_score[gi] += fl * q_weight[qq]; 00171 } 00172 for(kk=0; kk<geneMap->GetNumSet(); kk++){ 00173 utype gi = allGenes[kk]; 00174 gene_count[gi] += q_weight[qq]; 00175 } 00176 /*for(kk=0; kk<geneMap->GetNumSet(); kk++){ 00177 utype gi = allGenes[kk]; 00178 map<utype,float>::iterator it; 00179 float fl = 0; 00180 if((it=mat[qq].find(gi))==mat[qq].end()){ 00181 fl = 0; 00182 }else{ 00183 fl = it->second; 00184 } 00185 //float fl = mat[qq][gi]; 00186 gene_score[gi] += fl * q_weight[qq]; 00187 gene_count[gi] += q_weight[qq]; 00188 }*/ 00189 } 00190 00191 for(k=0; k<geneMap->GetNumSet(); k++){ 00192 utype gi = allGenes[k]; 00193 gene_score[gi] /= gene_count[gi]; 00194 if(gene_count[gi]==0){ 00195 gene_score[gi] = 0; 00196 } 00197 } 00198 return true; 00199 } 00200 00201 bool get_score(vector<float> &gene_score, CFullMatrix<float> &mat, 00202 CSeekIntIntMap *geneMap, vector<float> &q_weight){ 00203 vector<float> gene_count; 00204 int numGenes = geneMap->GetSize(); 00205 CSeekTools::InitVector(gene_score, numGenes, (float)CMeta::GetNaN()); 00206 CSeekTools::InitVector(gene_count, numGenes, (float)0); 00207 int qi=0; 00208 const vector<utype> &allGenes = geneMap->GetAllReverse(); 00209 utype kk, k; 00210 for(kk=0; kk<geneMap->GetNumSet(); kk++){ 00211 utype gi = allGenes[kk]; 00212 gene_score[gi] = 0; 00213 } 00214 for(qi=0; qi<geneMap->GetNumSet(); qi++){ 00215 utype qq = allGenes[qi]; 00216 if(q_weight[qq]==0) 00217 continue; 00218 for(kk=0; kk<geneMap->GetNumSet(); kk++){ 00219 utype gi = allGenes[kk]; 00220 float fl = mat.Get(qq, gi); 00221 gene_score[gi] += fl * q_weight[qq]; 00222 } 00223 for(kk=0; kk<geneMap->GetNumSet(); kk++){ 00224 utype gi = allGenes[kk]; 00225 gene_count[gi] += q_weight[qq]; 00226 } 00227 } 00228 for(k=0; k<geneMap->GetNumSet(); k++){ 00229 utype gi = allGenes[k]; 00230 gene_score[gi] /= gene_count[gi]; 00231 if(gene_count[gi] == 0) 00232 gene_score[gi] = 0; 00233 } 00234 return true; 00235 } 00236 00237 /*bool iterate(vector<float> &src, vector<float> &dest, 00238 vector<vector<float> > &tr, CSeekIntIntMap *geneMap, 00239 float alpha, int numIterations){ 00240 00241 const vector<utype> &allGenes = geneMap->GetAllReverse(); 00242 dest.resize(src.size()); 00243 vector<float> backup; 00244 CSeekTools::InitVector(backup, src.size(), CMeta::GetNaN()); 00245 int i, j, k; 00246 for(i=0; i<dest.size(); i++) 00247 dest[i] = src[i]; 00248 00249 for(i=0; i<numIterations; i++){ 00250 for(j=0; j<geneMap->GetNumSet(); j++){ 00251 utype gi = allGenes[j]; 00252 backup[gi] = dest[gi]; 00253 } 00254 get_score(dest, tr, geneMap, backup); 00255 for(j=0; j<geneMap->GetNumSet(); j++){ 00256 utype gi = allGenes[j]; 00257 dest[gi] = (1.0 - alpha) * src[j] + alpha * dest[gi]; 00258 } 00259 } 00260 return true; 00261 }*/ 00262 00263 //is_query is like a gold-standard gene list 00264 bool weight_fast(vector<char> &is_query, vector<float> &d1, 00265 CSeekIntIntMap *geneMap, float &w){ 00266 utype i; 00267 w = 0; 00268 const vector<utype> &allGenes = geneMap->GetAllReverse(); 00269 for(i=0; i<geneMap->GetNumSet(); i++){ 00270 utype gi = allGenes[i]; 00271 if(is_query[gi]==1){ 00272 //if(CMeta::IsNaN(d1[gi])) continue; 00273 w += d1[gi]; 00274 } 00275 } 00276 return true; 00277 } 00278 00279 bool weight(vector<char> &is_query, vector<float> &d1, 00280 CSeekIntIntMap *geneMap, float rbp_p, float &w){ 00281 vector<AResultFloat> ar; 00282 ar.resize(geneMap->GetNumSet()); 00283 utype i; 00284 const vector<utype> &allGenes = geneMap->GetAllReverse(); 00285 for(i=0; i<geneMap->GetNumSet(); i++){ 00286 utype gi = allGenes[i]; 00287 ar[i].i = gi; 00288 ar[i].f = d1[gi]; 00289 } 00290 int MAX = 1000; 00291 nth_element(ar.begin(), ar.begin()+MAX, ar.end()); 00292 sort(ar.begin(), ar.begin()+MAX); 00293 w = 0; 00294 for(i=0; i<MAX; i++) 00295 if(is_query[ar[i].i]==1) 00296 w += pow(rbp_p, i); 00297 w *= (1.0 - rbp_p); 00298 return true; 00299 } 00300 00301 bool cv_weight_LOO(vector<utype> &query, CSparseFlatMatrix<float> &mat, 00302 CSeekIntIntMap *geneMap, float rbp_p, float &tot_w){ 00303 //leave one out cross-validation 00304 utype i, j; 00305 int numGenes = geneMap->GetSize(); 00306 tot_w = 0; 00307 00308 for(i=0; i<query.size(); i++){ 00309 vector<char> is_query; 00310 vector<float> q_weight; 00311 float w = 0; 00312 CSeekTools::InitVector(is_query, numGenes, (char) 0); 00313 CSeekTools::InitVector(q_weight, numGenes, (float) 0); 00314 for(j=0; j<query.size(); j++){ 00315 if(i==j) continue; 00316 q_weight[query[j]] = 1.0; 00317 } 00318 is_query[query[i]] = 1; 00319 vector<float> gene_score; 00320 get_score(gene_score, mat, geneMap, q_weight); 00321 for(j=0; j<query.size(); j++){ 00322 if(i==j) continue; 00323 gene_score[query[j]] = 0; 00324 } 00325 //weight_fast(is_query, gene_score, geneMap, w); 00326 weight(is_query, gene_score, geneMap, rbp_p, w); 00327 tot_w += w; 00328 } 00329 tot_w /= query.size(); 00330 return true; 00331 } 00332 00333 bool cv_weight(vector<utype> &query, CSparseFlatMatrix<float> &mat, 00334 CSeekIntIntMap *geneMap, float rbp_p, float &tot_w){ 00335 //leave one in cross-validation 00336 utype i, j; 00337 int numGenes = geneMap->GetSize(); 00338 tot_w = 0; 00339 00340 for(i=0; i<query.size(); i++){ 00341 vector<char> is_query; 00342 vector<float> q_weight; 00343 CSeekTools::InitVector(is_query, numGenes, (char) 0); 00344 CSeekTools::InitVector(q_weight, numGenes, (float) 0); 00345 q_weight[query[i]] = 1.0; 00346 float w = 0; 00347 for(j=0; j<query.size(); j++){ 00348 if(i==j) continue; 00349 is_query[query[j]] = 1; 00350 } 00351 vector<float> gene_score; 00352 get_score(gene_score, mat, geneMap, q_weight); 00353 gene_score[query[i]] = 0; 00354 weight(is_query, gene_score, geneMap, rbp_p, w); 00355 //fprintf(stderr, "Q%d %.3f\n", i, w); 00356 //weight_fast(is_query, gene_score, geneMap, w); 00357 tot_w += w; 00358 } 00359 tot_w /= query.size(); 00360 return true; 00361 } 00362 00363 //accepts fullmatrix, leave one in cross-validation 00364 bool cv_weight(vector<utype> &query, CFullMatrix<float> &mat, 00365 CSeekIntIntMap *geneMap, float rbp_p, float &tot_w){ 00366 utype i, j; 00367 int numGenes = geneMap->GetSize(); 00368 tot_w = 0; 00369 for(i=0; i<query.size(); i++){ 00370 vector<char> is_query; 00371 vector<float> q_weight; 00372 CSeekTools::InitVector(is_query, numGenes, (char) 0); 00373 CSeekTools::InitVector(q_weight, numGenes, (float) 0); 00374 q_weight[query[i]] = 1.0; 00375 float w = 0; 00376 for(j=0; j<query.size(); j++){ 00377 if(i==j) continue; 00378 is_query[query[j]] = 1; 00379 } 00380 vector<float> gene_score; 00381 get_score(gene_score, mat, geneMap, q_weight); 00382 gene_score[query[i]] = 0; 00383 weight(is_query, gene_score, geneMap, rbp_p, w); 00384 tot_w += w; 00385 } 00386 tot_w /= query.size(); 00387 return true; 00388 } 00389 00390 int main(int iArgs, char **aszArgs){ 00391 static const size_t c_iBuffer = 1024; 00392 char acBuffer[c_iBuffer]; 00393 00394 gengetopt_args_info sArgs; 00395 ifstream ifsm; 00396 istream *pistm; 00397 vector<string> vecstrLine, vecstrGenes, vecstrDBs, vecstrQuery; 00398 utype i, qi, j, k, l, kk; 00399 00400 if(cmdline_parser(iArgs, aszArgs, &sArgs)){ 00401 cmdline_parser_print_help(); 00402 return 1; 00403 } 00404 00405 if( sArgs.input_arg ) { 00406 ifsm.open( sArgs.input_arg ); 00407 pistm = &ifsm; 00408 }else 00409 pistm = &cin; 00410 00411 map<string, size_t> mapstriGenes; 00412 while( !pistm->eof( ) ) { 00413 pistm->getline( acBuffer, c_iBuffer - 1 ); 00414 acBuffer[ c_iBuffer - 1 ] = 0; 00415 vecstrLine.clear( ); 00416 CMeta::Tokenize( acBuffer, vecstrLine ); 00417 if( vecstrLine.size( ) < 2 ) { 00418 //cerr << "Ignoring line: " << acBuffer << endl; 00419 continue; 00420 } 00421 if( !( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) ) { 00422 cerr << "Illegal gene ID: " << vecstrLine[ 0 ] << " for " 00423 << vecstrLine[ 1 ] << endl; 00424 return 1; 00425 } 00426 i--; 00427 if( vecstrGenes.size( ) <= i ) 00428 vecstrGenes.resize( i + 1 ); 00429 vecstrGenes[ i ] = vecstrLine[ 1 ]; 00430 mapstriGenes[vecstrGenes[i]] = i; 00431 } 00432 00433 fprintf(stderr, "Finished reading gene map\n"); 00434 00435 string dab_dir = sArgs.dab_dir_arg; 00436 string output_dir = sArgs.dir_out_arg; 00437 int numGenes = vecstrGenes.size(); 00438 vector< vector<string> > vecstrAllQuery; 00439 fprintf(stderr, "Reading queries\n"); 00440 if(!CSeekTools::ReadMultipleQueries(sArgs.query_arg, vecstrAllQuery)) 00441 return -1; 00442 fprintf(stderr, "Finished reading queries\n"); 00443 00444 vector<vector<utype> > qu; 00445 qu.resize(vecstrAllQuery.size()); 00446 for(i=0; i<vecstrAllQuery.size(); i++){ 00447 qu[i] = vector<utype>(); 00448 for(j=0; j<vecstrAllQuery[i].size(); j++) 00449 qu[i].push_back(mapstriGenes[vecstrAllQuery[i][j]]); 00450 } 00451 00452 vector<vector<vector<string> > > vecstrAllNotQuery; 00453 vecstrAllNotQuery.resize(vecstrAllQuery.size()); 00454 string not_query = sArgs.not_query_arg; 00455 if(not_query!="NA"){ 00456 fprintf(stderr, "Reading NOT queries\n"); 00457 if(!CSeekTools::ReadMultipleNotQueries(sArgs.not_query_arg, 00458 vecstrAllNotQuery)) 00459 return -1; 00460 } 00461 00462 if(sArgs.visualize_flag==1){ 00463 string dab_base = sArgs.dab_basename_arg; 00464 string file1 = dab_dir + "/" + dab_base + ".dab"; 00465 float cutoff_par = sArgs.cutoff_arg; 00466 string genome = sArgs.genome_arg; 00467 vector<string> s1, s2; 00468 CSeekTools::ReadListTwoColumns(genome.c_str(), s1, s2); 00469 00470 CGenome g; 00471 g.Open(s1); 00472 for(i=0; i<s2.size(); i++){ 00473 CGene &g1 = g.GetGene(g.FindGene(s1[i])); 00474 g.AddSynonym(g1, s2[i]); 00475 } 00476 00477 CDat CD; 00478 CD.Open(file1.c_str(), false, 2, false, false, false); 00479 00480 CSeekIntIntMap d1(CD.GetGenes()); 00481 vector<utype> indexConvReverse; 00482 CSeekTools::InitVector(indexConvReverse, vecstrGenes.size(), (utype) -1); 00483 for(i=0; i<CD.GetGenes(); i++){ 00484 map<string,size_t>::iterator it = mapstriGenes.find(CD.GetGene(i)); 00485 if(it==mapstriGenes.end()) continue; 00486 indexConvReverse[i] = it->second; 00487 d1.Add(i); 00488 } 00489 00490 //Visualize 00491 for(j=0; j<vecstrAllQuery.size(); j++){ 00492 vector<string> vec_s; 00493 vector<utype> vec_g; 00494 for(k=0; k<vecstrAllQuery[j].size(); k++){ 00495 size_t ind = CD.GetGeneIndex(vecstrAllQuery[j][k]); 00496 if(ind==(size_t)-1) continue; 00497 vec_g.push_back(CD.GetGeneIndex(vecstrAllQuery[j][k])); 00498 vec_s.push_back(vecstrAllQuery[j][k]); 00499 } 00500 CDat V; 00501 V.Open(vec_s); 00502 for(k=0; k<vec_s.size(); k++){ 00503 for(l=k+1; l<vec_s.size(); l++){ 00504 V.Set(k, l, CD.Get(vec_g[k], vec_g[l])); 00505 } 00506 } 00507 fprintf(stderr, "Query %d\n", j); 00508 00509 if(sArgs.print_distr_flag==1){ 00510 float min = 9999; 00511 float max = -1; 00512 for(k=0; k<vec_s.size(); k++){ 00513 for(l=k+1; l<vec_s.size(); l++){ 00514 float v = CD.Get(vec_g[k], vec_g[l]); 00515 if(CMeta::IsNaN(v)) continue; 00516 if(v>max) max = v; 00517 if(v<min) min = v; 00518 } 00519 } 00520 float init = min; 00521 float step = (max - min)/100.0; 00522 float upper = max; 00523 float cutoff = init; 00524 while(cutoff<upper){ 00525 int count = 0; 00526 for(k=0; k<vec_s.size(); k++){ 00527 for(l=k+1; l<vec_s.size(); l++){ 00528 if(!CMeta::IsNaN(V.Get(k,l)) && V.Get(k,l)>=cutoff){ 00529 count++; 00530 } 00531 } 00532 } 00533 fprintf(stderr, "%.5f\t%d\n", cutoff, count); 00534 cutoff+=step; 00535 } 00536 } 00537 00538 sprintf(acBuffer, "%s/%d.dot", output_dir.c_str(), j); 00539 ofstream ot(acBuffer); 00540 V.SaveDOT(ot, cutoff_par, &g, false, true, NULL, NULL); 00541 } 00542 00543 } 00544 00545 if(sArgs.combined_flag==1){ 00546 string dab_base = sArgs.dab_basename_arg; 00547 string file1 = dab_dir + "/" + dab_base + ".dab"; 00548 00549 vector<vector<float> > q_weight; 00550 vector<vector<vector<float> > > nq_weight; 00551 00552 q_weight.resize(vecstrAllQuery.size()); 00553 for(i=0; i<vecstrAllQuery.size(); i++){ 00554 CSeekTools::InitVector(q_weight[i], numGenes, (float) 0); 00555 for(j=0; j<vecstrAllQuery[i].size(); j++) 00556 q_weight[i][mapstriGenes[vecstrAllQuery[i][j]]] = 1; 00557 } 00558 00559 nq_weight.resize(vecstrAllNotQuery.size()); 00560 if(not_query!="NA"){ 00561 for(i=0; i<vecstrAllNotQuery.size(); i++){ 00562 nq_weight[i].resize(vecstrAllNotQuery[i].size()); 00563 for(j=0; j<vecstrAllNotQuery[i].size(); j++){ 00564 CSeekTools::InitVector(nq_weight[i][j], numGenes, (float) 0); 00565 for(k=0; k<vecstrAllNotQuery[i][j].size(); k++) 00566 nq_weight[i][j][mapstriGenes[vecstrAllNotQuery[i][j][k]]] = 1; 00567 } 00568 } 00569 } 00570 00571 fprintf(stderr, "Start reading dab\n"); 00572 CDat CD; 00573 CD.Open(file1.c_str(), false, 2, false, false, false); 00574 fprintf(stderr, "Finished reading dab\n"); 00575 00576 CSeekIntIntMap d1(CD.GetGenes()); 00577 vector<utype> indexConvReverse; 00578 CSeekTools::InitVector(indexConvReverse, vecstrGenes.size(), (utype) -1); 00579 for(i=0; i<CD.GetGenes(); i++){ 00580 map<string,size_t>::iterator it = mapstriGenes.find(CD.GetGene(i)); 00581 if(it==mapstriGenes.end()) continue; 00582 indexConvReverse[i] = it->second; 00583 d1.Add(i); 00584 } 00585 00586 vector<vector<float> > final_score; 00587 final_score.resize(vecstrAllQuery.size()); 00588 for(j=0; j<vecstrAllQuery.size(); j++) 00589 CSeekTools::InitVector(final_score[j], vecstrGenes.size(), 00590 (float)CMeta::GetNaN()); 00591 00592 const vector<utype> &allGenes = d1.GetAllReverse(); 00593 for(j=0; j<vecstrAllQuery.size(); j++){ 00594 vector<float> tmp_score; 00595 search_one_dab(tmp_score, CD, vecstrGenes.size(), d1, indexConvReverse, 00596 q_weight[j], nq_weight[j]); 00597 for(kk=0; kk<d1.GetNumSet(); kk++){ 00598 utype k = allGenes[kk]; 00599 utype gi = indexConvReverse[k]; 00600 final_score[j][gi] = tmp_score[gi]; 00601 } 00602 } 00603 fprintf(stderr, "Writing output\n"); 00604 00605 for(j=0; j<vecstrAllQuery.size(); j++){ 00606 for(k=0; k<final_score[j].size(); k++){ 00607 if(CMeta::IsNaN(final_score[j][k])){ 00608 final_score[j][k] = -320; 00609 continue; 00610 } 00611 } 00612 sprintf(acBuffer, "%s/%d.query", output_dir.c_str(), j); 00613 CSeekTools::WriteArrayText(acBuffer, vecstrAllQuery[j]); 00614 sprintf(acBuffer, "%s/%d.gscore", output_dir.c_str(), j); 00615 CSeekTools::WriteArray(acBuffer, final_score[j]); 00616 } 00617 00618 fprintf(stderr, "Finished with search\n"); 00619 if(sArgs.print_distr_flag==0 && sArgs.generate_dot_flag==0){ 00620 return 0; 00621 } 00622 00623 float cutoff_par = sArgs.cutoff_arg; 00624 string genome = sArgs.genome_arg; 00625 vector<string> s1, s2; 00626 CSeekTools::ReadListTwoColumns(genome.c_str(), s1, s2); 00627 00628 CGenome g; 00629 g.Open(s1); 00630 for(i=0; i<s2.size(); i++){ 00631 CGene &g1 = g.GetGene(g.FindGene(s1[i])); 00632 g.AddSynonym(g1, s2[i]); 00633 } 00634 00635 //Visualize 00636 for(j=0; j<vecstrAllQuery.size(); j++){ 00637 //fprintf(stderr, "Query %d\n", j); 00638 vector<AResultFloat> ar; 00639 ar.resize(final_score[j].size()); 00640 for(k=0; k<final_score[j].size(); k++){ 00641 ar[k].i = k; 00642 ar[k].f = final_score[j][k]; 00643 } 00644 sort(ar.begin(), ar.end()); 00645 vector<utype> vec_g; 00646 vector<string> vec_s; 00647 vector<float> node_w; 00648 int FIRST = sArgs.top_genes_arg; 00649 for(k=0; k<FIRST; k++){ 00650 if(ar[k].f==-320) break; 00651 vec_g.push_back(CD.GetGeneIndex(vecstrGenes[ar[k].i])); 00652 vec_s.push_back(vecstrGenes[ar[k].i]); 00653 node_w.push_back(0); 00654 } 00655 //if(vec_g.size()!=FIRST) continue; 00656 for(k=0; k<vecstrAllQuery[j].size(); k++){ 00657 size_t ind = CD.GetGeneIndex(vecstrAllQuery[j][k]); 00658 if(ind==(size_t)-1) continue; 00659 vec_g.push_back(ind); 00660 vec_s.push_back(vecstrAllQuery[j][k]); 00661 node_w.push_back(1.0); 00662 } 00663 00664 CDat V; 00665 V.Open(vec_s); 00666 for(k=0; k<vec_s.size(); k++){ 00667 for(l=k+1; l<vec_s.size(); l++){ 00668 float v = CD.Get(vec_g[k], vec_g[l]); 00669 V.Set(k, l, v); 00670 } 00671 } 00672 00673 if(sArgs.print_distr_flag==1){ 00674 float min = 9999; 00675 float max = -1; 00676 for(k=0; k<vec_s.size(); k++){ 00677 for(l=k+1; l<vec_s.size(); l++){ 00678 float v = CD.Get(vec_g[k], vec_g[l]); 00679 if(CMeta::IsNaN(v)) continue; 00680 if(v>max) max = v; 00681 if(v<min) min = v; 00682 } 00683 } 00684 00685 float init = min; 00686 float step = (max - min)/100.0; 00687 float upper = max; 00688 00689 float cutoff = init; 00690 while(cutoff<upper){ 00691 int count = 0; 00692 for(k=0; k<vec_s.size(); k++){ 00693 for(l=k+1; l<vec_s.size(); l++){ 00694 if(!CMeta::IsNaN(V.Get(k,l)) && V.Get(k,l)>=cutoff){ 00695 count++; 00696 } 00697 } 00698 } 00699 fprintf(stderr, "%.5f\t%d\n", cutoff, count); 00700 cutoff+=step; 00701 } 00702 } 00703 00704 if(sArgs.generate_dot_flag==1){ 00705 sprintf(acBuffer, "%s/%d.dot", output_dir.c_str(), j); 00706 ofstream ot(acBuffer); 00707 V.SaveDOT(ot, cutoff_par, &g, false, true, &node_w, NULL); 00708 //V.SaveDOT(ot, 0.0001, NULL, true, false, NULL, NULL); 00709 } 00710 } 00711 00712 } 00713 00714 if(sArgs.test_flag==1){ 00715 string dab_base = sArgs.dab_basename_arg; 00716 string file1 = dab_dir + "/" + dab_base + ".half"; 00717 00718 vector<vector<float> > q_weight; 00719 q_weight.resize(vecstrAllQuery.size()); 00720 for(i=0; i<vecstrAllQuery.size(); i++){ 00721 CSeekTools::InitVector(q_weight[i], numGenes, (float) 0); 00722 for(j=0; j<vecstrAllQuery[i].size(); j++) 00723 q_weight[i][mapstriGenes[vecstrAllQuery[i][j]]] = 1; 00724 } 00725 00726 CHalfMatrix<float> res; 00727 res.Initialize(vecstrGenes.size()); 00728 ifstream istm1; 00729 uint32_t dim; 00730 istm1.open(file1.c_str(), ios_base::binary); 00731 istm1.read((char*)(&dim), sizeof(dim)); 00732 float *adScores = new float[dim-1]; 00733 for(i=0; (i+1)<dim; i++){ 00734 istm1.read((char*)adScores, sizeof(*adScores) * (dim - i - 1)); 00735 res.Set(i, adScores); 00736 } 00737 delete[] adScores; 00738 istm1.close(); 00739 00740 string s1 = "uc003vor.2"; 00741 vector<string> r1; 00742 r1.push_back("uc003vos.2"); 00743 r1.push_back("uc003vop.1"); 00744 r1.push_back("uc011jwz.1"); 00745 r1.push_back("uc002rgw.1"); 00746 00747 for(i=0; i<r1.size(); i++){ 00748 size_t i1 = mapstriGenes[s1]; 00749 size_t j1 = mapstriGenes[r1[i]]; 00750 fprintf(stderr, "%s %s %.5f\n", s1.c_str(), r1[i].c_str(), res.Get(i1, j1)); 00751 } 00752 } 00753 00754 if(sArgs.testcount_flag==1){ 00755 string dab_base = sArgs.dab_basename_arg; 00756 string file1 = dab_dir + "/" + dab_base + ".pair_count"; 00757 00758 vector<vector<float> > q_weight; 00759 q_weight.resize(vecstrAllQuery.size()); 00760 for(i=0; i<vecstrAllQuery.size(); i++){ 00761 CSeekTools::InitVector(q_weight[i], numGenes, (float) 0); 00762 for(j=0; j<vecstrAllQuery[i].size(); j++) 00763 q_weight[i][mapstriGenes[vecstrAllQuery[i][j]]] = 1; 00764 } 00765 00766 CHalfMatrix<unsigned short> res; 00767 res.Initialize(vecstrGenes.size()); 00768 ifstream istm1; 00769 uint32_t dim; 00770 istm1.open(file1.c_str(), ios_base::binary); 00771 istm1.read((char*)(&dim), sizeof(dim)); 00772 unsigned short *adScores = new unsigned short[dim-1]; 00773 for(i=0; (i+1)<dim; i++){ 00774 istm1.read((char*)adScores, sizeof(*adScores) * (dim - i - 1)); 00775 res.Set(i, adScores); 00776 } 00777 delete[] adScores; 00778 istm1.close(); 00779 00780 string s1 = "uc003vor.2"; 00781 vector<string> r1; 00782 r1.push_back("uc003vos.2"); 00783 r1.push_back("uc003vop.1"); 00784 r1.push_back("uc011jwz.1"); 00785 r1.push_back("uc002rgw.1"); 00786 00787 for(i=0; i<r1.size(); i++){ 00788 size_t i1 = mapstriGenes[s1]; 00789 size_t j1 = mapstriGenes[r1[i]]; 00790 fprintf(stderr, "%s %s %d\n", s1.c_str(), r1[i].c_str(), res.Get(i1, j1)); 00791 } 00792 } 00793 00794 if(sArgs.testcombined_flag==1){ 00795 string dab_base = sArgs.dab_basename_arg; 00796 string file3 = dab_dir + "/" + dab_base + ".gene_count"; 00797 string file2 = dab_dir + "/" + dab_base + ".pair_count"; 00798 string file1 = dab_dir + "/" + dab_base + ".half"; 00799 00800 vector<utype> gene_count; 00801 CSeekTools::ReadArray(file3.c_str(), gene_count); 00802 00803 float max_count = *max_element(gene_count.begin(), gene_count.end()); 00804 float min_count_required = max_count*0.50; 00805 CSeekIntIntMap d1(vecstrGenes.size()); 00806 vector<string> validGenes; 00807 for(i=0; i<vecstrGenes.size(); i++){ 00808 if(gene_count[i]<min_count_required) continue; 00809 validGenes.push_back(vecstrGenes[i]); 00810 d1.Add(i); 00811 } 00812 00813 CDat CD; 00814 CD.Open(validGenes); 00815 for(i=0; i<validGenes.size(); i++){ 00816 for(j=i+1; j<validGenes.size(); j++){ 00817 CD.Set(i,j,CMeta::GetNaN()); 00818 } 00819 } 00820 //CHalfMatrix<float> res; 00821 //res.Initialize(vecstrGenes.size()); 00822 ifstream istm1, istm2; 00823 uint32_t dim; 00824 istm1.open(file1.c_str(), ios_base::binary); 00825 istm1.read((char*)(&dim), sizeof(dim)); 00826 istm2.open(file2.c_str(), ios_base::binary); 00827 istm2.read((char*)(&dim), sizeof(dim)); 00828 float *adScores = new float[dim-1]; 00829 unsigned short *adScores2 = new unsigned short[dim-1]; 00830 for(i=0; (i+1)<dim; i++){ 00831 istm1.read((char*)adScores, sizeof(*adScores) * (dim - i - 1)); 00832 istm2.read((char*)adScores2, sizeof(*adScores2) * (dim - i - 1)); 00833 if(i%2000==0) 00834 fprintf(stderr, "%d Finished\n", i); 00835 utype gi = d1.GetForward(i); 00836 if(CSeekTools::IsNaN(gi)) continue; 00837 00838 for(j=0; j<dim-i-1; j++){ 00839 utype gj = d1.GetForward(j+i+1); 00840 if(CSeekTools::IsNaN(gj) || 00841 (float) adScores2[j]<min_count_required) continue; 00842 adScores[j] = adScores[j] / (float) adScores2[j]; 00843 CD.Set(gi, gj, adScores[j]); 00844 } 00845 00846 //res.Set(i, adScores); 00847 } 00848 delete[] adScores; 00849 delete[] adScores2; 00850 istm1.close(); 00851 istm2.close(); 00852 00853 string s1 = "uc003vor.2"; 00854 vector<string> r1; 00855 r1.push_back("uc003vos.2"); 00856 r1.push_back("uc003vop.1"); 00857 r1.push_back("uc011jwz.1"); 00858 r1.push_back("uc002rgw.1"); 00859 00860 for(i=0; i<r1.size(); i++){ 00861 //size_t i1 = mapstriGenes[s1]; 00862 //size_t j1 = mapstriGenes[r1[i]]; 00863 size_t i1 = CD.GetGeneIndex(s1); 00864 size_t j1 = CD.GetGeneIndex(r1[i]); 00865 fprintf(stderr, "%s %s %.5f\n", s1.c_str(), r1[i].c_str(), CD.Get(i1, j1)); 00866 } 00867 } 00868 00869 //Traditional DAB mode (.dab file) 00870 if(sArgs.tdab_flag==1){ 00871 string search_mode = sArgs.tsearch_mode_arg; 00872 if(search_mode=="NA"){ 00873 fprintf(stderr, "Please specify a search mode!\n"); 00874 return 1; 00875 } 00876 vector<string> dab_list; 00877 CSeekTools::ReadListOneColumn(sArgs.tdab_list_arg, dab_list); 00878 00879 fprintf(stderr, "Finished reading dablist\n"); 00880 //preparing query 00881 vector<vector<float> > q_weight; 00882 q_weight.resize(vecstrAllQuery.size()); 00883 for(i=0; i<vecstrAllQuery.size(); i++){ 00884 CSeekTools::InitVector(q_weight[i], numGenes, (float) 0); 00885 for(j=0; j<vecstrAllQuery[i].size(); j++) 00886 q_weight[i][mapstriGenes[vecstrAllQuery[i][j]]] = 1; 00887 } 00888 00889 //preparing query2 00890 vector<vector<unsigned int> > qq; 00891 qq.resize(vecstrAllQuery.size()); 00892 for(i=0; i<vecstrAllQuery.size(); i++){ 00893 qq[i] = vector<unsigned int>(); 00894 for(j=0; j<vecstrAllQuery[i].size(); j++) 00895 qq[i].push_back(mapstriGenes[vecstrAllQuery[i][j]]); 00896 } 00897 00898 //selected datasets for each query 00899 vector<vector<char> > selectedDataset; 00900 selectedDataset.resize(vecstrAllQuery.size()); 00901 for(i=0; i<vecstrAllQuery.size(); i++) 00902 CSeekTools::InitVector(selectedDataset[i], dab_list.size(), (char)0); 00903 00904 fprintf(stderr, "Reading DAB\n"); 00905 vector<vector<float> > final_score, count, dweight; 00906 vector<vector<int> > freq; 00907 final_score.resize(vecstrAllQuery.size()); 00908 count.resize(vecstrAllQuery.size()); 00909 freq.resize(vecstrAllQuery.size()); 00910 dweight.resize(vecstrAllQuery.size()); 00911 for(j=0; j<vecstrAllQuery.size(); j++){ 00912 CSeekTools::InitVector(final_score[j], vecstrGenes.size(), (float)CMeta::GetNaN()); 00913 CSeekTools::InitVector(count[j], vecstrGenes.size(), (float) 0); 00914 CSeekTools::InitVector(freq[j], vecstrGenes.size(), (int) 0); 00915 CSeekTools::InitVector(dweight[j], dab_list.size(), (float) 0); 00916 } 00917 00918 float threshold_g = sArgs.threshold_g_arg; 00919 float threshold_q = sArgs.threshold_q_arg; 00920 bool DEBUG = !!sArgs.debug_flag; 00921 00922 size_t l; 00923 for(l=0; l<dab_list.size(); l++){ 00924 CDat Dat; 00925 fprintf(stderr, "Reading %d: %s\n", l, dab_list[l].c_str()); 00926 string dabfile = dab_dir + "/" + dab_list[l]; 00927 Dat.Open(dabfile.c_str(), false, 2, false, false, false); 00928 00929 fprintf(stderr, "Finished reading DAB\n"); 00930 vector<unsigned int> veciGenes; 00931 veciGenes.resize(vecstrGenes.size()); 00932 unsigned int ki; 00933 for(ki=0; ki<vecstrGenes.size(); ki++) 00934 veciGenes[ki] = (unsigned int) Dat.GetGeneIndex(vecstrGenes[ki]); 00935 unsigned int s,t; 00936 float d; 00937 CSeekIntIntMap m(vecstrGenes.size()); 00938 for(i=0; i<vecstrGenes.size(); i++){ 00939 if((s=veciGenes[i])==(unsigned int)-1) continue; 00940 m.Add(i); 00941 } 00942 00943 //Copy to matrix sm 00944 CFullMatrix<float> sm; 00945 sm.Initialize(vecstrGenes.size(), vecstrGenes.size()); 00946 const vector<utype> &allRGenes = m.GetAllReverse(); 00947 for(i=0; i<m.GetNumSet(); i++){ 00948 unsigned int si = allRGenes[i]; 00949 s = veciGenes[si]; 00950 for(j=i+1; j<m.GetNumSet(); j++){ 00951 unsigned int tj = allRGenes[j]; 00952 t = veciGenes[allRGenes[j]]; 00953 if(CMeta::IsNaN(d = Dat.Get(s,t))){ 00954 sm.Set(si, tj, 0); 00955 sm.Set(tj, si, 0); 00956 }else{ 00957 sm.Set(si, tj, d); 00958 sm.Set(tj, si, d); 00959 } 00960 } 00961 sm.Set(si, si, 0); 00962 } 00963 00964 fprintf(stderr, "Finished copying matrix\n"); 00965 00966 for(j=0; j<vecstrAllQuery.size(); j++){ 00967 int numPresent = 0; 00968 for(k=0; k<qq[j].size(); k++){ 00969 if(m.GetForward(qq[j][k])==(unsigned int)-1) continue; 00970 numPresent++; 00971 } 00972 if(search_mode=="eq" && numPresent==0){ 00973 continue; 00974 }else if(search_mode=="cv_loi" && numPresent<=1){ 00975 continue; 00976 } 00977 int numThreshold = (int) (threshold_q * qq[j].size()); 00978 if(numPresent>numThreshold) 00979 selectedDataset[j][l] = 1; 00980 } 00981 00982 for(j=0; j<vecstrAllQuery.size(); j++){ 00983 //not enough query genes present 00984 if(selectedDataset[j][l]==0) continue; 00985 00986 float dw = 1.0; 00987 if(search_mode=="eq"){ 00988 dw = 1.0; 00989 }else{ //cv_loi, rbp_p = 0.99 00990 cv_weight(qu[j], sm, &m, 0.99, dw); 00991 } 00992 dweight[j][l] = dw; 00993 vector<float> tmp_score; 00994 get_score(tmp_score, sm, &m, q_weight[j]); 00995 for(k=0; k<m.GetNumSet(); k++){ 00996 utype gi = allRGenes[k]; 00997 if(CMeta::IsNaN(final_score[j][gi])) 00998 final_score[j][gi] = 0; 00999 final_score[j][gi] += tmp_score[gi] * dw; 01000 count[j][gi]+=dw; 01001 freq[j][gi]++; 01002 } 01003 } 01004 01005 } 01006 01007 for(j=0; j<vecstrAllQuery.size(); j++){ 01008 01009 int countSelected = 0; 01010 for(k=0; k<selectedDataset[j].size(); k++) 01011 countSelected+=selectedDataset[j][k]; 01012 01013 //int minRequired = (int) ((float) dab_list.size() * 0.50); 01014 int minRequired = (int) ((float) countSelected * threshold_g); 01015 01016 if(DEBUG){ 01017 int nG = 0; 01018 for(k=0; k<final_score[j].size(); k++){ 01019 if(freq[j][k]>=minRequired){ 01020 nG++; 01021 } 01022 } 01023 fprintf(stderr, "Query %d numSelectedDataset %d numGenesIntegrated %d\n", j, countSelected, nG); 01024 } 01025 01026 for(k=0; k<final_score[j].size(); k++){ 01027 if(freq[j][k]<minRequired){ 01028 final_score[j][k] = -320; 01029 continue; 01030 } 01031 if(CMeta::IsNaN(final_score[j][k])){ 01032 final_score[j][k] = -320; 01033 continue; 01034 } 01035 final_score[j][k] /= count[j][k]; 01036 } 01037 sprintf(acBuffer, "%s/%d.query", output_dir.c_str(), j); 01038 CSeekTools::WriteArrayText(acBuffer, vecstrAllQuery[j]); 01039 sprintf(acBuffer, "%s/%d.gscore", output_dir.c_str(), j); 01040 CSeekTools::WriteArray(acBuffer, final_score[j]); 01041 sprintf(acBuffer, "%s/%d.dweight", output_dir.c_str(), j); 01042 CSeekTools::WriteArray(acBuffer, dweight[j]); 01043 } 01044 return 0; 01045 } 01046 01047 01048 //DAB mode (sparse DAB: .2.dab) 01049 if(sArgs.dab_flag==1){ 01050 float threshold_g = sArgs.threshold_g_arg; 01051 float threshold_q = sArgs.threshold_q_arg; 01052 bool DEBUG = !!sArgs.debug_flag; 01053 01054 string search_mode = sArgs.tsearch_mode_arg; 01055 if(search_mode=="NA"){ 01056 fprintf(stderr, "Please specify a search mode!\n"); 01057 return 1; 01058 } 01059 01060 string norm_mode = sArgs.norm_mode_arg; 01061 if(sArgs.default_type_arg!=0 && sArgs.default_type_arg!=1){ 01062 fprintf(stderr, "Error, invalid type!\n"); 01063 return -1; 01064 } 01065 01066 if(norm_mode=="NA"){ 01067 fprintf(stderr, "Error, please specify a norm mode!\n"); 01068 return -1; 01069 } 01070 01071 float exp = sArgs.exp_arg; 01072 if(norm_mode=="rank"){ 01073 if(sArgs.max_rank_arg==-1){ 01074 fprintf(stderr, "Error, please supply the max rank flag.\n"); 01075 return -1; 01076 } 01077 if(sArgs.rbp_p_arg==-1){ 01078 fprintf(stderr, "Error, please supply the rbp_p flag.\n"); 01079 return -1; 01080 } 01081 }else if(norm_mode=="subtract_z"){ 01082 if(exp==-1){ 01083 fprintf(stderr, "Invalid exponent!\n"); 01084 return -1; 01085 } 01086 if(sArgs.rbp_p_arg==-1){ 01087 fprintf(stderr, "Error, please supply the rbp_p flag.\n"); 01088 return -1; 01089 } 01090 } 01091 01092 vector<float> score_cutoff; 01093 bool bDatasetCutoff = false; 01094 string dset_cutoff_file = sArgs.dset_cutoff_file_arg; 01095 if(dset_cutoff_file!="NA"){ 01096 CSeekTools::ReadArray(dset_cutoff_file.c_str(), score_cutoff); 01097 bDatasetCutoff = true; 01098 fprintf(stderr, "Filtered mode is on!\n"); 01099 } 01100 01101 float rbp_p = sArgs.rbp_p_arg; 01102 int max_rank = sArgs.max_rank_arg; 01103 int num_iter = sArgs.num_iter_arg; 01104 vector<string> dab_list; 01105 CSeekTools::ReadListOneColumn(sArgs.dab_list_arg, dab_list); 01106 vector<CSeekIntIntMap*> dm; 01107 dm.resize(dab_list.size()); 01108 01109 //selected datasets for each query 01110 vector<vector<char> > selectedDataset; 01111 selectedDataset.resize(vecstrAllQuery.size()); 01112 for(i=0; i<vecstrAllQuery.size(); i++) 01113 CSeekTools::InitVector(selectedDataset[i], dab_list.size(), (char)0); 01114 01115 //MODE 1 - Normal search: 01116 //preparing query 01117 vector<vector<float> > q_weight; 01118 q_weight.resize(vecstrAllQuery.size()); 01119 for(i=0; i<vecstrAllQuery.size(); i++){ 01120 CSeekTools::InitVector(q_weight[i], numGenes, (float) 0); 01121 for(j=0; j<vecstrAllQuery[i].size(); j++) 01122 q_weight[i][mapstriGenes[vecstrAllQuery[i][j]]] = 1; 01123 } 01124 01125 //preparing query2 01126 vector<vector<unsigned int> > qq; 01127 qq.resize(vecstrAllQuery.size()); 01128 for(i=0; i<vecstrAllQuery.size(); i++){ 01129 qq[i] = vector<unsigned int>(); 01130 for(j=0; j<vecstrAllQuery[i].size(); j++) 01131 qq[i].push_back(mapstriGenes[vecstrAllQuery[i][j]]); 01132 } 01133 01134 fprintf(stderr, "Reading sparse DAB\n"); 01135 vector<vector<float> > final_score, count; 01136 vector<vector<float> > dweight; 01137 vector<vector<int> > freq; 01138 final_score.resize(vecstrAllQuery.size()); 01139 count.resize(vecstrAllQuery.size()); 01140 freq.resize(vecstrAllQuery.size()); 01141 dweight.resize(vecstrAllQuery.size()); 01142 for(j=0; j<vecstrAllQuery.size(); j++){ 01143 CSeekTools::InitVector(final_score[j], vecstrGenes.size(), (float)CMeta::GetNaN()); 01144 CSeekTools::InitVector(count[j], vecstrGenes.size(), (float) 0); 01145 CSeekTools::InitVector(freq[j], vecstrGenes.size(), (int) 0); 01146 CSeekTools::InitVector(dweight[j], dab_list.size(), (float) 0); 01147 } 01148 01149 if(bDatasetCutoff) fprintf(stderr, "Dataset cutoff is on!\n"); 01150 else fprintf(stderr, "Dataset cutoff is off!\n"); 01151 01152 01153 for(i=0; i<dab_list.size(); i++){ 01154 fprintf(stderr, "Reading %d: %s\n", i, dab_list[i].c_str()); 01155 CSeekIntIntMap d1(vecstrGenes.size()); 01156 string dabfile = dab_dir + "/" + dab_list[i]; 01157 CSparseFlatMatrix<float> sm (0); 01158 01159 if(norm_mode=="rank"){ 01160 if(sArgs.default_type_arg==0) //utype 01161 CSeekWriter::ReadSeekSparseMatrix<utype>(dabfile.c_str(), sm, d1, 01162 max_rank, rbp_p, vecstrGenes); 01163 else 01164 CSeekWriter::ReadSeekSparseMatrix<unsigned short>(dabfile.c_str(), 01165 sm, d1, max_rank, rbp_p, vecstrGenes); 01166 }else if(norm_mode=="subtract_z"){ 01167 if(sArgs.default_type_arg==0) //utype 01168 CSeekWriter::ReadSeekSparseMatrix<utype>(dabfile.c_str(), sm, d1, 01169 vecstrGenes, 1000, exp); 01170 else 01171 CSeekWriter::ReadSeekSparseMatrix<unsigned short>(dabfile.c_str(), sm, d1, 01172 vecstrGenes, 1000, exp); 01173 } 01174 const vector<utype> &allGenes = d1.GetAllReverse(); 01175 01176 for(j=0; j<vecstrAllQuery.size(); j++){ 01177 int numPresent = 0; 01178 for(k=0; k<qq[j].size(); k++){ 01179 if(d1.GetForward(qq[j][k])==(unsigned int)-1) continue; 01180 numPresent++; 01181 } 01182 if(search_mode=="eq" && numPresent==0){ 01183 continue; 01184 }else if(search_mode=="cv_loi" && numPresent<=1){ 01185 continue; 01186 } 01187 int numThreshold = (int) (threshold_q * qq[j].size()); 01188 if(numPresent>numThreshold) 01189 selectedDataset[j][i] = 1; 01190 } 01191 01192 #pragma omp parallel for \ 01193 shared(qu, sm, d1, dweight, final_score, count, freq, score_cutoff) \ 01194 private(j, k) firstprivate(bDatasetCutoff, i) schedule(dynamic) 01195 for(j=0; j<vecstrAllQuery.size(); j++){ 01196 //not enough query genes present 01197 if(selectedDataset[j][i]==0) continue; 01198 01199 float dw = 1.0; 01200 if(search_mode=="eq"){ 01201 dw = 1.0; 01202 }else{ 01203 //cv_weight_LOO(qu[j], sm, &d1, rbp_p, dw); 01204 cv_weight(qu[j], sm, &d1, rbp_p, dw); 01205 } 01206 01207 if(bDatasetCutoff){ 01208 if(score_cutoff[i]>dw) 01209 dw = 0; 01210 //fprintf(stderr, "%.3e %.3e\n", score_cutoff[i], dw); 01211 } 01212 //fprintf(stderr, "%.3e\n", dw); 01213 dweight[j][i] = dw; 01214 //if(dw==0) 01215 // continue; 01216 vector<float> tmp_score; 01217 get_score(tmp_score, sm, &d1, q_weight[j]); 01218 for(k=0; k<d1.GetNumSet(); k++){ 01219 utype gi = allGenes[k]; 01220 if(CMeta::IsNaN(final_score[j][gi])) 01221 final_score[j][gi] = 0; 01222 final_score[j][gi] += tmp_score[gi] * dw; 01223 count[j][gi]+=dw; 01224 freq[j][gi]++; 01225 } 01226 } 01227 } 01228 01229 /* 01230 ofstream ofsm; 01231 ofsm.open("/memex/qzhu/p4/concatenate_tumor_network.half", ios_base::binary); 01232 res.Save(ofsm, true); 01233 */ 01234 01235 for(j=0; j<vecstrAllQuery.size(); j++){ 01236 int countSelected = 0; 01237 for(k=0; k<selectedDataset[j].size(); k++) 01238 countSelected+=selectedDataset[j][k]; 01239 01240 int minRequired = (int) ((float) countSelected * threshold_g); 01241 01242 if(DEBUG){ 01243 int nG = 0; 01244 for(k=0; k<final_score[j].size(); k++){ 01245 if(freq[j][k]>=minRequired){ 01246 nG++; 01247 } 01248 } 01249 fprintf(stderr, "Query %d numSelectedDataset %d numGenesIntegrated %d\n", j, countSelected, nG); 01250 } 01251 01252 for(k=0; k<final_score[j].size(); k++){ 01253 if(freq[j][k]<minRequired){ 01254 final_score[j][k] = -320; 01255 continue; 01256 } 01257 if(CMeta::IsNaN(final_score[j][k])){ 01258 final_score[j][k] = -320; 01259 continue; 01260 } 01261 final_score[j][k] /= count[j][k]; 01262 } 01263 01264 sprintf(acBuffer, "%s/%d.query", output_dir.c_str(), j); 01265 CSeekTools::WriteArrayText(acBuffer, vecstrAllQuery[j]); 01266 sprintf(acBuffer, "%s/%d.gscore", output_dir.c_str(), j); 01267 CSeekTools::WriteArray(acBuffer, final_score[j]); 01268 sprintf(acBuffer, "%s/%d.dweight", output_dir.c_str(), j); 01269 CSeekTools::WriteArray(acBuffer, dweight[j]); 01270 } 01271 01272 //MODE 2 01273 /*vector<vector<utype> > qu; 01274 qu.resize(vecstrAllQuery.size()); 01275 for(i=0; i<vecstrAllQuery.size(); i++){ 01276 qu[i] = vector<utype>(); 01277 for(j=0; j<vecstrAllQuery[i].size(); j++) 01278 qu[i].push_back(mapstriGenes[vecstrAllQuery[i][j]]); 01279 } 01280 vector<vector<float> > q_weight; 01281 q_weight.resize(vecstrAllQuery.size()); 01282 for(i=0; i<vecstrAllQuery.size(); i++){ 01283 CSeekTools::InitVector(q_weight[i], numGenes, (float) 0); 01284 for(j=0; j<vecstrAllQuery[i].size(); j++) 01285 q_weight[i][mapstriGenes[vecstrAllQuery[i][j]]] = 1; 01286 } 01287 01288 float alpha1 = 0.1; //inter - propagation 01289 float alpha2 = 0.1; //intra - propagation 01290 //float alpha = 0; 01291 bool label_propagate = false; 01292 01293 for(i=0; i<dab_list.size(); i++){ 01294 CSeekIntIntMap d1(vecstrGenes.size()); 01295 vector<vector<float> > sm1; 01296 vector<utype> l1; 01297 string dabfile1 = dab_dir + "/" + dab_list[i]; 01298 CSeekWriter::ReadSparseMatrixAsArray(l1, dabfile1.c_str()); 01299 CSeekWriter::ReadSparseMatrix(l1, sm1, d1, 1000, 0.99, vecstrGenes); 01300 01301 cerr << "1 " << dabfile1 << endl; 01302 vector<vector<float> > final_score; 01303 final_score.resize(vecstrAllQuery.size()); 01304 01305 //vector<vector<float> > tmp_score; 01306 //tmp_score.resize(vecstrAllQuery.size()); 01307 01308 vector<vector<float> > tmp_score_2; 01309 tmp_score_2.resize(vecstrAllQuery.size()); 01310 01311 //this score 01312 //component 1 01313 for(j=0; j<vecstrAllQuery.size(); j++){ 01314 //get_score(tmp_score[j], sm1, &d1, q_weight[j]); 01315 init_score(tmp_score_2[j], &d1); 01316 init_score(final_score[j], &d1); 01317 } 01318 01319 for(j=0; label_propagate && j<dab_list.size(); j++){ 01320 if(i==j) continue; 01321 CSeekIntIntMap d2(vecstrGenes.size()); 01322 vector<vector<float> > sm2; 01323 vector<utype> l2; 01324 string dabfile2 = dab_dir + "/" + dab_list[j]; 01325 cerr << "2 " << dabfile2 << endl; 01326 CSeekWriter::ReadSparseMatrixAsArray(l2, dabfile2.c_str()); 01327 CSeekWriter::ReadSparseMatrix(l2, sm2, d2, 1000, 0.99, vecstrGenes); 01328 01329 //similarity matrix 01330 vector<vector<float> > sim; 01331 CSeekWriter::ProductNorm(sm1, sm2, d1, d2, sim); 01332 01333 utype kj, kk; 01334 for(k=0; k<vecstrAllQuery.size(); k++){ 01335 //component 2 01336 cerr << k << " of " << vecstrAllQuery.size() << endl; 01337 vector<float> intra; 01338 vector<float> inter; 01339 get_score(intra, sm2, &d2, q_weight[k]); 01340 get_score(inter, sim, &d2, intra); 01341 //get_score(inter, sim, &d2, q_weight[k]); 01342 add_score(inter, tmp_score_2[k], &d2); 01343 } 01344 } 01345 01346 for(j=0; j<vecstrAllQuery.size(); j++){ 01347 if(label_propagate){ 01348 vector<float> tmp3; 01349 integrate(q_weight[j], tmp_score_2[j], tmp3, &d1, dab_list.size()-1, alpha1); 01350 iterate(tmp3, final_score[j], sm1, &d1, alpha2, 1); 01351 }else{ 01352 //integrate(q_weight[j], tmp_score_2[j], tmp3, &d1, dab_list.size()-1, alpha1); 01353 iterate(q_weight[j], final_score[j], sm1, &d1, alpha2, 1); 01354 } 01355 01356 for(k=0; k<final_score[j].size(); k++){ 01357 if(CMeta::IsNaN(final_score[j][k])){ 01358 final_score[j][k] = -320; 01359 continue; 01360 } 01361 } 01362 01363 for(k=0; k<qu[j].size(); k++){ 01364 final_score[j][qu[j][k]] = -320; 01365 } 01366 01367 sprintf(acBuffer, "%s/%s/%d.query", output_dir.c_str(), dab_list[i].c_str(), j); 01368 CSeekTools::WriteArrayText(acBuffer, vecstrAllQuery[j]); 01369 sprintf(acBuffer, "%s/%s/%d.gscore", output_dir.c_str(), dab_list[i].c_str(), j); 01370 CSeekTools::WriteArray(acBuffer, final_score[j]); 01371 } 01372 }*/ 01373 01374 } 01375 01376 }