Sleipnir
|
00001 #include "stdafx.h" 00002 #include "cmdline.h" 00003 00004 struct bresult{ 00005 int i; 00006 int j; 00007 float f; 00008 float f2; 00009 }; 00010 00011 struct ascend{ 00012 bool operator()(const bresult& lx, const bresult& rx) const{ 00013 return lx.f < rx.f; 00014 } 00015 }; 00016 00017 struct descend{ 00018 bool operator()(const bresult& lx, const bresult& rx) const{ 00019 return lx.f > rx.f; 00020 } 00021 }; 00022 00023 00024 bool calculate_correlation( 00025 vector<vector<vector<float> > > &mat, 00026 vector<CSeekIntIntMap*> &dm, 00027 utype g1, 00028 utype g2, 00029 float &r){ 00030 00031 int numDatasets = mat.size(); 00032 int d, i; 00033 vector<float> v1, v2; 00034 00035 vector<int> count_common; 00036 CSeekTools::InitVector(count_common, numDatasets, (int) 0); 00037 00038 for(d=0; d<numDatasets; d++){ 00039 if(CSeekTools::IsNaN(dm[d]->GetForward(g1))) continue; 00040 count_common[d]++; 00041 } 00042 00043 for(d=0; d<numDatasets; d++){ 00044 if(CSeekTools::IsNaN(dm[d]->GetForward(g2))) continue; 00045 count_common[d]++; 00046 } 00047 00048 int total_common = 0; 00049 for(d=0; d<numDatasets; d++){ 00050 if(count_common[d]==2) total_common++; 00051 } 00052 00053 /*if(total_common<0.5*(float)numDatasets){ 00054 r = -320; 00055 return true; 00056 }*/ 00057 00058 for(d=0; d<numDatasets; d++){ 00059 if(count_common[d]==2){ 00060 utype n1 = dm[d]->GetForward(g1); 00061 utype n2 = dm[d]->GetForward(g2); 00062 int numExp = mat[d][n1].size(); 00063 for(i=0; i<numExp; i++){ 00064 v1.push_back(mat[d][n1][i]); 00065 v2.push_back(mat[d][n2][i]); 00066 /*if(isinf(mat[d][n1][i])||isnan(mat[d][n1][i])){ 00067 fprintf(stderr, "BAD: d %d g %d e %d v %.2f\n", d, n1, i, mat[d][n1][i]); 00068 } 00069 if(isinf(mat[d][n2][i])||isnan(mat[d][n2][i])){ 00070 fprintf(stderr, "BAD: d %d g %d e %d v %.2f\n", d, n2, i, mat[d][n2][i]); 00071 }*/ 00072 } 00073 } 00074 } 00075 00076 //pearson correlation calculation 00077 double mean_x = 0; 00078 double mean_y = 0; 00079 int xx = 0; 00080 for(xx=0; xx<v1.size(); xx++){ 00081 mean_x+=v1[xx]; 00082 mean_y+=v2[xx]; 00083 } 00084 mean_x /= (double) v1.size(); 00085 mean_y /= (double) v2.size(); 00086 00087 double sum_xy = 0; 00088 double dev_x = 0; 00089 double dev_y = 0; 00090 for(xx=0; xx<v1.size(); xx++){ 00091 sum_xy += (v1[xx] - mean_x) * (v2[xx] - mean_y); 00092 dev_x += (v1[xx] - mean_x) * (v1[xx] - mean_x); 00093 dev_y += (v2[xx] - mean_y) * (v2[xx] - mean_y); 00094 } 00095 dev_x = sqrt(dev_x); 00096 dev_y = sqrt(dev_y); 00097 r = (float) (sum_xy / dev_x / dev_y); 00098 if(isinf(r) || isnan(r)){ 00099 r = -320; 00100 } 00101 //fprintf(stderr, "%.2f\t%d\n", r, total_common); 00102 00103 return true; 00104 00105 } 00106 00107 int main(int iArgs, char **aszArgs){ 00108 static const size_t c_iBuffer = 1024; 00109 char acBuffer[c_iBuffer]; 00110 00111 gengetopt_args_info sArgs; 00112 ifstream ifsm; 00113 istream *pistm; 00114 vector<string> vecstrLine, vecstrGenes, vecstrDBs, vecstrQuery; 00115 utype i, qi, j, k, l; 00116 00117 if(cmdline_parser(iArgs, aszArgs, &sArgs)){ 00118 cmdline_parser_print_help(); 00119 return 1; 00120 } 00121 00122 if( sArgs.input_arg ) { 00123 ifsm.open( sArgs.input_arg ); 00124 pistm = &ifsm; 00125 }else{ 00126 pistm = &cin; 00127 } 00128 00129 fprintf(stderr, "Reading gene list\n"); 00130 map<string, size_t> mapstriGenes; 00131 while( !pistm->eof( ) ) { 00132 pistm->getline( acBuffer, c_iBuffer - 1 ); 00133 acBuffer[ c_iBuffer - 1 ] = 0; 00134 vecstrLine.clear( ); 00135 CMeta::Tokenize( acBuffer, vecstrLine ); 00136 if( vecstrLine.size( ) < 2 ) { 00137 cerr << "Ignoring line: " << acBuffer << endl; 00138 continue; 00139 } 00140 if( !( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) ) { 00141 cerr << "Illegal gene ID: " << vecstrLine[ 0 ] << " for " 00142 << vecstrLine[ 1 ] << endl; 00143 return 1; 00144 } 00145 i--; 00146 if( vecstrGenes.size( ) <= i ) 00147 vecstrGenes.resize( i + 1 ); 00148 vecstrGenes[ i ] = vecstrLine[ 1 ]; 00149 mapstriGenes[vecstrGenes[i]] = i; 00150 } 00151 00152 //char acBuffer[1024]; 00153 00154 fprintf(stderr, "Finished reading gene map\n"); 00155 if(sArgs.pcl_flag==1){ 00156 string pcl_dir = sArgs.pcl_dir_arg; 00157 string output_dir = sArgs.dir_out_arg; 00158 fprintf(stderr, "Arrived here\n"); 00159 vector<string> pcl_list; 00160 vector< vector<string> > vecstrAllQuery; 00161 int numGenes = vecstrGenes.size(); 00162 00163 fprintf(stderr, "Reading query\n"); 00164 if(!CSeekTools::ReadMultipleQueries(sArgs.query_arg, vecstrAllQuery)) 00165 return -1; 00166 00167 fprintf(stderr, "Reading pcl list\n"); 00168 CSeekTools::ReadListOneColumn(sArgs.pcl_list_arg, pcl_list); 00169 vector< vector< vector<float> > > mat; //only needed for cor-calc 00170 vector<CSeekIntIntMap*> dm; 00171 dm.resize(pcl_list.size()); 00172 mat.resize(pcl_list.size()); //only needed for cor-calc 00173 for(i=0; i<pcl_list.size(); i++){ 00174 fprintf(stderr, "Reading %d: %s\n", i, pcl_list[i].c_str()); 00175 dm[i] = new CSeekIntIntMap(vecstrGenes.size()); 00176 00177 string pclfile = pcl_dir + "/" + pcl_list[i] + ".bin"; 00178 00179 CPCL pcl; 00180 pcl.Open(pclfile.c_str()); 00181 int totNumExperiments = pcl.GetExperiments(); 00182 00183 vector<utype> presentIndex; 00184 vector<string> presentGeneNames; 00185 for(j=0; j<vecstrGenes.size(); j++){ 00186 utype g = pcl.GetGene(vecstrGenes[j]); 00187 if(CSeekTools::IsNaN(g)) continue; //gene does not exist in the dataset 00188 presentIndex.push_back(g); 00189 presentGeneNames.push_back(vecstrGenes[j]); 00190 dm[i]->Add(j); 00191 } 00192 00193 //only needed for correlation calc 00194 mat[i].resize(presentIndex.size()); 00195 for(j=0; j<presentIndex.size(); j++) 00196 mat[i][j].resize(totNumExperiments); 00197 00198 for(j=0; j<presentIndex.size(); j++){ 00199 float *val = pcl.Get(presentIndex[j]); 00200 for(k=0; k<pcl.GetExperiments(); k++){ 00201 mat[i][j][k] = val[k]; 00202 if(isinf(val[k])||isnan(val[k])){ 00203 fprintf(stderr, "loading error: %d of %d\n", k, pcl.GetExperiments()); 00204 } 00205 } 00206 } 00207 00208 } 00209 00210 float present_cutoff = 0.5; 00211 vector<char> absent_g; 00212 CSeekTools::InitVector(absent_g, numGenes, (char) 0); 00213 00214 for(i=0; i<numGenes; i++){ 00215 int totAbsent = 0; 00216 for(j=0; j<pcl_list.size(); j++){ 00217 CSeekIntIntMap *mapG = dm[j]; 00218 if(CSeekTools::IsNaN(mapG->GetForward(i))){ 00219 totAbsent++; 00220 } 00221 } 00222 if(totAbsent<0.5*(float)pcl_list.size()){ 00223 absent_g[i] = 0; 00224 }else{ 00225 absent_g[i] = 1; 00226 } 00227 } 00228 00229 CSeekIntIntMap *geneMap = new CSeekIntIntMap(numGenes); 00230 for(i=0; i<numGenes; i++){ 00231 if(absent_g[i]==1) continue; 00232 geneMap->Add(i); 00233 } 00234 00235 int numActualGenes = geneMap->GetNumSet(); 00236 00237 00238 //Do pair per machine, Step 2============================ 00239 /* 00240 vector<utype> pairs; 00241 CSeekTools::ReadArray("/tmp/pairs_to_do", pairs); 00242 00243 int numP = pairs.size()/2; 00244 int pi=0; 00245 const vector<utype> &allGenes = geneMap->GetAllReverse(); 00246 vector<float> cor; 00247 cor.resize(numP); 00248 00249 #pragma omp parallel for \ 00250 shared(allGenes, cor, mat, dm, pairs) \ 00251 private(pi) \ 00252 firstprivate(numP) schedule(dynamic) 00253 for(pi=0; pi<numP; pi++){ 00254 utype g1 = allGenes[pairs[pi*2]]; 00255 utype g2 = allGenes[pairs[pi*2+1]]; 00256 if(g1==g2){ 00257 cor[pi] = -320; 00258 continue; 00259 } 00260 calculate_correlation(mat, dm, g1, g2, cor[pi]); 00261 if(pi%1000==0){ 00262 fprintf(stderr, " %d of %d\n", pi, numP); 00263 } 00264 } 00265 00266 CSeekTools::WriteArray("/tmp/results_pairs", cor); 00267 */ 00268 //============================================================= 00269 00270 00271 00272 vector< vector<float> > correlations; 00273 correlations.resize(numActualGenes); 00274 for(i=0; i<numActualGenes; i++){ 00275 correlations[i].resize(numActualGenes); 00276 } 00277 00278 //combining pairs, Step 3================================= 00279 /* 00280 int max_i = 49; 00281 char file[256]; 00282 const vector<utype> &allGenes = geneMap->GetAllReverse(); 00283 00284 for(i=0; i<=max_i; i++){ 00285 vector<utype> pairs; 00286 vector<float> cor; 00287 00288 sprintf(file, "/memex/qzhu/p1/pairs_to_do.%d", i); 00289 CSeekTools::ReadArray(file, pairs); 00290 sprintf(file, "/memex/qzhu/p1/oct6/pairs_to_do.%d_results", i); 00291 CSeekTools::ReadArray(file, cor); 00292 00293 int numP = pairs.size()/2; 00294 int pi=0; 00295 00296 for(pi=0; pi<numP; pi++){ 00297 correlations[pairs[pi*2]][pairs[pi*2+1]] = cor[pi]; 00298 correlations[pairs[pi*2+1]][pairs[pi*2]] = cor[pi]; 00299 } 00300 } 00301 00302 vector<float> correlation1D; 00303 correlation1D.resize(numActualGenes*numActualGenes); 00304 int kk=0; 00305 for(i=0; i<numActualGenes; i++){ 00306 for(j=0; j<numActualGenes; j++){ 00307 correlation1D[kk] = correlations[i][j]; 00308 kk++; 00309 } 00310 } 00311 CSeekTools::WriteArray("/memex/qzhu/p1/aggregated_dataset_correlation", correlation1D); 00312 00313 fprintf(stderr, "Finished creating file\n"); 00314 getchar(); 00315 */ 00316 //============================================================ 00317 00318 00319 /* 00320 //generate pairs per machine, Step 1================================ 00321 vector<utype> pairs; 00322 int numP = numActualGenes*(numActualGenes-1); 00323 pairs.resize(numP); 00324 int ki = 0; 00325 for(i=0; i<numActualGenes; i++){ 00326 for(j=i+1; j<numActualGenes; j++){ 00327 pairs[ki] = i; 00328 ki++; 00329 pairs[ki] = j; 00330 ki++; 00331 } 00332 } 00333 00334 if(ki!=numP){ 00335 fprintf(stderr, "Cannot continue!\n"); 00336 return -1; 00337 } 00338 00339 int num_pairs_per_file = (numP / 2) / 49; 00340 00341 ki = 0; 00342 vector<utype> pp; 00343 pp.resize(num_pairs_per_file*2); 00344 char destfile[256]; 00345 int kj = 0; 00346 int ii = 0; 00347 00348 fprintf(stderr, "Numpairs per file: %d\n", num_pairs_per_file); 00349 for(ii=0; ii<numP/2; ii++){ 00350 if(ii%num_pairs_per_file==0 && ii>0){ 00351 sprintf(destfile, "/memex/qzhu/p1/pairs_to_do.%d", ki); 00352 CSeekTools::WriteArray(destfile, pp); 00353 pp.clear(); 00354 pp.resize(num_pairs_per_file*2); 00355 ki++; 00356 kj = 0; 00357 } 00358 pp[kj] = pairs[ii*2]; 00359 pp[kj+1] = pairs[ii*2+1]; 00360 kj+=2; 00361 if(ii==numP/2-1){ 00362 sprintf(destfile, "/memex/qzhu/p1/pairs_to_do.%d", ki); 00363 pp.resize(kj); 00364 CSeekTools::WriteArray(destfile, pp); 00365 } 00366 } 00367 00368 00369 fprintf(stderr, "Finished\n"); 00370 getchar(); 00371 //===================================================================== 00372 */ 00373 00374 //Load aggregated dataset, Step 4===================================== 00375 00376 vector<float> correlation1D; 00377 CSeekTools::ReadArray("/home/qzhu/Seek/aggregated_dataset_correlation", correlation1D); 00378 int pi = 0; 00379 for(i=0; i<numActualGenes; i++){ 00380 for(j=0; j<numActualGenes; j++){ 00381 correlations[i][j] = correlation1D[pi]; 00382 pi++; 00383 } 00384 } 00385 correlation1D.clear(); 00386 00387 //====================================================================== 00388 00389 00390 /* 00391 const vector<utype> &allGenes = geneMap->GetAllReverse(); 00392 //start correlation calculations 00393 for(i=0; i<numActualGenes; i++){ 00394 utype g1 = allGenes[i]; 00395 fprintf(stderr, "Gene %d of %d: %d\n", i, numActualGenes, numActualGenes - (i+1)); 00396 00397 #pragma omp parallel for \ 00398 shared(allGenes, correlations, mat, dm) \ 00399 private(j) \ 00400 firstprivate(numActualGenes, g1, i) schedule(dynamic) 00401 for(j=i+1; j<numActualGenes; j++){ 00402 utype tid = omp_get_thread_num(); 00403 utype g2 = allGenes[j]; 00404 if(g1==g2){ 00405 correlations[i][j] = -320; //Null value 00406 correlations[j][i] = -320; 00407 continue; 00408 } 00409 float r = -320; 00410 calculate_correlation(mat, dm, g1, g2, r); 00411 correlations[i][j] = r; 00412 correlations[j][i] = r; 00413 //if((j-(i+1))%500==0){ 00414 // fprintf(stderr, " %d of %d from %d\n", j - (i+1), numActualGenes - (i+1), tid); 00415 //} 00416 } 00417 fprintf(stderr, "Done\n"); 00418 } 00419 00420 vector<float> correlation1D; 00421 correlation1D.resize(numActualGenes*numActualGenes); 00422 int kk=0; 00423 for(i=0; i<numActualGenes; i++){ 00424 for(j=0; j<numActualGenes; j++){ 00425 correlation1D[kk] = correlations[i][j]; 00426 kk++; 00427 } 00428 } 00429 CSeekTools::WriteArray("/memex/qzhu/p1/aggregated_dataset_correlation", correlation1D); 00430 */ 00431 00432 //Evaluation Last step================================================== 00433 00434 const vector<utype> &allGenes = geneMap->GetAllReverse(); 00435 00436 for(i=0; i<vecstrAllQuery.size(); i++){ 00437 vector<utype> q; 00438 vector<char> is_query; 00439 00440 fprintf(stderr, "Query %d begin\n", i); //getchar(); 00441 00442 CSeekTools::InitVector(is_query, numGenes, (char) 0); 00443 for(j=0; j<vecstrAllQuery[i].size(); j++){ 00444 q.push_back(mapstriGenes[vecstrAllQuery[i][j]]); 00445 is_query[mapstriGenes[vecstrAllQuery[i][j]]] = 1; 00446 } 00447 00448 vector<float> gene_score; 00449 vector<char> gene_count; 00450 CSeekTools::InitVector(gene_score, numGenes, (float) CMeta::GetNaN()); 00451 CSeekTools::InitVector(gene_count, numGenes, (char) 0); 00452 int qi=0; 00453 int tot_present_q = 0; 00454 for(qi=0; qi<q.size(); qi++){ 00455 if(CSeekTools::IsNaN(geneMap->GetForward(q[qi]))) continue; 00456 tot_present_q++; 00457 utype qg = geneMap->GetForward(q[qi]); 00458 for(j=0; j<numActualGenes; j++){ 00459 utype gi = allGenes[j]; 00460 if(correlations[qg][j]==-320){ 00461 continue; 00462 } 00463 if(CMeta::IsNaN(gene_score[gi])){ 00464 gene_score[gi] = 0; 00465 } 00466 gene_score[gi] += correlations[qg][j]; 00467 gene_count[gi]++; 00468 //fprintf(stderr, "%.2f\n", correlations[qg][j]); 00469 } 00470 } 00471 00472 int total_non_zero = 0; 00473 for(j=0; j<numActualGenes; j++){ 00474 utype gi = allGenes[j]; 00475 if(gene_count[gi]!=tot_present_q){ 00476 gene_score[gi] = -320; 00477 continue; 00478 } 00479 if(CMeta::IsNaN(gene_score[gi])){ 00480 gene_score[gi] = -320; 00481 continue; 00482 } 00483 if(is_query[gi]==1){ 00484 gene_score[gi] = -320; 00485 continue; 00486 } 00487 if(tot_present_q==0){ 00488 gene_score[gi] = -320; 00489 continue; 00490 } 00491 gene_score[gi] /= (float) tot_present_q; 00492 total_non_zero++; 00493 } 00494 00495 //fprintf(stderr, "Total non zero genes: %d\n", total_non_zero); 00496 00497 for(j=0; j<numGenes; j++){ 00498 if(CMeta::IsNaN(gene_score[j])){ 00499 gene_score[j] = -320; 00500 } 00501 } 00502 00503 sprintf(acBuffer, "%s/%d.query", output_dir.c_str(), i); 00504 CSeekTools::WriteArrayText(acBuffer, vecstrAllQuery[i]); 00505 sprintf(acBuffer, "%s/%d.gscore", output_dir.c_str(), i); 00506 CSeekTools::WriteArray(acBuffer, gene_score); 00507 00508 //gs.clear(); 00509 //b.clear(); 00510 00511 } 00512 00513 00514 } 00515 00516 00517 }