Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 #include <iomanip> 00025 00026 00027 float** LoadGenes(const vector<string> struserGenes, 00028 const vector<utype> &veciGenes, const vector<string> &vecstrGenes, 00029 const vector<utype> &veciallGenes, CSeekIntIntMap *gmap, 00030 CSeekDataset *vcd, float **vall){ 00031 00032 float** v2 = CSeekTools::Init2DArray(struserGenes.size(), 00033 vecstrGenes.size(), CMeta::GetNaN()); 00034 size_t i, j; 00035 00036 for(i=0; i<struserGenes.size(); i++){ 00037 utype s = veciGenes[i]; //s is user gene ID 00038 if(CSeekTools::IsNaN(s)) continue; 00039 00040 for(j=0; j<vecstrGenes.size(); j++){ 00041 utype t = veciallGenes[j]; 00042 if(CSeekTools::IsNaN(t)) continue; 00043 if(CMeta::IsNaN(vall[s][t])) continue; 00044 if(CSeekTools::IsNaN(gmap->GetForward(t))) continue; 00045 v2[i][t] = vall[s][t]; 00046 } 00047 } 00048 return v2; 00049 } 00050 00051 float** ReadUserGenes(const vector<string> &struserGenes, 00052 const vector<utype> &veciGenes, const vector<string> &vecstrGenes, 00053 const vector<utype> &veciallGenes, CSeekIntIntMap *gmap, 00054 CSeekDataset *vcd, CDataPair &Dat){ 00055 00056 float** v2 = CSeekTools::Init2DArray(struserGenes.size(), 00057 vecstrGenes.size(), CMeta::GetNaN()); 00058 size_t i, j; 00059 00060 for(i=0; i<struserGenes.size(); i++){ 00061 utype s = veciGenes[i]; //s is user gene ID 00062 if(CSeekTools::IsNaN(s)) continue; 00063 float *v = Dat.GetFullRow(s); 00064 00065 for(j=0; j<vecstrGenes.size(); j++){ 00066 utype t = veciallGenes[j]; 00067 if(CSeekTools::IsNaN(t)) continue; 00068 if(CMeta::IsNaN(v[t])) continue; 00069 if(CSeekTools::IsNaN(gmap->GetForward(t))) continue; 00070 //Disable gene average subtraction! 00071 //v[t] = v[t] - vcd->GetGeneAverage(t); 00072 v2[i][t] = v[t]; 00073 //fprintf(stderr, "%.2f\n", v2[i][t]); 00074 } 00075 free(v); 00076 } 00077 return v2; 00078 } 00079 00080 float** ReadAllGenes(const vector<string> &vecstrGenes, 00081 const vector<utype> &veciallGenes, CSeekIntIntMap *gmap, 00082 CSeekDataset *vcd, CDataPair &Dat){ 00083 00084 float** v2 = CSeekTools::Init2DArray(vecstrGenes.size(), 00085 vecstrGenes.size(), CMeta::GetNaN()); 00086 size_t i, j; 00087 size_t ss = Dat.GetGenes(); 00088 00089 map<string, utype> mapstrintGenes; 00090 for(i=0; i<vecstrGenes.size(); i++) 00091 mapstrintGenes[vecstrGenes[i]] = i; 00092 00093 00094 //fprintf(stderr, "thread limit is %d\n", omp_get_max_threads()); 00095 00096 //getchar(); 00097 #pragma omp parallel for \ 00098 private(i, j) \ 00099 shared(Dat, gmap, mapstrintGenes, vcd, v2) \ 00100 firstprivate(ss) \ 00101 schedule(dynamic) 00102 for(i=0; i<ss; i++){ 00103 utype tid = omp_get_thread_num(); 00104 string s = Dat.GetGene(i); 00105 utype ii = mapstrintGenes[s]; 00106 if(CSeekTools::IsNaN(gmap->GetForward(ii))) continue; 00107 float *v = Dat.GetFullRow(i); 00108 00109 for(j=0; j<ss; j++){ 00110 string t = Dat.GetGene(j); 00111 utype jj = mapstrintGenes[t]; 00112 if(CSeekTools::IsNaN(gmap->GetForward(jj))) continue; 00113 v[j] = v[j] - vcd->GetGeneAverage(jj); 00114 v2[ii][jj] = v[j]; 00115 //fprintf(stderr, "%.2f\n", v2[s][t]); 00116 } 00117 //fprintf(stderr, "Done row %d", i); 00118 //getchar(); 00119 free(v); 00120 //fprintf(stderr, "TID %d Gene %d is done.\n", tid, i); 00121 //fprintf(stderr, "Gene %d / %d is done.\n", i, ss); 00122 //fprintf(stderr, "Done freeing row %d", i); getchar(); 00123 } 00124 return v2; 00125 } 00126 00127 bool GetRandomGenes(gsl_rng *r, int size, vector<string> &struserGenes, 00128 vector<utype> &veciGenes, const vector<string> &vecstrGenes, 00129 const vector<utype> &veciallGenes, CSeekIntIntMap *gmap){ 00130 00131 struserGenes.clear(); 00132 veciGenes.clear(); 00133 struserGenes.resize(size); 00134 veciGenes.resize(size); 00135 00136 utype i; 00137 int totSize = 0; 00138 00139 for(i=0; i<veciallGenes.size(); i++){ 00140 utype t = veciallGenes[i]; 00141 if(CSeekTools::IsNaN(t)) continue; 00142 if(CSeekTools::IsNaN(gmap->GetForward(t))) continue; 00143 totSize++; 00144 } 00145 00146 int *gsample = (int*)malloc(size*sizeof(int)); 00147 int *gs = (int*)malloc(totSize*sizeof(int)); 00148 utype ti = 0; 00149 00150 for(i=0; i<veciallGenes.size(); i++){ 00151 utype t = veciallGenes[i]; 00152 if(CSeekTools::IsNaN(t)) continue; 00153 if(CSeekTools::IsNaN(gmap->GetForward(t))) continue; 00154 gs[ti] = i; 00155 ti++; 00156 } 00157 00158 gsl_ran_choose(r, gsample, size, gs, totSize, sizeof(int)); 00159 00160 for(i=0; i<size; i++){ 00161 veciGenes[i] = veciallGenes[gsample[i]]; 00162 struserGenes[i] = vecstrGenes[gsample[i]]; 00163 } 00164 00165 free(gsample); 00166 free(gs); 00167 return true; 00168 } 00169 00170 bool GetMeanStdev(const vector<float> &f, float &mean, float &stdev){ 00171 size_t i, j; 00172 mean = 0; 00173 stdev = 0; 00174 for(i=0; i<f.size(); i++){ 00175 mean+=f[i]; 00176 } 00177 mean/=f.size(); 00178 for(i=0; i<f.size(); i++){ 00179 stdev+=(f[i] - mean) * (f[i] - mean); 00180 } 00181 stdev/=f.size(); 00182 stdev=sqrt(stdev); 00183 return true; 00184 } 00185 00186 bool CalculateWelch(const float &mean1, const float &stdev1, const int &size1, 00187 const float &mean2, const float &stdev2, const int &size2, 00188 double &pvalt, double &t){ 00189 00190 double v1 = (double) stdev1 * (double) stdev1; 00191 double v2 = (double) stdev2 * (double) stdev2; 00192 00193 int n1 = size1; 00194 int n2 = size2; 00195 t = (mean1 - mean2) /sqrt(v1/n1 + v2/n2); 00196 double x1 = v1/n1; 00197 double x2 = v2/n2; 00198 double i1 = x1 + x2; 00199 double df = i1*i1 / (x1*x1/(n1-1) + x2*x2/(n2-1)); 00200 if(t>0){ 00201 //fprintf(stderr, "t:%.2f df:%.2f n1:%d n2:%d mean1:%.2f mean2:%.2f v1:%.2f v2:%.2f\n", 00202 // t, df, n1, n2, mean1, mean2, v1, v2); 00203 double pval1 = gsl_cdf_tdist_Q(t, df); 00204 double pval2 = gsl_cdf_tdist_P(-1.0*t, df); 00205 pvalt = pval1 + pval2; 00206 00207 }else{ 00208 //fprintf(stderr, "t:%.2f df:%.2f n1:%d n2:%d mean1:%.2f mean2:%.2f v1:%.2f v2:%.2f\n", 00209 // t, df, n1, n2, mean1, mean2, v1, v2); 00210 double pval1 = gsl_cdf_tdist_Q(-1.0*t, df); 00211 double pval2 = gsl_cdf_tdist_P(t, df); 00212 pvalt = pval1 + pval2; 00213 } 00214 00215 return true; 00216 } 00217 00218 vector<string> Do_Pair_Proportion(){ 00219 00220 00221 } 00222 00223 00224 vector<string> Do_T_Test( 00225 gsl_rng *rnd, 00226 float **vall, 00227 const vector<utype> &veciGenes, //process 00228 const vector<string> &vecstrProcess, //process 00229 00230 const vector<utype> &veciallGenes, //background 00231 const vector<string> &vecstrGenes, //background 00232 CSeekIntIntMap *gmap 00233 ){ 00234 00235 unsigned int seed = gsl_rng_get(rnd); 00236 const gsl_rng_type *T; 00237 gsl_rng *r; 00238 gsl_rng_env_setup(); 00239 T = gsl_rng_default; 00240 r = gsl_rng_alloc(T); 00241 gsl_rng_set(r, seed); 00242 00243 size_t i, j, k; 00244 00245 vector< vector<utype> > vi; 00246 vector< vector<string> > si; 00247 vi.resize(100); 00248 si.resize(100); 00249 int size = veciGenes.size(); 00250 vector<string> outstr; 00251 00252 for(i=0; i<100; i++){ 00253 vi[i] = vector<utype>(); 00254 si[i] = vector<string>(); 00255 GetRandomGenes(r, size, si[i], vi[i], vecstrGenes, veciallGenes, gmap); 00256 } 00257 00258 float user_mean, user_stdev; 00259 vector<float> rand_mean, rand_stdev; 00260 rand_mean.resize(100); 00261 rand_stdev.resize(100); 00262 00263 for(i=0; i<100; i++){ 00264 vector<float> vv; //preparation 00265 for(j=0; j<veciGenes.size(); j++){ 00266 for(k=0; k<veciGenes.size(); k++){ 00267 if(CMeta::IsNaN(vall[vi[i][j]][vi[i][k]])) continue; 00268 vv.push_back(vall[vi[i][j]][vi[i][k]]); 00269 } 00270 } 00271 GetMeanStdev(vv, rand_mean[i], rand_stdev[i]); 00272 //fprintf(stderr, "Random %.2f %.2f\n", rand_mean[i], rand_stdev[i]); 00273 } 00274 00275 vector<float> vf; //preparation 00276 for(j=0; j<veciGenes.size(); j++){ 00277 for(k=0; k<veciGenes.size(); k++){ 00278 if(CMeta::IsNaN(vall[veciGenes[j]][veciGenes[k]])) continue; 00279 vf.push_back(vall[veciGenes[j]][veciGenes[k]]); 00280 } 00281 } 00282 GetMeanStdev(vf, user_mean, user_stdev); 00283 00284 ostringstream ost; 00285 ost.setf(ios::fixed); 00286 ost << "User " << setprecision (2) << user_mean << " " << setprecision (2) << user_stdev; 00287 outstr.push_back(ost.str()); 00288 00289 for(i=0; i<100; i++){ 00290 double pvalt, t; 00291 CalculateWelch(rand_mean[i], rand_stdev[i], veciGenes.size(), 00292 user_mean, user_stdev, veciGenes.size(), pvalt, t); 00293 //printf("%.2f %.2f %.3E\n", (float) user_mean - rand_mean[i], t, pvalt); 00294 ostringstream ost2; 00295 ost2.setf(ios::fixed); 00296 ost2 << setprecision(2) << (float) user_mean - rand_mean[i] << " " << setprecision(2) << t << " " << setprecision(2) << scientific << pvalt; 00297 outstr.push_back(ost2.str()); 00298 } 00299 return outstr; 00300 } 00301 00302 bool Get_Mann_Whitney_U_Statistics(const vector<float> &v1, const vector<float> &v2, 00303 double &U1, double &U2, double &z, double &auc){ 00304 00305 size_t i, j, k; 00306 vector<AResultFloat> vv; //preparation 00307 int s1 = v1.size(); 00308 int s2 = v2.size(); 00309 vv.resize(v1.size() + v2.size()); 00310 size_t kk = 0; 00311 for(i=0; i<v1.size(); i++){ 00312 vv[kk].i = 0; 00313 vv[kk].f = v1[i]; 00314 kk++; 00315 } 00316 for(i=0; i<v2.size(); i++){ 00317 vv[kk].i = 1; 00318 vv[kk].f = v2[i]; 00319 kk++; 00320 } 00321 00322 sort(vv.begin(), vv.end()); 00323 00324 U1 = 0; 00325 U2 = 0; 00326 for(j=0; j<vv.size(); j++){ 00327 if(vv[j].i==0){ 00328 U1+=j+1.0; 00329 }else{ 00330 U2+=j+1.0; 00331 } 00332 } 00333 U1-=(s1*(s1+1.0)/2.0); 00334 U2-=(s2*(s2+1.0)/2.0); 00335 00336 //normal approximation 00337 double mu = s1*s2 / 2.0; 00338 double sigma_u = sqrt((s1*s2)*(s1+s2+1.0)/12.0); 00339 z = (U1 - mu) / sigma_u; 00340 auc = U1/(s1*s2); 00341 return true; 00342 } 00343 00344 vector<string> Do_Mann_Whitney_U_Test( 00345 gsl_rng *rnd, 00346 float **vall, 00347 const vector<utype> &veciGenes, //process 00348 const vector<string> &vecstrProcess, //process 00349 00350 const vector<utype> &veciallGenes, //background 00351 const vector<string> &vecstrGenes, //background 00352 CSeekIntIntMap *gmap 00353 ){ 00354 00355 unsigned int seed = gsl_rng_get(rnd); 00356 vector<string> outstr; 00357 00358 const gsl_rng_type *T; 00359 gsl_rng *r; 00360 gsl_rng_env_setup(); 00361 T = gsl_rng_default; 00362 r = gsl_rng_alloc(T); 00363 gsl_rng_set(r, seed); 00364 00365 size_t i, j, k; 00366 00367 vector< vector<utype> > vi; 00368 vector< vector<string> > si; 00369 vi.resize(100); 00370 si.resize(100); 00371 int size = veciGenes.size(); 00372 00373 for(i=0; i<100; i++){ 00374 vi[i] = vector<utype>(); 00375 si[i] = vector<string>(); 00376 GetRandomGenes(r, size, si[i], vi[i], vecstrGenes, veciallGenes, gmap); 00377 } 00378 00379 vector<double> U1, U2; 00380 vector<float> z; 00381 vector<int> s1, s2; 00382 vector<float> auc; 00383 00384 U1.resize(100); U2.resize(100); z.resize(100); 00385 s1.resize(100); s2.resize(100); auc.resize(100); 00386 00387 for(i=0; i<100; i++){ 00388 vector<AResultFloat> vv; //preparation 00389 s1[i] = 0; 00390 s2[i] = 0; 00391 for(j=0; j<veciGenes.size(); j++){ 00392 for(k=0; k<veciGenes.size(); k++){ 00393 if(CMeta::IsNaN(vall[vi[i][j]][vi[i][k]])) continue; 00394 s1[i]++; 00395 } 00396 } 00397 for(j=0; j<veciGenes.size(); j++){ 00398 for(k=0; k<veciGenes.size(); k++){ 00399 if(CMeta::IsNaN(vall[veciGenes[j]][veciGenes[k]])) continue; 00400 s2[i]++; 00401 } 00402 } 00403 vv.resize(s1[i]+s2[i]); 00404 00405 int kk = 0; 00406 for(j=0; j<veciGenes.size(); j++){ 00407 for(k=0; k<veciGenes.size(); k++){ 00408 if(CMeta::IsNaN(vall[vi[i][j]][vi[i][k]])) continue; 00409 vv[kk].f = vall[vi[i][j]][vi[i][k]]; 00410 vv[kk].i = 0; 00411 kk++; 00412 } 00413 } 00414 for(j=0; j<veciGenes.size(); j++){ 00415 for(k=0; k<veciGenes.size(); k++){ 00416 if(CMeta::IsNaN(vall[veciGenes[j]][veciGenes[k]])) continue; 00417 vv[kk].f = vall[veciGenes[j]][veciGenes[k]]; 00418 vv[kk].i = 1; 00419 kk++; 00420 } 00421 } 00422 sort(vv.begin(), vv.end()); 00423 00424 U1[i] = 0; 00425 U2[i] = 0; 00426 for(j=0; j<vv.size(); j++){ 00427 if(vv[j].i==0){ 00428 U1[i]+=j+1.0; 00429 }else{ 00430 U2[i]+=j+1.0; 00431 } 00432 } 00433 U1[i]-=(s1[i]*(s1[i]+1.0)/2.0); 00434 U2[i]-=(s2[i]*(s2[i]+1.0)/2.0); 00435 00436 /*double U = 0; 00437 if(U1<U2){ 00438 U = U1; 00439 }else{ 00440 U = U2; 00441 }*/ 00442 00443 //normal approximation 00444 double mu = s1[i]*s2[i] / 2.0; 00445 double sigma_u = sqrt((s1[i]*s2[i])*(s1[i]+s2[i]+1.0)/12.0); 00446 z[i] = (U1[i] - mu) / sigma_u; 00447 auc[i] = U1[i]/(s1[i]*s2[i]); 00448 //printf("%d %.2f %.2f %.2f %.2f\n", s1, (float) U1, (float) z, (float) U1/(s1*s2), (float) U2/(s1*s2)); 00449 } 00450 00451 for(i=0; i<100; i++){ 00452 ostringstream ost; 00453 ost.setf(ios::fixed); 00454 ost << s1[i] << " " << U1[i] << " " << setprecision (2) << z[i] << " " << setprecision (2) << auc[i]; 00455 outstr.push_back(ost.str()); 00456 } 00457 00458 return outstr; 00459 } 00460 00461 vector<string> Do_Mann_Whitney_U_Test_By_Gene(gsl_rng *rnd, float **vall, 00462 const vector<string> &vecstrProcess, 00463 const vector<utype> &veciProcess, 00464 const vector<string> &vecstrGenes, 00465 const vector<utype> &veciallGenes, 00466 CSeekIntIntMap *gmap 00467 ){ 00468 unsigned int seed = gsl_rng_get(rnd); 00469 00470 vector<string> outstr; 00471 size_t i, j, jj, k; 00472 int size = veciProcess.size(); 00473 int trials = 1; 00474 int perGeneTrials = 11; 00475 00476 const gsl_rng_type *T; 00477 gsl_rng *r; 00478 gsl_rng_env_setup(); 00479 T = gsl_rng_default; 00480 r = gsl_rng_alloc(T); 00481 gsl_rng_set(r, seed); 00482 00483 for(i=0; i<trials; i++){ 00484 vector<utype> veciRandom; 00485 vector<string> vecstrRandom; 00486 GetRandomGenes(r, size, vecstrRandom, veciRandom, vecstrGenes, veciallGenes, gmap); 00487 00488 outstr.push_back("Process"); 00489 00490 //Process 00491 for(j=0; j<size; j++){ 00492 utype thisGene = veciProcess[j]; 00493 vector<float> vProcess; 00494 for(k=0; k<size; k++){ 00495 float x = vall[thisGene][veciProcess[k]]; 00496 if(CMeta::IsNaN(x)) continue; 00497 vProcess.push_back(x); 00498 } 00499 00500 ostringstream ost; 00501 ost << "Gene " << vecstrProcess[j] << " size " << vProcess.size(); 00502 outstr.push_back(ost.str()); 00503 //fprintf(stdout, "Gene %s size %d\n", vecstrProcess[j].c_str(), vProcess.size()); 00504 00505 vector<AResultFloat> ar; 00506 vector<double> U1, U2, z, auc; 00507 00508 ar.resize(perGeneTrials); 00509 U1.resize(perGeneTrials); 00510 U2.resize(perGeneTrials); 00511 z.resize(perGeneTrials); 00512 auc.resize(perGeneTrials); 00513 00514 for(jj=0; jj<perGeneTrials; jj++){ 00515 vector<utype> veciRandomGene; 00516 vector<string> vecstrRandomGene; 00517 GetRandomGenes(r, size, vecstrRandomGene, veciRandomGene, vecstrGenes, veciallGenes, gmap); 00518 00519 vector<float> vRandom; 00520 for(k=0; k<size; k++){ 00521 float x = vall[thisGene][veciRandomGene[k]]; 00522 if(CMeta::IsNaN(x)) continue; 00523 vRandom.push_back(x); 00524 } 00525 Get_Mann_Whitney_U_Statistics(vProcess, vRandom, U1[jj], U2[jj], z[jj], auc[jj]); 00526 00527 ar[jj].i = jj; 00528 ar[jj].f = auc[jj]; 00529 00530 //fprintf(stdout, "%d %d %.2f %.2f\n", (int) U1, (int) U2, z, auc); 00531 } 00532 00533 sort(ar.begin(), ar.end()); 00534 int med = perGeneTrials/2; 00535 int ar_med = ar[med].i; 00536 //Show the median 00537 ostringstream ost2; 00538 ost2.setf(ios::fixed); 00539 ost2 << (int) U1[ar_med] << " " << (int) U2[ar_med] << " " << setprecision(2) << z[ar_med] << " " << setprecision(2) << auc[ar_med]; 00540 outstr.push_back(ost2.str()); 00541 //fprintf(stdout, "%d %d %.2f %.2f\n", (int) U1[ar_med], (int) U2[ar_med], z[ar_med], auc[ar_med]); 00542 00543 } 00544 00545 outstr.push_back("Random"); 00546 //fprintf(stdout, "Random\n"); 00547 //If process genes were just randomly selected geneset 00548 for(j=0; j<size; j++){ 00549 utype thisGene = veciRandom[j]; 00550 vector<float> vRan; 00551 for(k=0; k<size; k++){ 00552 float x = vall[thisGene][veciRandom[k]]; 00553 if(CMeta::IsNaN(x)) continue; 00554 vRan.push_back(x); 00555 } 00556 00557 ostringstream ost; 00558 ost << "Gene number " << j << " size " << vRan.size(); 00559 outstr.push_back(ost.str()); 00560 00561 vector<AResultFloat> ar; 00562 vector<double> U1, U2, z, auc; 00563 00564 ar.resize(perGeneTrials); 00565 U1.resize(perGeneTrials); 00566 U2.resize(perGeneTrials); 00567 z.resize(perGeneTrials); 00568 auc.resize(perGeneTrials); 00569 00570 for(jj=0; jj<perGeneTrials; jj++){ 00571 vector<utype> veciRandomGene; 00572 vector<string> vecstrRandomGene; 00573 GetRandomGenes(r, size, vecstrRandomGene, veciRandomGene, vecstrGenes, veciallGenes, gmap); 00574 vector<float> vRandom; 00575 for(k=0; k<size; k++){ 00576 float x = vall[thisGene][veciRandomGene[k]]; 00577 if(CMeta::IsNaN(x)) continue; 00578 vRandom.push_back(x); 00579 } 00580 Get_Mann_Whitney_U_Statistics(vRan, vRandom, U1[jj], U2[jj], z[jj], auc[jj]); 00581 00582 //fprintf(stdout, "%d %d %.2f %.2f\n", (int) U1, (int) U2, z, auc); 00583 ar[jj].i = jj; 00584 ar[jj].f = auc[jj]; 00585 } 00586 00587 sort(ar.begin(), ar.end()); 00588 int med = perGeneTrials/2; 00589 int ar_med = ar[med].i; 00590 //Show the median 00591 //fprintf(stdout, "%d %d %.2f %.2f\n", (int) U1[ar_med], (int) U2[ar_med], z[ar_med], auc[ar_med]); 00592 ostringstream ost2; 00593 ost2.setf(ios::fixed); 00594 ost2 << (int) U1[ar_med] << " " << (int) U2[ar_med] << " " << setprecision (2) << z[ar_med] << " " << setprecision (2) << auc[ar_med]; 00595 outstr.push_back(ost2.str()); 00596 00597 } 00598 00599 00600 } 00601 00602 return outstr; 00603 } 00604 00605 vector<string> Do_One(const char *file, gsl_rng *rnd, CSeekDataset *vcd, float **vall, 00606 map<string, utype> &mapstrintGene, vector<string> &vecstrGenes){ 00607 00608 size_t i, j, k; 00609 vector<string> ostr; 00610 00611 vector<string> vecstrUserGenes; 00612 CSeekStrIntMap mapTmp; 00613 if(!CSeekTools::ReadListOneColumn(file, vecstrUserGenes, mapTmp)) 00614 return ostr; 00615 00616 vector<utype> userGenes; 00617 for(i=0; i<vecstrUserGenes.size(); i++){ 00618 userGenes.push_back(mapstrintGene[vecstrUserGenes[i]]); 00619 } 00620 00621 CSeekIntIntMap *gmap = vcd->GetGeneMap(); 00622 00623 vector<string> struserGenes; 00624 for(i=0; i<userGenes.size(); i++){ 00625 if(CSeekTools::IsNaN(gmap->GetForward(userGenes[i]))){ 00626 ostringstream ost; 00627 ost << "Error: gene " << vecstrUserGenes[i] << " is not found in this dataset"; 00628 ostr.push_back(ost.str()); 00629 continue; 00630 } 00631 struserGenes.push_back(vecstrGenes[userGenes[i]]); 00632 } 00633 00634 if(struserGenes.size()==0 || struserGenes.size()<5){ 00635 return ostr; 00636 } 00637 00638 vector<utype> veciGenes; 00639 veciGenes.clear(); //only user gene set 00640 veciGenes.resize(struserGenes.size()); 00641 for( i = 0; i < struserGenes.size( ); ++i ){ 00642 //veciGenes[ i ] = Dat.GetGene( struserGenes[i] ); 00643 veciGenes[i] = mapstrintGene[struserGenes[i]]; 00644 } 00645 00646 vector<utype> veciallGenes; 00647 veciallGenes.clear(); //all genes 00648 veciallGenes.resize(vecstrGenes.size()); 00649 for( i = 0; i < vecstrGenes.size( ); ++i ){ 00650 //veciallGenes[ i ] = Dat.GetGene( vecstrGenes[i] ); 00651 veciallGenes[i] = mapstrintGene[vecstrGenes[i]]; 00652 } 00653 00654 //float** v2 = LoadGenes(struserGenes, veciGenes, vecstrGenes, 00655 // veciallGenes, gmap, vcd, vall); 00656 00657 vector<string> outstr = 00658 // Do_Mann_Whitney_U_Test_By_Gene(rnd, vall, struserGenes, veciGenes, vecstrGenes, veciallGenes, gmap); 00659 Do_T_Test(rnd, vall, veciGenes, struserGenes, veciallGenes, vecstrGenes, gmap); 00660 //Do_Mann_Whitney_U_Test(rnd, vall, veciGenes, struserGenes, veciallGenes, vecstrGenes, gmap); 00661 00662 00663 for(i=0; i<outstr.size(); i++){ 00664 ostr.push_back(outstr[i]); 00665 } 00666 00667 //Do_T_Test(vall, veciGenes, struserGenes, veciallGenes, vecstrGenes, gmap); 00668 //ostr = Do_Mann_Whitney_U_Test(vall, veciGenes, struserGenes, veciallGenes, vecstrGenes, gmap); 00669 00670 /* 00671 int *a1 = (int*)malloc(2*sizeof(int)); 00672 int *a2 = (int*)malloc(size*sizeof(int)); 00673 for(i=0; i<size; i++){ 00674 a2[i] = i; 00675 } 00676 00677 fprintf(stderr, "Random to random\n"); 00678 for(i=0; i<100; i++){ 00679 gsl_ran_choose(rnd, a1, 2, a2, size, sizeof(int)); 00680 double pvalt, t; 00681 CalculateWelch(rand_mean[a1[0]], rand_stdev[a1[0]], veciGenes.size(), 00682 rand_mean[a1[1]], rand_stdev[a1[1]], veciGenes.size(), pvalt, t); 00683 fprintf(stderr, "%.2f %.3E\n", t, pvalt); 00684 } 00685 00686 free(a1); 00687 free(a2); 00688 */ 00689 //CSeekTools::Free2DArray(v2); 00690 00691 //for(i=0; i<100; i++){ 00692 // CSeekTools::Free2DArray(randomGenes[i]); 00693 //} 00694 return ostr; 00695 } 00696 00697 00698 int main( int iArgs, char** aszArgs ) { 00699 static const size_t c_iBuffer = 1024; 00700 gengetopt_args_info sArgs; 00701 ifstream ifsm; 00702 istream* pistm; 00703 vector<string> vecstrLine, vecstrGenes, vecstrDatasets, vecstrQuery, vecstrUserDatasets; 00704 char acBuffer[ c_iBuffer ]; 00705 size_t i, j, k; 00706 00707 00708 omp_set_num_threads(8); 00709 /*utype ii; 00710 omp_set_num_threads(8); 00711 #pragma omp parellel for \ 00712 private(i) \ 00713 schedule(static, 1) 00714 for(ii=0; ii<100; ii++){ 00715 utype tid = omp_get_thread_num(); 00716 fprintf(stderr, "Hello World %d\n", tid); 00717 }*/ 00718 00719 00720 const gsl_rng_type *T; 00721 gsl_rng *rnd; 00722 gsl_rng_env_setup(); 00723 T = gsl_rng_default; 00724 rnd = gsl_rng_alloc(T); 00725 00726 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00727 cmdline_parser_print_help( ); 00728 return 1; } 00729 00730 vector<string> vecstrGeneID; 00731 map<string, utype> mapstrintGene; 00732 if(!CSeekTools::ReadListTwoColumns(sArgs.input_arg, vecstrGeneID, vecstrGenes)) 00733 return 1; 00734 for(i=0; i<vecstrGenes.size(); i++) 00735 mapstrintGene[vecstrGenes[i]] = i; 00736 00737 if(sArgs.db_flag==1){ 00738 vector<float> quant; 00739 CSeekTools::ReadQuantFile(sArgs.quant_arg, quant); 00740 00741 vector<string> vecstrDataset, vDP; 00742 00743 if(!CSeekTools::ReadListTwoColumns(sArgs.dataset_list_arg, 00744 vecstrDataset, vDP)) 00745 return false; 00746 00747 CDatabase *DB = new CDatabase(false); 00748 DB->Open(sArgs.db_dir_arg, vecstrGenes, vecstrDataset.size(), sArgs.db_num_arg); 00749 00750 string strPrepInputDirectory = sArgs.prep_arg; 00751 string strSinfoInputDirectory = sArgs.sinfo_arg; 00752 vector<CSeekDataset*> vc; 00753 vc.resize(vecstrDataset.size()); 00754 size_t i, j, k; 00755 for(i=0; i<vecstrDataset.size(); i++){ 00756 vc[i] = new CSeekDataset(); 00757 string strFileStem = vecstrDataset[i]; 00758 string strAvgPath = strPrepInputDirectory+"/"+ 00759 strFileStem + ".gavg"; 00760 string strPresencePath = strPrepInputDirectory+"/"+ 00761 strFileStem + ".gpres"; 00762 string strSinfoPath = strSinfoInputDirectory+"/"+ 00763 strFileStem + ".sinfo"; 00764 vc[i]->ReadGeneAverage(strAvgPath); 00765 vc[i]->ReadGenePresence(strPresencePath); 00766 vc[i]->ReadDatasetAverageStdev(strSinfoPath); 00767 vc[i]->InitializeGeneMap(); 00768 } 00769 00770 size_t iGenes = vecstrGenes.size(); 00771 size_t iDatasets = vecstrDataset.size(); 00772 00773 vector<string> vecstrQuery; 00774 CSeekTools::ReadMultiGeneOneLine(sArgs.query_arg, vecstrQuery); 00775 //Need to load the query 00776 00777 vector<char> cQuery; 00778 CSeekTools::InitVector(cQuery, iGenes, (char)0); 00779 for(i=0; i<vecstrQuery.size(); i++){ 00780 if(mapstrintGene.find(vecstrQuery[i])==mapstrintGene.end()) continue; 00781 utype k = mapstrintGene.find(vecstrQuery[i])->second; 00782 cQuery[k] = 1; 00783 } 00784 vector<utype> allQ; 00785 for(i=0; i<cQuery.size(); i++) 00786 if(cQuery[i]==1) 00787 allQ.push_back(i); 00788 allQ.resize(allQ.size()); 00789 00790 for(i=0; i<iDatasets; i++) 00791 if(vc[i]->GetDBMap()!=NULL) 00792 vc[i]->DeleteQueryBlock(); 00793 00794 for(i=0; i<iDatasets; i++) 00795 vc[i]->InitializeQueryBlock(allQ); 00796 00797 size_t m, d; 00798 for(i=0; i<allQ.size(); i++){ 00799 m = allQ[i]; 00800 vector<unsigned char> Qi; 00801 if(!DB->GetGene(m, Qi)){ 00802 cerr << "Gene does not exist" << endl; 00803 continue; 00804 } 00805 utype db; 00806 CSeekIntIntMap *qu = NULL; 00807 unsigned char **r = NULL; 00808 for(j=0; j<vecstrDataset.size(); j++){ 00809 if((qu=vc[j]->GetDBMap())==NULL) 00810 continue; 00811 if(CSeekTools::IsNaN(db=(qu->GetForward(m)))) 00812 continue; 00813 for(r = vc[j]->GetMatrix(), k=0; k<iGenes; k++) 00814 r[db][k] = Qi[k*vecstrDataset.size()+j]; 00815 } 00816 Qi.clear(); 00817 } 00818 00819 if(!!sArgs.count_pair_flag){ 00820 map<float,vector<int> > countPairs; 00821 float point = 0.0; 00822 while(point<=5.0){ 00823 countPairs[point] = vector<int>(); 00824 CSeekTools::InitVector(countPairs[point], iDatasets, (int)0); 00825 point+=0.25; 00826 } 00827 for(k=0; k<iDatasets; k++){ 00828 CSeekIntIntMap *mapQ = vc[k]->GetDBMap(); 00829 CSeekIntIntMap *mapG = vc[k]->GetGeneMap(); 00830 if(mapQ==NULL) continue; 00831 unsigned char **f = vc[k]->GetMatrix(); 00832 size_t qi, qj; 00833 for(qi=0; qi<allQ.size(); qi++){ 00834 utype gene_qi = allQ[qi]; 00835 utype iQ = mapQ->GetForward(gene_qi); 00836 if(CSeekTools::IsNaN(iQ)) continue; 00837 for(qj=qi+1; qj<allQ.size(); qj++){ 00838 utype gene_qj = allQ[qj]; 00839 utype jQ = mapG->GetForward(gene_qj); 00840 if(CSeekTools::IsNaN(jQ)) continue; 00841 unsigned char uc = f[iQ][gene_qj]; 00842 if(uc==255) continue; 00843 float vv = quant[uc]; 00844 point = 0.0; 00845 while(point<=5.0){ 00846 if(vv>point) 00847 countPairs[point][k]++; 00848 point+=0.25; 00849 } 00850 } 00851 } 00852 } 00853 for(i=0; i<iDatasets; i++) 00854 vc[i]->DeleteQueryBlock(); 00855 point = 0.0; 00856 while(point<=5.0){ 00857 sort(countPairs[point].begin(), countPairs[point].end(), greater<int>()); 00858 float tmp = 0; 00859 for(i=0; i<10; i++) 00860 tmp+=(float)countPairs[point][i]; 00861 tmp/=10.0; 00862 fprintf(stderr, "%.2f\t%.1f pairs\n", point, tmp); 00863 point+=0.25; 00864 } 00865 } 00866 00867 if(!!sArgs.histogram_flag){ 00868 srand(unsigned(time(0))); 00869 vector<int> dID; 00870 for(k=0; k<iDatasets; k++) 00871 dID.push_back(k); 00872 random_shuffle(dID.begin(), dID.end()); 00873 utype kk; 00874 for(kk=0; kk<100; kk++){ 00875 k = dID[kk]; 00876 CSeekIntIntMap *mapQ = vc[k]->GetDBMap(); 00877 CSeekIntIntMap *mapG = vc[k]->GetGeneMap(); 00878 if(mapQ==NULL) continue; 00879 unsigned char **f = vc[k]->GetMatrix(); 00880 size_t qi, ii; 00881 for(qi=0; qi<allQ.size(); qi++){ 00882 utype gene_qi = allQ[qi]; 00883 utype iQ = mapQ->GetForward(gene_qi); 00884 if(CSeekTools::IsNaN(iQ)) continue; 00885 vector<float> z_score; 00886 const vector<utype> &allRGenes = mapG->GetAllReverse(); 00887 for(ii=0; ii<mapG->GetNumSet(); ii++){ 00888 size_t ij = allRGenes[ii]; 00889 float a = vc[k]->GetGeneAverage(ij); 00890 unsigned char x = f[iQ][ij]; 00891 if(x==255) continue; 00892 //float xnew = quant[x] - a; 00893 float xnew = quant[x]; 00894 z_score.push_back(xnew); 00895 } 00896 sort(z_score.begin(), z_score.end()); 00897 float pts[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9}; 00898 fprintf(stderr, "Query %s\tDataset %d\t", vecstrGenes[gene_qi].c_str(), k); 00899 for(ii=0; ii<9; ii++) 00900 fprintf(stderr, "%.2f\t", z_score[(int)(pts[ii]*z_score.size())]); 00901 fprintf(stderr, "\n"); 00902 } 00903 } 00904 return 0; 00905 } 00906 00907 } 00908 00909 CSeekStrIntMap mapTmp; 00910 vector<string> vecstrList; 00911 if(!CSeekTools::ReadListOneColumn(sArgs.gene_set_list_arg, vecstrList, mapTmp)) 00912 return 1; 00913 00914 if(sArgs.dab_flag==1){ 00915 CSeekDataset *vcd = new CSeekDataset(); 00916 00917 string strAvg = sArgs.gavg_input_arg; 00918 string strPres = sArgs.gpres_input_arg; 00919 vcd->ReadGeneAverage(strAvg); 00920 vcd->ReadGenePresence(strPres); 00921 vcd->InitializeGeneMap(); 00922 00923 CDataPair Dat; 00924 if(!Dat.Open(sArgs.dabinput_arg, false, false)){ 00925 cerr << "Error opening dab file" << endl; 00926 return 1; 00927 } 00928 //fprintf(stderr, "Read.\n"); 00929 //getchar(); 00930 00931 CSeekIntIntMap *gmap = vcd->GetGeneMap(); 00932 vector<utype> veciallGenes; 00933 veciallGenes.clear(); //all genes 00934 veciallGenes.resize(vecstrGenes.size()); 00935 for( i = 0; i < vecstrGenes.size( ); ++i ){ 00936 veciallGenes[i] = mapstrintGene[vecstrGenes[i]]; 00937 //fprintf(stderr, "Gene %s is id %d %d\n", vecstrGenes[i].c_str(), i, Dat.GetGene(vecstrGenes[i])); 00938 } 00939 //getchar(); 00940 00941 float** vall = ReadAllGenes(vecstrGenes, veciallGenes, gmap, vcd, Dat); 00942 //fprintf(stderr, "Finished loading all\n"); 00943 00944 vector< vector<string> > vx; 00945 vx.resize(vecstrList.size()); 00946 00947 //getchar(); 00948 #pragma omp parallel for \ 00949 private(i) \ 00950 shared(vx, vcd, vall, mapstrintGene, vecstrGenes, vecstrList) \ 00951 schedule(dynamic) 00952 for(i=0; i<vecstrList.size(); i++){ 00953 fprintf(stderr, "Doing: %s\n", vecstrList[i].c_str()); 00954 vx[i] = Do_One(vecstrList[i].c_str(), rnd, vcd, vall, mapstrintGene, vecstrGenes); 00955 } 00956 00957 for(i=0; i<vecstrList.size(); i++){ 00958 printf("File: %s\n", vecstrList[i].c_str()); 00959 for(j=0; j<vx[i].size(); j++){ 00960 cout << vx[i][j] << endl; 00961 } 00962 } 00963 00964 00965 00966 00967 } 00968 00969 00970 //#ifdef WIN32 00971 // pthread_win32_process_detach_np( ); 00972 //#endif // WIN32 00973 return 0; 00974 00975 }