Sleipnir
tools/SeekTest/SeekTest.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 #include <iomanip>
00025 
00026 
00027 float** LoadGenes(const vector<string> struserGenes, 
00028     const vector<utype> &veciGenes, const vector<string> &vecstrGenes, 
00029     const vector<utype> &veciallGenes, CSeekIntIntMap *gmap, 
00030     CSeekDataset *vcd, float **vall){
00031 
00032     float** v2 = CSeekTools::Init2DArray(struserGenes.size(), 
00033         vecstrGenes.size(), CMeta::GetNaN());
00034     size_t i, j;
00035 
00036     for(i=0; i<struserGenes.size(); i++){
00037         utype s = veciGenes[i]; //s is user gene ID
00038         if(CSeekTools::IsNaN(s)) continue;
00039 
00040         for(j=0; j<vecstrGenes.size(); j++){
00041             utype t = veciallGenes[j];
00042             if(CSeekTools::IsNaN(t)) continue;
00043             if(CMeta::IsNaN(vall[s][t])) continue;
00044             if(CSeekTools::IsNaN(gmap->GetForward(t))) continue;
00045             v2[i][t] = vall[s][t];
00046         }
00047     }
00048     return v2;
00049 }
00050 
00051 float** ReadUserGenes(const vector<string> &struserGenes, 
00052     const vector<utype> &veciGenes, const vector<string> &vecstrGenes, 
00053     const vector<utype> &veciallGenes, CSeekIntIntMap *gmap, 
00054     CSeekDataset *vcd, CDataPair &Dat){
00055 
00056     float** v2 = CSeekTools::Init2DArray(struserGenes.size(), 
00057         vecstrGenes.size(), CMeta::GetNaN());
00058     size_t i, j;
00059 
00060     for(i=0; i<struserGenes.size(); i++){
00061         utype s = veciGenes[i]; //s is user gene ID
00062         if(CSeekTools::IsNaN(s)) continue;
00063         float *v = Dat.GetFullRow(s);
00064 
00065         for(j=0; j<vecstrGenes.size(); j++){
00066             utype t = veciallGenes[j];
00067             if(CSeekTools::IsNaN(t)) continue;
00068             if(CMeta::IsNaN(v[t])) continue;
00069             if(CSeekTools::IsNaN(gmap->GetForward(t))) continue;
00070             //Disable gene average subtraction!
00071             //v[t] = v[t] - vcd->GetGeneAverage(t);
00072             v2[i][t] = v[t];
00073             //fprintf(stderr, "%.2f\n", v2[i][t]);
00074         }
00075         free(v);
00076     }   
00077     return v2;
00078 }
00079 
00080 float** ReadAllGenes(const vector<string> &vecstrGenes, 
00081     const vector<utype> &veciallGenes, CSeekIntIntMap *gmap, 
00082     CSeekDataset *vcd, CDataPair &Dat){
00083 
00084     float** v2 = CSeekTools::Init2DArray(vecstrGenes.size(), 
00085         vecstrGenes.size(), CMeta::GetNaN());
00086     size_t i, j;
00087     size_t ss = Dat.GetGenes();
00088 
00089     map<string, utype> mapstrintGenes;
00090     for(i=0; i<vecstrGenes.size(); i++)
00091         mapstrintGenes[vecstrGenes[i]] = i;
00092 
00093 
00094     //fprintf(stderr, "thread limit is %d\n", omp_get_max_threads());
00095 
00096     //getchar();
00097     #pragma omp parallel for \
00098     private(i, j) \
00099     shared(Dat, gmap, mapstrintGenes, vcd, v2) \
00100     firstprivate(ss) \
00101     schedule(dynamic)
00102     for(i=0; i<ss; i++){
00103         utype tid = omp_get_thread_num();
00104         string s = Dat.GetGene(i);
00105         utype ii = mapstrintGenes[s];
00106         if(CSeekTools::IsNaN(gmap->GetForward(ii))) continue;
00107         float *v = Dat.GetFullRow(i);
00108 
00109         for(j=0; j<ss; j++){
00110             string t = Dat.GetGene(j);
00111             utype jj = mapstrintGenes[t];
00112             if(CSeekTools::IsNaN(gmap->GetForward(jj))) continue;
00113             v[j] = v[j] - vcd->GetGeneAverage(jj);
00114             v2[ii][jj] = v[j];
00115             //fprintf(stderr, "%.2f\n", v2[s][t]);
00116         }
00117         //fprintf(stderr, "Done row %d", i);
00118         //getchar();
00119         free(v);
00120         //fprintf(stderr, "TID %d Gene %d is done.\n", tid, i);
00121         //fprintf(stderr, "Gene %d / %d is done.\n", i, ss);
00122         //fprintf(stderr, "Done freeing row %d", i); getchar();
00123     }   
00124     return v2;
00125 }
00126 
00127 bool GetRandomGenes(gsl_rng *r, int size, vector<string> &struserGenes, 
00128     vector<utype> &veciGenes, const vector<string> &vecstrGenes,
00129     const vector<utype> &veciallGenes, CSeekIntIntMap *gmap){
00130 
00131     struserGenes.clear();
00132     veciGenes.clear();
00133     struserGenes.resize(size);
00134     veciGenes.resize(size);
00135 
00136     utype i;
00137     int totSize = 0;
00138 
00139     for(i=0; i<veciallGenes.size(); i++){
00140         utype t = veciallGenes[i];
00141         if(CSeekTools::IsNaN(t)) continue;
00142         if(CSeekTools::IsNaN(gmap->GetForward(t))) continue;
00143         totSize++;
00144     }
00145 
00146     int *gsample = (int*)malloc(size*sizeof(int));      
00147     int *gs = (int*)malloc(totSize*sizeof(int));
00148     utype ti = 0;
00149 
00150     for(i=0; i<veciallGenes.size(); i++){
00151         utype t = veciallGenes[i];
00152         if(CSeekTools::IsNaN(t)) continue;
00153         if(CSeekTools::IsNaN(gmap->GetForward(t))) continue;
00154         gs[ti] = i;
00155         ti++;
00156     }
00157 
00158     gsl_ran_choose(r, gsample, size, gs, totSize, sizeof(int));
00159 
00160     for(i=0; i<size; i++){
00161         veciGenes[i] = veciallGenes[gsample[i]];
00162         struserGenes[i] = vecstrGenes[gsample[i]];
00163     }
00164 
00165     free(gsample);
00166     free(gs);
00167     return true;
00168 }
00169 
00170 bool GetMeanStdev(const vector<float> &f, float &mean, float &stdev){
00171     size_t i, j;
00172     mean = 0;
00173     stdev = 0;
00174     for(i=0; i<f.size(); i++){
00175         mean+=f[i];
00176     }
00177     mean/=f.size();
00178     for(i=0; i<f.size(); i++){
00179         stdev+=(f[i] - mean) * (f[i] - mean);
00180     }
00181     stdev/=f.size();
00182     stdev=sqrt(stdev);
00183     return true;
00184 }
00185 
00186 bool CalculateWelch(const float &mean1, const float &stdev1, const int &size1, 
00187     const float &mean2, const float &stdev2, const int &size2, 
00188     double &pvalt, double &t){
00189 
00190     double v1 = (double) stdev1 * (double) stdev1;
00191     double v2 = (double) stdev2 * (double) stdev2;
00192 
00193     int n1 = size1;
00194     int n2 = size2;
00195     t = (mean1 - mean2) /sqrt(v1/n1 + v2/n2);
00196     double x1 = v1/n1;
00197     double x2 = v2/n2;
00198     double i1 = x1 + x2;
00199     double df = i1*i1 / (x1*x1/(n1-1) + x2*x2/(n2-1));
00200     if(t>0){
00201         //fprintf(stderr, "t:%.2f df:%.2f n1:%d n2:%d mean1:%.2f mean2:%.2f v1:%.2f v2:%.2f\n", 
00202         //  t, df, n1, n2, mean1, mean2, v1, v2);
00203         double pval1 = gsl_cdf_tdist_Q(t, df);
00204         double pval2 = gsl_cdf_tdist_P(-1.0*t, df);
00205         pvalt = pval1 + pval2;
00206 
00207     }else{
00208         //fprintf(stderr, "t:%.2f df:%.2f n1:%d n2:%d mean1:%.2f mean2:%.2f v1:%.2f v2:%.2f\n", 
00209         //  t, df, n1, n2, mean1, mean2, v1, v2);
00210         double pval1 = gsl_cdf_tdist_Q(-1.0*t, df);
00211         double pval2 = gsl_cdf_tdist_P(t, df);
00212         pvalt = pval1 + pval2;
00213     }
00214 
00215     return true;
00216 }
00217 
00218 vector<string> Do_Pair_Proportion(){
00219 
00220 
00221 }
00222 
00223 
00224 vector<string> Do_T_Test(
00225     gsl_rng *rnd,
00226     float **vall, 
00227     const vector<utype> &veciGenes, //process
00228     const vector<string> &vecstrProcess, //process
00229 
00230     const vector<utype> &veciallGenes, //background
00231     const vector<string> &vecstrGenes, //background
00232     CSeekIntIntMap *gmap
00233 ){
00234 
00235     unsigned int seed = gsl_rng_get(rnd);
00236     const gsl_rng_type *T;
00237     gsl_rng *r;
00238     gsl_rng_env_setup();
00239     T = gsl_rng_default;
00240     r = gsl_rng_alloc(T);
00241     gsl_rng_set(r, seed);
00242 
00243     size_t i, j, k;
00244 
00245     vector< vector<utype> > vi;
00246     vector< vector<string> > si;
00247     vi.resize(100);
00248     si.resize(100);
00249     int size = veciGenes.size();
00250     vector<string> outstr;
00251 
00252     for(i=0; i<100; i++){
00253         vi[i] = vector<utype>();
00254         si[i] = vector<string>();
00255         GetRandomGenes(r, size, si[i], vi[i], vecstrGenes, veciallGenes, gmap);
00256     }
00257 
00258     float user_mean, user_stdev;
00259     vector<float> rand_mean, rand_stdev;
00260     rand_mean.resize(100);
00261     rand_stdev.resize(100);
00262 
00263     for(i=0; i<100; i++){
00264         vector<float> vv; //preparation
00265         for(j=0; j<veciGenes.size(); j++){
00266             for(k=0; k<veciGenes.size(); k++){
00267                 if(CMeta::IsNaN(vall[vi[i][j]][vi[i][k]])) continue;
00268                 vv.push_back(vall[vi[i][j]][vi[i][k]]);
00269             }
00270         }
00271         GetMeanStdev(vv, rand_mean[i], rand_stdev[i]);
00272         //fprintf(stderr, "Random %.2f %.2f\n", rand_mean[i], rand_stdev[i]);
00273     }
00274 
00275     vector<float> vf; //preparation
00276     for(j=0; j<veciGenes.size(); j++){
00277         for(k=0; k<veciGenes.size(); k++){
00278             if(CMeta::IsNaN(vall[veciGenes[j]][veciGenes[k]])) continue;
00279             vf.push_back(vall[veciGenes[j]][veciGenes[k]]);
00280         }
00281     }
00282     GetMeanStdev(vf, user_mean, user_stdev);
00283 
00284     ostringstream ost;
00285     ost.setf(ios::fixed);
00286     ost << "User " << setprecision (2) << user_mean << " " << setprecision (2) << user_stdev;
00287     outstr.push_back(ost.str());
00288     
00289     for(i=0; i<100; i++){
00290         double pvalt, t;
00291         CalculateWelch(rand_mean[i], rand_stdev[i], veciGenes.size(), 
00292             user_mean, user_stdev, veciGenes.size(), pvalt, t);
00293         //printf("%.2f %.2f %.3E\n", (float) user_mean - rand_mean[i], t, pvalt);
00294         ostringstream ost2;
00295         ost2.setf(ios::fixed);
00296         ost2 << setprecision(2) << (float) user_mean - rand_mean[i] << " " << setprecision(2) << t << " " << setprecision(2) << scientific << pvalt;
00297         outstr.push_back(ost2.str());
00298     }
00299     return outstr;
00300 }
00301 
00302 bool Get_Mann_Whitney_U_Statistics(const vector<float> &v1, const vector<float> &v2,
00303     double &U1, double &U2, double &z, double &auc){
00304 
00305     size_t i, j, k;
00306     vector<AResultFloat> vv; //preparation
00307     int s1 = v1.size();
00308     int s2 = v2.size();
00309     vv.resize(v1.size() + v2.size());
00310     size_t kk = 0;
00311     for(i=0; i<v1.size(); i++){
00312         vv[kk].i = 0;
00313         vv[kk].f = v1[i];
00314         kk++;
00315     }
00316     for(i=0; i<v2.size(); i++){
00317         vv[kk].i = 1;
00318         vv[kk].f = v2[i];
00319         kk++;
00320     }
00321 
00322     sort(vv.begin(), vv.end());
00323 
00324     U1 = 0;
00325     U2 = 0; 
00326     for(j=0; j<vv.size(); j++){
00327         if(vv[j].i==0){
00328             U1+=j+1.0;
00329         }else{
00330             U2+=j+1.0;
00331         }
00332     }
00333     U1-=(s1*(s1+1.0)/2.0);
00334     U2-=(s2*(s2+1.0)/2.0);
00335 
00336     //normal approximation      
00337     double mu = s1*s2 / 2.0;
00338     double sigma_u = sqrt((s1*s2)*(s1+s2+1.0)/12.0);
00339     z = (U1 - mu) / sigma_u;
00340     auc = U1/(s1*s2);
00341     return true;
00342 }
00343 
00344 vector<string> Do_Mann_Whitney_U_Test(
00345     gsl_rng *rnd,
00346     float **vall, 
00347     const vector<utype> &veciGenes, //process
00348     const vector<string> &vecstrProcess, //process
00349 
00350     const vector<utype> &veciallGenes, //background
00351     const vector<string> &vecstrGenes, //background
00352     CSeekIntIntMap *gmap
00353 ){
00354 
00355     unsigned int seed = gsl_rng_get(rnd);
00356     vector<string> outstr;
00357 
00358     const gsl_rng_type *T;
00359     gsl_rng *r;
00360     gsl_rng_env_setup();
00361     T = gsl_rng_default;
00362     r = gsl_rng_alloc(T);
00363     gsl_rng_set(r, seed);
00364 
00365     size_t i, j, k;
00366 
00367     vector< vector<utype> > vi;
00368     vector< vector<string> > si;
00369     vi.resize(100);
00370     si.resize(100);
00371     int size = veciGenes.size();
00372 
00373     for(i=0; i<100; i++){
00374         vi[i] = vector<utype>();
00375         si[i] = vector<string>();
00376         GetRandomGenes(r, size, si[i], vi[i], vecstrGenes, veciallGenes, gmap);
00377     }
00378 
00379     vector<double> U1, U2;
00380     vector<float> z;
00381     vector<int> s1, s2;
00382     vector<float> auc;
00383 
00384     U1.resize(100); U2.resize(100); z.resize(100); 
00385     s1.resize(100); s2.resize(100); auc.resize(100);
00386 
00387     for(i=0; i<100; i++){
00388         vector<AResultFloat> vv; //preparation
00389         s1[i] = 0;
00390         s2[i] = 0;
00391         for(j=0; j<veciGenes.size(); j++){
00392             for(k=0; k<veciGenes.size(); k++){
00393                 if(CMeta::IsNaN(vall[vi[i][j]][vi[i][k]])) continue;
00394                 s1[i]++;
00395             }
00396         }
00397         for(j=0; j<veciGenes.size(); j++){
00398             for(k=0; k<veciGenes.size(); k++){
00399                 if(CMeta::IsNaN(vall[veciGenes[j]][veciGenes[k]])) continue;
00400                 s2[i]++;
00401             }
00402         }
00403         vv.resize(s1[i]+s2[i]);
00404 
00405         int kk = 0;
00406         for(j=0; j<veciGenes.size(); j++){
00407             for(k=0; k<veciGenes.size(); k++){
00408                 if(CMeta::IsNaN(vall[vi[i][j]][vi[i][k]])) continue;
00409                 vv[kk].f = vall[vi[i][j]][vi[i][k]];
00410                 vv[kk].i = 0;
00411                 kk++;
00412             }
00413         }
00414         for(j=0; j<veciGenes.size(); j++){
00415             for(k=0; k<veciGenes.size(); k++){
00416                 if(CMeta::IsNaN(vall[veciGenes[j]][veciGenes[k]])) continue;
00417                 vv[kk].f = vall[veciGenes[j]][veciGenes[k]];
00418                 vv[kk].i = 1;
00419                 kk++;
00420             }
00421         }
00422         sort(vv.begin(), vv.end());
00423     
00424         U1[i] = 0;
00425         U2[i] = 0;
00426         for(j=0; j<vv.size(); j++){
00427             if(vv[j].i==0){
00428                 U1[i]+=j+1.0;
00429             }else{
00430                 U2[i]+=j+1.0;
00431             }
00432         }
00433         U1[i]-=(s1[i]*(s1[i]+1.0)/2.0);
00434         U2[i]-=(s2[i]*(s2[i]+1.0)/2.0);
00435     
00436         /*double U = 0;
00437         if(U1<U2){
00438             U = U1;
00439         }else{
00440             U = U2;
00441         }*/
00442 
00443         //normal approximation      
00444         double mu = s1[i]*s2[i] / 2.0;
00445         double sigma_u = sqrt((s1[i]*s2[i])*(s1[i]+s2[i]+1.0)/12.0);
00446         z[i] = (U1[i] - mu) / sigma_u;
00447         auc[i] = U1[i]/(s1[i]*s2[i]);
00448         //printf("%d %.2f %.2f %.2f %.2f\n", s1, (float) U1, (float) z, (float) U1/(s1*s2), (float) U2/(s1*s2));
00449     }
00450     
00451     for(i=0; i<100; i++){
00452         ostringstream ost;
00453         ost.setf(ios::fixed);
00454         ost << s1[i] << " " << U1[i] << " " << setprecision (2) << z[i] << " " << setprecision (2) << auc[i];
00455         outstr.push_back(ost.str());
00456     }
00457 
00458     return outstr;
00459 }
00460 
00461 vector<string> Do_Mann_Whitney_U_Test_By_Gene(gsl_rng *rnd, float **vall,
00462     const vector<string> &vecstrProcess,
00463     const vector<utype> &veciProcess,
00464     const vector<string> &vecstrGenes,
00465     const vector<utype> &veciallGenes,
00466     CSeekIntIntMap *gmap
00467 ){
00468     unsigned int seed = gsl_rng_get(rnd);
00469 
00470     vector<string> outstr;
00471     size_t i, j, jj, k;
00472     int size = veciProcess.size();
00473     int trials = 1;
00474     int perGeneTrials = 11;
00475 
00476     const gsl_rng_type *T;
00477     gsl_rng *r;
00478     gsl_rng_env_setup();
00479     T = gsl_rng_default;
00480     r = gsl_rng_alloc(T);
00481     gsl_rng_set(r, seed);
00482 
00483     for(i=0; i<trials; i++){
00484         vector<utype> veciRandom;
00485         vector<string> vecstrRandom;
00486         GetRandomGenes(r, size, vecstrRandom, veciRandom, vecstrGenes, veciallGenes, gmap);
00487 
00488         outstr.push_back("Process");
00489 
00490         //Process
00491         for(j=0; j<size; j++){
00492             utype thisGene = veciProcess[j];
00493             vector<float> vProcess;
00494             for(k=0; k<size; k++){
00495                 float x = vall[thisGene][veciProcess[k]];
00496                 if(CMeta::IsNaN(x)) continue;
00497                 vProcess.push_back(x);
00498             }
00499 
00500             ostringstream ost; 
00501             ost << "Gene " << vecstrProcess[j] << " size " << vProcess.size();
00502             outstr.push_back(ost.str());
00503             //fprintf(stdout, "Gene %s size %d\n", vecstrProcess[j].c_str(), vProcess.size());
00504 
00505             vector<AResultFloat> ar;
00506             vector<double> U1, U2, z, auc;
00507 
00508             ar.resize(perGeneTrials);
00509             U1.resize(perGeneTrials); 
00510             U2.resize(perGeneTrials); 
00511             z.resize(perGeneTrials); 
00512             auc.resize(perGeneTrials);
00513 
00514             for(jj=0; jj<perGeneTrials; jj++){
00515                 vector<utype> veciRandomGene;
00516                 vector<string> vecstrRandomGene;
00517                 GetRandomGenes(r, size, vecstrRandomGene, veciRandomGene, vecstrGenes, veciallGenes, gmap);
00518 
00519                 vector<float> vRandom;
00520                 for(k=0; k<size; k++){
00521                     float x = vall[thisGene][veciRandomGene[k]];
00522                     if(CMeta::IsNaN(x)) continue;
00523                     vRandom.push_back(x);
00524                 }
00525                 Get_Mann_Whitney_U_Statistics(vProcess, vRandom, U1[jj], U2[jj], z[jj], auc[jj]);
00526 
00527                 ar[jj].i = jj;
00528                 ar[jj].f = auc[jj];
00529 
00530                 //fprintf(stdout, "%d %d %.2f %.2f\n", (int) U1, (int) U2, z, auc);
00531             }
00532 
00533             sort(ar.begin(), ar.end());
00534             int med = perGeneTrials/2;
00535             int ar_med = ar[med].i;
00536             //Show the median
00537             ostringstream ost2;
00538             ost2.setf(ios::fixed);
00539             ost2 << (int) U1[ar_med] << " " << (int) U2[ar_med] << " " << setprecision(2) << z[ar_med] << " " << setprecision(2) << auc[ar_med];
00540             outstr.push_back(ost2.str());
00541             //fprintf(stdout, "%d %d %.2f %.2f\n", (int) U1[ar_med], (int) U2[ar_med], z[ar_med], auc[ar_med]);
00542 
00543         }
00544 
00545         outstr.push_back("Random");
00546         //fprintf(stdout, "Random\n");
00547         //If process genes were just randomly selected geneset
00548         for(j=0; j<size; j++){
00549             utype thisGene = veciRandom[j];
00550             vector<float> vRan;
00551             for(k=0; k<size; k++){
00552                 float x = vall[thisGene][veciRandom[k]];
00553                 if(CMeta::IsNaN(x)) continue;
00554                 vRan.push_back(x);
00555             }
00556 
00557             ostringstream ost; 
00558             ost << "Gene number " << j << " size " << vRan.size();
00559             outstr.push_back(ost.str());
00560 
00561             vector<AResultFloat> ar;
00562             vector<double> U1, U2, z, auc;
00563 
00564             ar.resize(perGeneTrials);
00565             U1.resize(perGeneTrials); 
00566             U2.resize(perGeneTrials); 
00567             z.resize(perGeneTrials); 
00568             auc.resize(perGeneTrials);
00569 
00570             for(jj=0; jj<perGeneTrials; jj++){
00571                 vector<utype> veciRandomGene;
00572                 vector<string> vecstrRandomGene;
00573                 GetRandomGenes(r, size, vecstrRandomGene, veciRandomGene, vecstrGenes, veciallGenes, gmap);
00574                 vector<float> vRandom;
00575                 for(k=0; k<size; k++){
00576                     float x = vall[thisGene][veciRandomGene[k]];
00577                     if(CMeta::IsNaN(x)) continue;
00578                     vRandom.push_back(x);
00579                 }
00580                 Get_Mann_Whitney_U_Statistics(vRan, vRandom, U1[jj], U2[jj], z[jj], auc[jj]);
00581 
00582                 //fprintf(stdout, "%d %d %.2f %.2f\n", (int) U1, (int) U2, z, auc);
00583                 ar[jj].i = jj;
00584                 ar[jj].f = auc[jj];
00585             }
00586 
00587             sort(ar.begin(), ar.end());
00588             int med = perGeneTrials/2;
00589             int ar_med = ar[med].i;
00590             //Show the median
00591             //fprintf(stdout, "%d %d %.2f %.2f\n", (int) U1[ar_med], (int) U2[ar_med], z[ar_med], auc[ar_med]);
00592             ostringstream ost2;
00593             ost2.setf(ios::fixed);
00594             ost2 << (int) U1[ar_med] << " " << (int) U2[ar_med] << " " << setprecision (2) << z[ar_med] << " " << setprecision (2) << auc[ar_med];
00595             outstr.push_back(ost2.str());
00596 
00597         }
00598 
00599 
00600     }
00601 
00602     return outstr;
00603 }
00604 
00605 vector<string> Do_One(const char *file, gsl_rng *rnd, CSeekDataset *vcd, float **vall,
00606     map<string, utype> &mapstrintGene, vector<string> &vecstrGenes){
00607 
00608     size_t i, j, k;
00609     vector<string> ostr;
00610 
00611     vector<string> vecstrUserGenes;
00612     CSeekStrIntMap mapTmp;
00613     if(!CSeekTools::ReadListOneColumn(file, vecstrUserGenes, mapTmp))
00614         return ostr;
00615 
00616     vector<utype> userGenes;
00617     for(i=0; i<vecstrUserGenes.size(); i++){
00618         userGenes.push_back(mapstrintGene[vecstrUserGenes[i]]);
00619     }
00620 
00621     CSeekIntIntMap *gmap = vcd->GetGeneMap();
00622 
00623     vector<string> struserGenes;
00624     for(i=0; i<userGenes.size(); i++){
00625         if(CSeekTools::IsNaN(gmap->GetForward(userGenes[i]))){
00626             ostringstream ost;
00627             ost << "Error: gene " << vecstrUserGenes[i] << " is not found in this dataset";
00628             ostr.push_back(ost.str());
00629             continue;
00630         }
00631         struserGenes.push_back(vecstrGenes[userGenes[i]]);
00632     }
00633 
00634     if(struserGenes.size()==0 || struserGenes.size()<5){
00635         return ostr;
00636     }
00637 
00638     vector<utype> veciGenes;
00639     veciGenes.clear(); //only user gene set
00640     veciGenes.resize(struserGenes.size());
00641     for( i = 0; i < struserGenes.size( ); ++i ){
00642         //veciGenes[ i ] = Dat.GetGene( struserGenes[i] );
00643         veciGenes[i] = mapstrintGene[struserGenes[i]];
00644     }
00645 
00646     vector<utype> veciallGenes;
00647     veciallGenes.clear(); //all genes
00648     veciallGenes.resize(vecstrGenes.size());
00649     for( i = 0; i < vecstrGenes.size( ); ++i ){
00650         //veciallGenes[ i ] = Dat.GetGene( vecstrGenes[i] );
00651         veciallGenes[i] = mapstrintGene[vecstrGenes[i]];
00652     }
00653 
00654     //float** v2 = LoadGenes(struserGenes, veciGenes, vecstrGenes, 
00655     //  veciallGenes, gmap, vcd, vall);
00656 
00657     vector<string> outstr = 
00658     //  Do_Mann_Whitney_U_Test_By_Gene(rnd, vall, struserGenes, veciGenes, vecstrGenes, veciallGenes, gmap);
00659     Do_T_Test(rnd, vall, veciGenes, struserGenes, veciallGenes, vecstrGenes, gmap);
00660     //Do_Mann_Whitney_U_Test(rnd, vall, veciGenes, struserGenes, veciallGenes, vecstrGenes, gmap);
00661 
00662 
00663     for(i=0; i<outstr.size(); i++){
00664         ostr.push_back(outstr[i]);
00665     }
00666     
00667     //Do_T_Test(vall, veciGenes, struserGenes, veciallGenes, vecstrGenes, gmap);
00668     //ostr = Do_Mann_Whitney_U_Test(vall, veciGenes, struserGenes, veciallGenes, vecstrGenes, gmap);
00669 
00670     /*
00671     int *a1 = (int*)malloc(2*sizeof(int));
00672     int *a2 = (int*)malloc(size*sizeof(int));
00673     for(i=0; i<size; i++){
00674         a2[i] = i;
00675     }
00676 
00677     fprintf(stderr, "Random to random\n");
00678     for(i=0; i<100; i++){
00679         gsl_ran_choose(rnd, a1, 2, a2, size, sizeof(int));
00680         double pvalt, t;
00681         CalculateWelch(rand_mean[a1[0]], rand_stdev[a1[0]], veciGenes.size(), 
00682             rand_mean[a1[1]], rand_stdev[a1[1]], veciGenes.size(), pvalt, t);
00683         fprintf(stderr, "%.2f %.3E\n", t, pvalt);
00684     }
00685 
00686     free(a1);
00687     free(a2);
00688     */
00689     //CSeekTools::Free2DArray(v2);
00690 
00691     //for(i=0; i<100; i++){
00692     //  CSeekTools::Free2DArray(randomGenes[i]);
00693     //}
00694     return ostr;
00695 }
00696 
00697 
00698 int main( int iArgs, char** aszArgs ) {
00699     static const size_t c_iBuffer   = 1024;
00700     gengetopt_args_info sArgs;
00701     ifstream            ifsm;
00702     istream*            pistm;
00703     vector<string>      vecstrLine, vecstrGenes, vecstrDatasets, vecstrQuery, vecstrUserDatasets;
00704     char                acBuffer[ c_iBuffer ];
00705     size_t              i, j, k;
00706 
00707 
00708     omp_set_num_threads(8);
00709     /*utype ii;
00710     omp_set_num_threads(8);
00711     #pragma omp parellel for \
00712     private(i) \
00713     schedule(static, 1)
00714     for(ii=0; ii<100; ii++){
00715         utype tid = omp_get_thread_num();
00716         fprintf(stderr, "Hello World %d\n", tid);
00717     }*/
00718 
00719 
00720     const gsl_rng_type *T;
00721     gsl_rng *rnd;
00722     gsl_rng_env_setup();
00723     T = gsl_rng_default;
00724     rnd = gsl_rng_alloc(T);
00725 
00726     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00727         cmdline_parser_print_help( );
00728         return 1; }
00729 
00730     vector<string> vecstrGeneID;
00731     map<string, utype> mapstrintGene;
00732     if(!CSeekTools::ReadListTwoColumns(sArgs.input_arg, vecstrGeneID, vecstrGenes))
00733         return 1;
00734     for(i=0; i<vecstrGenes.size(); i++)
00735         mapstrintGene[vecstrGenes[i]] = i;
00736 
00737     if(sArgs.db_flag==1){
00738         vector<float> quant;
00739         CSeekTools::ReadQuantFile(sArgs.quant_arg, quant);
00740 
00741         vector<string> vecstrDataset, vDP;
00742 
00743         if(!CSeekTools::ReadListTwoColumns(sArgs.dataset_list_arg, 
00744         vecstrDataset, vDP))
00745             return false;
00746 
00747         CDatabase *DB = new CDatabase(false);
00748         DB->Open(sArgs.db_dir_arg, vecstrGenes, vecstrDataset.size(), sArgs.db_num_arg);
00749 
00750         string strPrepInputDirectory = sArgs.prep_arg;
00751         string strSinfoInputDirectory = sArgs.sinfo_arg;
00752         vector<CSeekDataset*> vc;
00753         vc.resize(vecstrDataset.size());
00754         size_t i, j, k;
00755         for(i=0; i<vecstrDataset.size(); i++){
00756             vc[i] = new CSeekDataset();
00757             string strFileStem = vecstrDataset[i];
00758             string strAvgPath = strPrepInputDirectory+"/"+
00759                 strFileStem + ".gavg";
00760             string strPresencePath = strPrepInputDirectory+"/"+
00761                 strFileStem + ".gpres";
00762             string strSinfoPath = strSinfoInputDirectory+"/"+
00763                 strFileStem + ".sinfo";
00764             vc[i]->ReadGeneAverage(strAvgPath);
00765             vc[i]->ReadGenePresence(strPresencePath);
00766             vc[i]->ReadDatasetAverageStdev(strSinfoPath);
00767             vc[i]->InitializeGeneMap();
00768         }
00769     
00770         size_t iGenes = vecstrGenes.size();
00771         size_t iDatasets = vecstrDataset.size();
00772         
00773         vector<string> vecstrQuery;
00774         CSeekTools::ReadMultiGeneOneLine(sArgs.query_arg, vecstrQuery);
00775         //Need to load the query
00776 
00777         vector<char> cQuery;
00778         CSeekTools::InitVector(cQuery, iGenes, (char)0);
00779         for(i=0; i<vecstrQuery.size(); i++){
00780             if(mapstrintGene.find(vecstrQuery[i])==mapstrintGene.end()) continue;
00781             utype k = mapstrintGene.find(vecstrQuery[i])->second;
00782             cQuery[k] = 1;
00783         }
00784         vector<utype> allQ;
00785         for(i=0; i<cQuery.size(); i++)
00786             if(cQuery[i]==1)
00787                 allQ.push_back(i);
00788         allQ.resize(allQ.size());
00789 
00790         for(i=0; i<iDatasets; i++)
00791             if(vc[i]->GetDBMap()!=NULL)
00792                 vc[i]->DeleteQueryBlock();
00793     
00794         for(i=0; i<iDatasets; i++)
00795             vc[i]->InitializeQueryBlock(allQ);
00796 
00797         size_t m, d;
00798         for(i=0; i<allQ.size(); i++){
00799             m = allQ[i];
00800             vector<unsigned char> Qi;
00801             if(!DB->GetGene(m, Qi)){
00802                 cerr << "Gene does not exist" << endl;
00803                 continue;
00804             }
00805             utype db;
00806             CSeekIntIntMap *qu = NULL;
00807             unsigned char **r = NULL;
00808             for(j=0; j<vecstrDataset.size(); j++){
00809                 if((qu=vc[j]->GetDBMap())==NULL)
00810                     continue;
00811                 if(CSeekTools::IsNaN(db=(qu->GetForward(m)))) 
00812                     continue;
00813                 for(r = vc[j]->GetMatrix(), k=0; k<iGenes; k++)
00814                     r[db][k] = Qi[k*vecstrDataset.size()+j];
00815             }
00816             Qi.clear();
00817         }
00818 
00819         if(!!sArgs.count_pair_flag){
00820             map<float,vector<int> > countPairs;
00821             float point = 0.0;
00822             while(point<=5.0){
00823                 countPairs[point] = vector<int>();
00824                 CSeekTools::InitVector(countPairs[point], iDatasets, (int)0);
00825                 point+=0.25;
00826             }
00827             for(k=0; k<iDatasets; k++){
00828                 CSeekIntIntMap *mapQ = vc[k]->GetDBMap();
00829                 CSeekIntIntMap *mapG = vc[k]->GetGeneMap();
00830                 if(mapQ==NULL) continue;
00831                 unsigned char **f = vc[k]->GetMatrix();
00832                 size_t qi, qj;
00833                 for(qi=0; qi<allQ.size(); qi++){
00834                     utype gene_qi = allQ[qi];
00835                     utype iQ = mapQ->GetForward(gene_qi);
00836                     if(CSeekTools::IsNaN(iQ)) continue;
00837                     for(qj=qi+1; qj<allQ.size(); qj++){
00838                         utype gene_qj = allQ[qj];
00839                         utype jQ = mapG->GetForward(gene_qj);
00840                         if(CSeekTools::IsNaN(jQ)) continue;
00841                         unsigned char uc = f[iQ][gene_qj];
00842                         if(uc==255) continue;
00843                         float vv = quant[uc];
00844                         point = 0.0;
00845                         while(point<=5.0){
00846                             if(vv>point)
00847                                 countPairs[point][k]++;
00848                             point+=0.25;
00849                         }
00850                     }
00851                 }
00852             }
00853             for(i=0; i<iDatasets; i++)
00854                 vc[i]->DeleteQueryBlock();
00855             point = 0.0;            
00856             while(point<=5.0){
00857                 sort(countPairs[point].begin(), countPairs[point].end(), greater<int>());
00858                 float tmp = 0;
00859                 for(i=0; i<10; i++)
00860                     tmp+=(float)countPairs[point][i];
00861                 tmp/=10.0;
00862                 fprintf(stderr, "%.2f\t%.1f pairs\n", point, tmp);
00863                 point+=0.25;
00864             }
00865         }
00866 
00867         if(!!sArgs.histogram_flag){
00868             srand(unsigned(time(0)));
00869             vector<int> dID;
00870             for(k=0; k<iDatasets; k++)
00871                 dID.push_back(k);
00872             random_shuffle(dID.begin(), dID.end());
00873             utype kk;
00874             for(kk=0; kk<100; kk++){
00875                 k = dID[kk];
00876                 CSeekIntIntMap *mapQ = vc[k]->GetDBMap();
00877                 CSeekIntIntMap *mapG = vc[k]->GetGeneMap();
00878                 if(mapQ==NULL) continue;
00879                 unsigned char **f = vc[k]->GetMatrix();
00880                 size_t qi, ii;
00881                 for(qi=0; qi<allQ.size(); qi++){
00882                     utype gene_qi = allQ[qi];
00883                     utype iQ = mapQ->GetForward(gene_qi);
00884                     if(CSeekTools::IsNaN(iQ)) continue;
00885                     vector<float> z_score;
00886                     const vector<utype> &allRGenes = mapG->GetAllReverse();
00887                     for(ii=0; ii<mapG->GetNumSet(); ii++){
00888                         size_t ij = allRGenes[ii];
00889                         float a = vc[k]->GetGeneAverage(ij);
00890                         unsigned char x = f[iQ][ij];
00891                         if(x==255) continue;
00892                         //float xnew = quant[x] - a;
00893                         float xnew = quant[x];
00894                         z_score.push_back(xnew);
00895                     }
00896                     sort(z_score.begin(), z_score.end());
00897                     float pts[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9};
00898                     fprintf(stderr, "Query %s\tDataset %d\t", vecstrGenes[gene_qi].c_str(), k);
00899                     for(ii=0; ii<9; ii++)
00900                         fprintf(stderr, "%.2f\t", z_score[(int)(pts[ii]*z_score.size())]);
00901                     fprintf(stderr, "\n");
00902                 }
00903             }
00904             return 0;
00905         }
00906 
00907     }
00908         
00909     CSeekStrIntMap mapTmp;
00910     vector<string> vecstrList;
00911     if(!CSeekTools::ReadListOneColumn(sArgs.gene_set_list_arg, vecstrList, mapTmp))
00912         return 1;
00913 
00914     if(sArgs.dab_flag==1){
00915         CSeekDataset *vcd = new CSeekDataset();
00916 
00917         string strAvg = sArgs.gavg_input_arg;
00918         string strPres = sArgs.gpres_input_arg;
00919         vcd->ReadGeneAverage(strAvg);
00920         vcd->ReadGenePresence(strPres);
00921         vcd->InitializeGeneMap();
00922 
00923         CDataPair Dat;
00924         if(!Dat.Open(sArgs.dabinput_arg, false, false)){
00925             cerr << "Error opening dab file" << endl;
00926             return 1;
00927         }
00928         //fprintf(stderr, "Read.\n");
00929         //getchar();
00930     
00931         CSeekIntIntMap *gmap = vcd->GetGeneMap();
00932         vector<utype> veciallGenes;
00933         veciallGenes.clear(); //all genes
00934         veciallGenes.resize(vecstrGenes.size());
00935         for( i = 0; i < vecstrGenes.size( ); ++i ){
00936             veciallGenes[i] = mapstrintGene[vecstrGenes[i]];
00937             //fprintf(stderr, "Gene %s is id %d %d\n", vecstrGenes[i].c_str(), i, Dat.GetGene(vecstrGenes[i]));
00938         }
00939         //getchar();
00940 
00941         float** vall = ReadAllGenes(vecstrGenes, veciallGenes, gmap, vcd, Dat);
00942         //fprintf(stderr, "Finished loading all\n");
00943 
00944         vector< vector<string> > vx;
00945         vx.resize(vecstrList.size());
00946 
00947         //getchar();    
00948         #pragma omp parallel for \
00949         private(i) \
00950         shared(vx, vcd, vall, mapstrintGene, vecstrGenes, vecstrList) \
00951         schedule(dynamic)
00952         for(i=0; i<vecstrList.size(); i++){
00953             fprintf(stderr, "Doing: %s\n", vecstrList[i].c_str());
00954             vx[i] = Do_One(vecstrList[i].c_str(), rnd, vcd, vall, mapstrintGene, vecstrGenes);
00955         }
00956 
00957         for(i=0; i<vecstrList.size(); i++){
00958             printf("File: %s\n", vecstrList[i].c_str());
00959             for(j=0; j<vx[i].size(); j++){
00960                 cout << vx[i][j] << endl;
00961             }
00962         }
00963             
00964 
00965     
00966 
00967     }
00968 
00969         
00970 //#ifdef WIN32
00971 //  pthread_win32_process_detach_np( );
00972 //#endif // WIN32
00973     return 0; 
00974 
00975 }