Sleipnir
tools/SeekGeneRecommender/SeekGeneRecommender.cpp
00001 #include "stdafx.h"
00002 #include "cmdline.h"
00003 
00004 struct bresult{
00005     int i;
00006     int j;
00007     float f;
00008     float f2;
00009 };
00010 
00011 struct ascend{
00012     bool operator()(const bresult& lx, const bresult& rx) const{
00013         return lx.f < rx.f;
00014     }
00015 };
00016 
00017 struct descend{
00018     bool operator()(const bresult& lx, const bresult& rx) const{
00019         return lx.f > rx.f;
00020     }
00021 };
00022 
00023 bool rank_transform(vector< vector< vector<float> > > &mat, 
00024     vector<CSeekIntIntMap*> &dm, vector<char> &absent_g, 
00025     int numGenes){
00026 
00027     int d, dd, i, j, k;
00028     int numDatasets = mat.size();
00029     absent_g.clear();
00030     absent_g.resize(numGenes);
00031     
00032     for(i=0; i<numGenes; i++){
00033         int tot = 0;
00034         if(i%500==0 || i==numGenes-1) 
00035             fprintf(stderr, "Gene %d\n", i);
00036         for(d=0; d<numDatasets; d++){
00037             CSeekIntIntMap *mapG = dm[d];
00038             if(CSeekTools::IsNaN(mapG->GetForward(i))) continue;
00039             tot+=mat[d][0].size(); //mat[d][0].size() is the number of conditions
00040         }
00041         if(tot==0){
00042             absent_g[i] = 1;
00043             continue;
00044         }
00045         
00046         absent_g[i] = 0;
00047         vector<bresult> a;
00048         a.resize(tot);
00049         
00050         k=0; 
00051         for(d=0; d<numDatasets; d++){
00052             if(CSeekTools::IsNaN(dm[d]->GetForward(i))) continue;
00053             for(j=0; j<mat[d][0].size(); j++){
00054                 a[k].i = d;
00055                 a[k].j = j;
00056                 a[k].f = mat[d][dm[d]->GetForward(i)][j];
00057                 k++;
00058             }
00059         }
00060         
00061         sort(a.begin(), a.end(), ascend());
00062         float b1 = (float) (1.0 - (tot+1.0) / 2.0) / (float) ((tot+1.0)/2.0);
00063         float b2 = (float) (tot-1.0-(tot+1.0)/2.0) / (float)((tot+1.0)/2.0);
00064         
00065         for(j=0; j<tot; j++){
00066             int id1 = a[j].i;
00067             int id2 = a[j].j;
00068             int p = tot+1;
00069             mat[id1][dm[id1]->GetForward(i)][id2] = 
00070                 ((float)(j+1) - (float) p/2.0) / ((float)p/2.0);
00071         }
00072         a.clear();  
00073     }
00074     return true;    
00075 }
00076 
00077 bool weight_experiment(
00078     vector<vector<vector<float> > > &mat, vector<CSeekIntIntMap*> &dm, 
00079     int &numExperiments, float cut_off_percentage,
00080     vector<utype> &q, vector<bresult> &a){
00081 
00082     int d, dd, j, qi;
00083     numExperiments = 0;
00084     int numDatasets = mat.size();
00085     for(d=0; d<numDatasets; d++){
00086         //for(j=0; j<mat[d][0].size(); j++){
00087             int q_size = 0;
00088             for(qi=0; qi<q.size(); qi++){
00089                 int g = q[qi];
00090                 if(CSeekTools::IsNaN(dm[d]->GetForward(g))) continue;
00091                 q_size++;
00092             }
00093             int pq_size = (int)(0.5*q.size());
00094             if(q_size<pq_size || q_size<2){
00095                 continue;
00096             }
00097             numExperiments+=mat[d][0].size();
00098         //}
00099     }
00100     
00101     a.clear();
00102     a.resize(numExperiments);
00103     int numThreads=8;
00104     omp_set_num_threads(numThreads);
00105     
00106     int k=0; 
00107     /*
00108     vector<int> array_i, array_j;
00109     vector<float> array_f, array_f2;
00110     */
00111 
00112     vector< vector<int> > array_i, array_j;
00113     vector< vector<float> > array_f, array_f2;
00114     array_i.resize(numThreads);
00115     array_j.resize(numThreads);
00116     array_f.resize(numThreads);
00117     array_f2.resize(numThreads);
00118     
00119     for(j=0; j<numThreads; j++){
00120         array_i[j] = vector<int>();
00121         array_j[j] = vector<int>();
00122         array_f[j] = vector<float>();
00123         array_f2[j] = vector<float>();
00124     }
00125     
00126     #pragma omp parallel for \
00127     shared(numExperiments, mat, q, dm, a, array_i, array_j, array_f, array_f2) \
00128     private(d, j, qi) \
00129     firstprivate(numDatasets) schedule(dynamic)
00130     for(d=0; d<numDatasets; d++){
00131         utype tid = omp_get_thread_num();
00132         for(j=0; j<mat[d][0].size(); j++){
00133             float mean_query = 0;
00134             float sample_variance = 0;
00135             int q_size = 0;
00136             float significance = 0;
00137             for(qi=0; qi<q.size(); qi++){
00138                 int g= q[qi];
00139                 if(CSeekTools::IsNaN(dm[d]->GetForward(g))) continue;
00140                 q_size++;
00141                 mean_query+=mat[d][dm[d]->GetForward(g)][j];
00142             }
00143             int pq_size = (int)(0.5*q.size());
00144             if(q_size<pq_size || q_size<2) continue;
00145             
00146             mean_query/=(float)q_size;
00147             for(qi=0; qi<q.size(); qi++){
00148                 int g=q[qi];
00149                 if(CSeekTools::IsNaN(dm[d]->GetForward(g))) continue;
00150                 float diff = mat[d][dm[d]->GetForward(g)][j] - mean_query;
00151                 sample_variance+=diff*diff;
00152             }
00153             sample_variance/=(float)q_size;
00154             significance = sqrt((float)q_size)*mean_query/sqrt(sample_variance + 1.0 /
00155                 (3*numExperiments*numExperiments));
00156             
00157             array_i[tid].push_back(d);
00158             array_j[tid].push_back(j);
00159             array_f[tid].push_back(significance);
00160             array_f2[tid].push_back(mean_query);
00161             
00162             /*array_i.push_back(d);
00163             array_j.push_back(j);
00164             array_f.push_back(significance);
00165             array_f2.push_back(mean_query);*/
00166             /*a[k].i = d;
00167             a[k].j = j;
00168             a[k].f = sqrt((float)q_size)*mean_query/sqrt(sample_variance + 1.0 /
00169                 (3*numExperiments*numExperiments));
00170             a[k].f2 = mean_query;
00171             k++;*/
00172         }
00173     }
00174 
00175     /*
00176     for(d=0; d<array_i.size(); d++){
00177         a[k].i = array_i[d];
00178         a[k].j = array_j[d];
00179         a[k].f = array_f[d];
00180         a[k].f2 = array_f2[d];
00181         k++;
00182     }
00183     */
00184     
00185     
00186     for(j=0; j<numThreads; j++){
00187         for(d=0; d<array_i[j].size(); d++){
00188             a[k].i = array_i[j][d];
00189             a[k].j = array_j[j][d];
00190             a[k].f = array_f[j][d];
00191             a[k].f2 = array_f2[j][d];
00192             k++;
00193         }
00194     }
00195     
00196     //int nth=(int) ((float) numExperiments * cut_off_percentage);
00197     //std::nth_element(a.begin(), a.begin()+nth, a.end(), descend());
00198     sort(a.begin(), a.end(), descend());
00199     return true;
00200 }
00201 
00202 bool gene_scoring(
00203     int numGenes,
00204     vector<vector<vector<float> > > &mat, vector<CSeekIntIntMap*> &dm, 
00205     vector<bresult> &a, int numExperiments, vector<char> &absent_g, 
00206     float cut_off_percentage, vector<float> &gs){
00207 
00208     int i, ii, d, k;
00209     CSeekTools::InitVector(gs, numGenes, (float) -320);
00210     vector<int> tot;
00211     CSeekTools::InitVector(tot, numGenes, (int) 0);
00212     
00213     int numDatasets = mat.size();
00214     CSeekIntIntMap mapG(numGenes);
00215     
00216     for(i=0; i<numGenes; i++){
00217         if(absent_g[i]==1) continue;
00218         mapG.Add(i);
00219         for(d=0; d<numDatasets; d++){
00220             if(CSeekTools::IsNaN(dm[d]->GetForward(i))) continue;
00221             tot[i]+=mat[d][0].size(); //numConditions in a dataset
00222         }
00223     }
00224     
00225     int numThreads=8;
00226     omp_set_num_threads(numThreads);
00227     
00228     const vector<utype> &allGenes = mapG.GetAllReverse();
00229     float cutoff = cut_off_percentage;
00230     int numExp = numExperiments;    
00231 
00232     if(numExp==0){
00233         return true;
00234     }
00235 
00236     #pragma omp parallel for \
00237     shared(allGenes, absent_g, a, tot, mat, dm, gs) \
00238     private(i, ii, k) \
00239     firstprivate(cutoff, numExp) schedule(dynamic)  
00240     //fprintf(stderr, "Number of experiments: %d\n", numExp);
00241     for(ii=0; ii<mapG.GetNumSet(); ii++){
00242         i = allGenes[ii];
00243         int num_valid_experiments = 0;
00244         float sc = 0;
00245         //fprintf(stderr, "Gene %d, %d of %d\n", i, ii, mapG.GetNumSet());
00246         for(k=0; k<tot[i]; k++){
00247             if(k+1 > (int) (cutoff*(float) numExp)){
00248                 break;
00249             }
00250             int id1 = a[k].i;
00251             int id2 = a[k].j;
00252             
00253             if(CSeekTools::IsNaN(dm[id1]->GetForward(i))) continue;
00254             sc+=mat[id1][dm[id1]->GetForward(i)][id2] * a[k].f2;
00255             num_valid_experiments++;
00256         }
00257         //fprintf(stderr, "Valid experiments %d\n", num_valid_experiments);
00258 
00259         int valid_cutoff = (int) (cutoff / 2.0 * (float) numExp);
00260         if(num_valid_experiments<valid_cutoff){
00261             continue;
00262         }
00263         gs[i] = sc / (float)num_valid_experiments;
00264     }
00265 
00266     return true;
00267 }
00268 
00269 int main(int iArgs, char **aszArgs){
00270     static const size_t c_iBuffer   = 1024;
00271     char acBuffer[c_iBuffer];
00272     
00273     gengetopt_args_info sArgs;
00274     ifstream ifsm;
00275     istream *pistm;
00276     vector<string> vecstrLine, vecstrGenes, vecstrDBs, vecstrQuery;
00277     utype i, qi, j, k, l;
00278     
00279     if(cmdline_parser(iArgs, aszArgs, &sArgs)){
00280         cmdline_parser_print_help();
00281         return 1;
00282     }
00283 
00284     if( sArgs.input_arg ) {
00285         ifsm.open( sArgs.input_arg );
00286         pistm = &ifsm; 
00287     }else{
00288         pistm = &cin;
00289     }
00290     
00291     map<string, size_t> mapstriGenes;
00292     while( !pistm->eof( ) ) {
00293         pistm->getline( acBuffer, c_iBuffer - 1 );
00294         acBuffer[ c_iBuffer - 1 ] = 0;
00295         vecstrLine.clear( );
00296         CMeta::Tokenize( acBuffer, vecstrLine );
00297         if( vecstrLine.size( ) < 2 ) {
00298             cerr << "Ignoring line: " << acBuffer << endl;
00299             continue;
00300         }
00301         if( !( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) ) {
00302             cerr << "Illegal gene ID: " << vecstrLine[ 0 ] << " for "
00303                 << vecstrLine[ 1 ] << endl;
00304             return 1;
00305         }
00306         i--;
00307         if( vecstrGenes.size( ) <= i )
00308             vecstrGenes.resize( i + 1 );
00309         vecstrGenes[ i ] = vecstrLine[ 1 ];
00310         mapstriGenes[vecstrGenes[i]] = i;
00311     }
00312             
00313     //char acBuffer[1024];
00314 
00315     if(sArgs.pcl_flag==1){
00316         string pcl_dir = sArgs.pcl_dir_arg;
00317         string output_dir = sArgs.dir_out_arg;
00318         vector<string> pcl_list;
00319         vector< vector<string> > vecstrAllQuery;
00320         int numGenes = vecstrGenes.size();
00321         
00322         if(!CSeekTools::ReadMultipleQueries(sArgs.query_arg, vecstrAllQuery))
00323             return -1;
00324         
00325         CSeekTools::ReadListOneColumn(sArgs.pcl_list_arg, pcl_list);
00326         vector< vector< vector<float> > > mat;
00327         vector<CSeekIntIntMap*> dm;
00328         dm.resize(pcl_list.size());
00329         mat.resize(pcl_list.size());
00330         
00331         for(i=0; i<pcl_list.size(); i++){
00332             fprintf(stderr, "Reading %d: %s\n", i, pcl_list[i].c_str());
00333             dm[i] = new CSeekIntIntMap(vecstrGenes.size());
00334             
00335             string pclfile = pcl_dir + "/" + pcl_list[i] + ".bin";
00336 
00337             CPCL pcl;
00338             pcl.Open(pclfile.c_str());
00339             int totNumExperiments = pcl.GetExperiments() - 2;
00340             
00341             vector<utype> presentIndex;
00342             vector<string> presentGeneNames;
00343             for(j=0; j<vecstrGenes.size(); j++){
00344                 utype g = pcl.GetGene(vecstrGenes[j]);
00345                 if(CSeekTools::IsNaN(g)) continue; //gene does not exist in the dataset
00346                 presentIndex.push_back(g);
00347                 presentGeneNames.push_back(vecstrGenes[j]);
00348                 dm[i]->Add(j);
00349             }
00350             
00351             mat[i].resize(presentIndex.size());
00352             for(j=0; j<presentIndex.size(); j++)
00353                 mat[i][j].resize(totNumExperiments);
00354             
00355             for(j=0; j<presentIndex.size(); j++){
00356                 float *val = pcl.Get(presentIndex[j]);
00357                 for(k=2; k<pcl.GetExperiments(); k++)
00358                     mat[i][j][k-2] = val[k];                    
00359             }
00360 
00361         }
00362         
00363         fprintf(stderr, "Finished loading...\n"); //getchar();
00364         vector<char> absent_g;
00365         rank_transform(mat, dm, absent_g, vecstrGenes.size());
00366         fprintf(stderr, "Finished transforming...\n"); //getchar();
00367         
00368         for(i=0; i<vecstrAllQuery.size(); i++){
00369             vector<utype> q;
00370             vector<char> is_query;
00371 
00372             fprintf(stderr, "Query %d begin\n", i); //getchar();    
00373 
00374             CSeekTools::InitVector(is_query, numGenes, (char) 0);
00375             for(j=0; j<vecstrAllQuery[i].size(); j++){
00376                 q.push_back(mapstriGenes[vecstrAllQuery[i][j]]);
00377                 is_query[mapstriGenes[vecstrAllQuery[i][j]]] = 1;
00378             }
00379             int numExperiments = 0;
00380             vector<bresult> b;
00381             vector<float> gs;
00382             fprintf(stderr, "Query %d finished initializing query\n", i); //getchar();
00383             weight_experiment(mat, dm, numExperiments, 0.05, q, b);
00384 
00385             fprintf(stderr, "Query %d finished weighting\n", i); //getchar();
00386             gene_scoring(vecstrGenes.size(), mat, dm, b, numExperiments, absent_g, 
00387                 0.05, gs);
00388     
00389             fprintf(stderr, "Query %d finished scoring\n", i); //getchar();
00390             
00391             for(j=0; j<absent_g.size(); j++){
00392                 if(absent_g[j]==1){
00393                     gs[j] = -320;
00394                 }
00395             }
00396 
00397             
00398             fprintf(stderr, "Query %d end\n", i); //getchar();
00399 
00400             sprintf(acBuffer, "%s/%d.query", output_dir.c_str(), i);
00401             CSeekTools::WriteArrayText(acBuffer, vecstrAllQuery[i]);
00402             sprintf(acBuffer, "%s/%d.gscore", output_dir.c_str(), i);
00403             CSeekTools::WriteArray(acBuffer, gs);
00404 
00405             /*
00406             fprintf(stdout, "Query %d\n", i);
00407             fprintf(stdout, "Weights:");
00408             //fprintf(stderr, "Weights:\n");
00409             for(j=0; j<b.size() && j<(int)(0.05*numExperiments); j++){
00410                 if(j%10==0){
00411                     fprintf(stdout, "\n");
00412                 }   
00413                 fprintf(stdout, "%.2f ", b[j].f);
00414                 //fprintf(stderr, "%d %.2f\n", j, b[j].f);
00415             }
00416             fprintf(stdout, "\n");
00417             vector<AResultFloat> gsc;
00418             gsc.resize(numGenes);
00419             //fprintf(stderr, "numexp %d gsc %d gs %d\n", numExperiments, gsc.size(), gs.size());
00420             for(j=0; j<numGenes; j++){
00421                 gsc[j].i = j;
00422                 gsc[j].f = gs[j];
00423                 //fprintf(stderr, "Gene %d %.2f\n", j, gsc[j].f);
00424             }
00425             sort(gsc.begin(), gsc.end());
00426             fprintf(stdout, "Results:\n");
00427             int kk=0;
00428             for(k=0; k<numGenes && kk<500; k++){
00429                 if(is_query[gsc[k].i]==1) continue;
00430                 if(gsc[k].f==-320) continue;
00431                 fprintf(stdout, "%s %.5f\n", vecstrGenes[gsc[k].i].c_str(), gsc[k].f);
00432                 kk++;
00433             }
00434             */
00435             /*
00436             fprintf(stderr, "Query %d\n", i);
00437             fprintf(stderr, "Weights:");
00438             for(j=0; j<(int)(0.05* numExperiments); j++){
00439                 if(j%10==0){
00440                     fprintf(stderr, "\n");
00441                 }
00442                 fprintf(stderr, "%.2f ", b[j].f);
00443             }
00444             fprintf(stderr, "\n");
00445             vector<AResultFloat> gsc;
00446             gsc.resize(numGenes);
00447             for(j=0; j<numGenes; j++){
00448                 gsc[j].i = j;
00449                 gsc[j].f = gs[j];
00450             }
00451             sort(gsc.begin(), gsc.end());
00452             fprintf(stderr, "Results:\n");
00453             int kk=0;
00454             for(k=0; k<numGenes && kk<500; k++){
00455                 if(is_query[gsc[k].i]==1) continue;
00456                 if(gsc[k].f==-320) continue;
00457                 fprintf(stderr, "%d %.5f\n", gsc[k].i, gsc[k].f);
00458                 kk++;
00459             }
00460             */
00461             gs.clear();
00462             b.clear();      
00463 
00464         }
00465                 
00466         
00467     }
00468     
00469     
00470 }