Sleipnir
tools/SeekAggregatedDataset/SeekAggregatedDataset.cpp
00001 #include "stdafx.h"
00002 #include "cmdline.h"
00003 
00004 struct bresult{
00005     int i;
00006     int j;
00007     float f;
00008     float f2;
00009 };
00010 
00011 struct ascend{
00012     bool operator()(const bresult& lx, const bresult& rx) const{
00013         return lx.f < rx.f;
00014     }
00015 };
00016 
00017 struct descend{
00018     bool operator()(const bresult& lx, const bresult& rx) const{
00019         return lx.f > rx.f;
00020     }
00021 };
00022 
00023 
00024 bool calculate_correlation(
00025     vector<vector<vector<float> > > &mat,
00026     vector<CSeekIntIntMap*> &dm,
00027     utype g1, 
00028     utype g2, 
00029     float &r){
00030 
00031     int numDatasets = mat.size();
00032     int d, i;
00033     vector<float> v1, v2;
00034 
00035     vector<int> count_common;
00036     CSeekTools::InitVector(count_common, numDatasets, (int) 0);
00037 
00038     for(d=0; d<numDatasets; d++){
00039         if(CSeekTools::IsNaN(dm[d]->GetForward(g1))) continue;
00040         count_common[d]++;
00041     }
00042 
00043     for(d=0; d<numDatasets; d++){
00044         if(CSeekTools::IsNaN(dm[d]->GetForward(g2))) continue;
00045         count_common[d]++;
00046     }
00047 
00048     int total_common = 0;
00049     for(d=0; d<numDatasets; d++){
00050         if(count_common[d]==2) total_common++;
00051     }
00052 
00053     /*if(total_common<0.5*(float)numDatasets){
00054         r = -320;
00055         return true;
00056     }*/
00057 
00058     for(d=0; d<numDatasets; d++){
00059         if(count_common[d]==2){
00060             utype n1 = dm[d]->GetForward(g1);
00061             utype n2 = dm[d]->GetForward(g2);
00062             int numExp = mat[d][n1].size();
00063             for(i=0; i<numExp; i++){
00064                 v1.push_back(mat[d][n1][i]);
00065                 v2.push_back(mat[d][n2][i]);
00066                 /*if(isinf(mat[d][n1][i])||isnan(mat[d][n1][i])){
00067                     fprintf(stderr, "BAD: d %d g %d e %d v %.2f\n", d, n1, i, mat[d][n1][i]);
00068                 }
00069                 if(isinf(mat[d][n2][i])||isnan(mat[d][n2][i])){
00070                     fprintf(stderr, "BAD: d %d g %d e %d v %.2f\n", d, n2, i, mat[d][n2][i]);
00071                 }*/
00072             }
00073         }
00074     }
00075 
00076     //pearson correlation calculation
00077     double mean_x = 0;
00078     double mean_y = 0;
00079     int xx = 0;
00080     for(xx=0; xx<v1.size(); xx++){
00081         mean_x+=v1[xx];
00082         mean_y+=v2[xx];
00083     }
00084     mean_x /= (double) v1.size();
00085     mean_y /= (double) v2.size();
00086     
00087     double sum_xy = 0;
00088     double dev_x = 0;
00089     double dev_y = 0;
00090     for(xx=0; xx<v1.size(); xx++){
00091         sum_xy += (v1[xx] - mean_x) * (v2[xx] - mean_y);
00092         dev_x += (v1[xx] - mean_x) * (v1[xx] - mean_x);
00093         dev_y += (v2[xx] - mean_y) * (v2[xx] - mean_y);
00094     }
00095     dev_x = sqrt(dev_x);
00096     dev_y = sqrt(dev_y);
00097     r = (float) (sum_xy / dev_x / dev_y);
00098     if(isinf(r) || isnan(r)){
00099         r = -320;
00100     }
00101     //fprintf(stderr, "%.2f\t%d\n", r, total_common);
00102 
00103     return true;
00104             
00105 }
00106 
00107 int main(int iArgs, char **aszArgs){
00108     static const size_t c_iBuffer   = 1024;
00109     char acBuffer[c_iBuffer];
00110     
00111     gengetopt_args_info sArgs;
00112     ifstream ifsm;
00113     istream *pistm;
00114     vector<string> vecstrLine, vecstrGenes, vecstrDBs, vecstrQuery;
00115     utype i, qi, j, k, l;
00116     
00117     if(cmdline_parser(iArgs, aszArgs, &sArgs)){
00118         cmdline_parser_print_help();
00119         return 1;
00120     }
00121 
00122     if( sArgs.input_arg ) {
00123         ifsm.open( sArgs.input_arg );
00124         pistm = &ifsm; 
00125     }else{
00126         pistm = &cin;
00127     }
00128 
00129     fprintf(stderr, "Reading gene list\n"); 
00130     map<string, size_t> mapstriGenes;
00131     while( !pistm->eof( ) ) {
00132         pistm->getline( acBuffer, c_iBuffer - 1 );
00133         acBuffer[ c_iBuffer - 1 ] = 0;
00134         vecstrLine.clear( );
00135         CMeta::Tokenize( acBuffer, vecstrLine );
00136         if( vecstrLine.size( ) < 2 ) {
00137             cerr << "Ignoring line: " << acBuffer << endl;
00138             continue;
00139         }
00140         if( !( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) ) {
00141             cerr << "Illegal gene ID: " << vecstrLine[ 0 ] << " for "
00142                 << vecstrLine[ 1 ] << endl;
00143             return 1;
00144         }
00145         i--;
00146         if( vecstrGenes.size( ) <= i )
00147             vecstrGenes.resize( i + 1 );
00148         vecstrGenes[ i ] = vecstrLine[ 1 ];
00149         mapstriGenes[vecstrGenes[i]] = i;
00150     }
00151             
00152     //char acBuffer[1024];
00153 
00154     fprintf(stderr, "Finished reading gene map\n");
00155     if(sArgs.pcl_flag==1){
00156         string pcl_dir = sArgs.pcl_dir_arg;
00157         string output_dir = sArgs.dir_out_arg;
00158         fprintf(stderr, "Arrived here\n");
00159         vector<string> pcl_list;
00160         vector< vector<string> > vecstrAllQuery;
00161         int numGenes = vecstrGenes.size();
00162     
00163         fprintf(stderr, "Reading query\n"); 
00164         if(!CSeekTools::ReadMultipleQueries(sArgs.query_arg, vecstrAllQuery))
00165             return -1;
00166         
00167         fprintf(stderr, "Reading pcl list\n");  
00168         CSeekTools::ReadListOneColumn(sArgs.pcl_list_arg, pcl_list);
00169         vector< vector< vector<float> > > mat; //only needed for cor-calc
00170         vector<CSeekIntIntMap*> dm;
00171         dm.resize(pcl_list.size());
00172         mat.resize(pcl_list.size()); //only needed for cor-calc
00173         for(i=0; i<pcl_list.size(); i++){
00174             fprintf(stderr, "Reading %d: %s\n", i, pcl_list[i].c_str());
00175             dm[i] = new CSeekIntIntMap(vecstrGenes.size());
00176             
00177             string pclfile = pcl_dir + "/" + pcl_list[i] + ".bin";
00178 
00179             CPCL pcl;
00180             pcl.Open(pclfile.c_str());
00181             int totNumExperiments = pcl.GetExperiments();
00182             
00183             vector<utype> presentIndex;
00184             vector<string> presentGeneNames;
00185             for(j=0; j<vecstrGenes.size(); j++){
00186                 utype g = pcl.GetGene(vecstrGenes[j]);
00187                 if(CSeekTools::IsNaN(g)) continue; //gene does not exist in the dataset
00188                 presentIndex.push_back(g);
00189                 presentGeneNames.push_back(vecstrGenes[j]);
00190                 dm[i]->Add(j);
00191             }
00192             
00193             //only needed for correlation calc
00194             mat[i].resize(presentIndex.size());
00195             for(j=0; j<presentIndex.size(); j++)
00196                 mat[i][j].resize(totNumExperiments);
00197             
00198             for(j=0; j<presentIndex.size(); j++){
00199                 float *val = pcl.Get(presentIndex[j]);
00200                 for(k=0; k<pcl.GetExperiments(); k++){
00201                     mat[i][j][k] = val[k];  
00202                     if(isinf(val[k])||isnan(val[k])){
00203                         fprintf(stderr, "loading error: %d of %d\n", k, pcl.GetExperiments());
00204                     }
00205                 }
00206             }
00207 
00208         }
00209     
00210         float present_cutoff = 0.5;
00211         vector<char> absent_g;
00212         CSeekTools::InitVector(absent_g, numGenes, (char) 0);
00213 
00214         for(i=0; i<numGenes; i++){
00215             int totAbsent = 0;
00216             for(j=0; j<pcl_list.size(); j++){
00217                 CSeekIntIntMap *mapG = dm[j];
00218                 if(CSeekTools::IsNaN(mapG->GetForward(i))){
00219                     totAbsent++;
00220                 }
00221             }
00222             if(totAbsent<0.5*(float)pcl_list.size()){
00223                 absent_g[i] = 0;
00224             }else{
00225                 absent_g[i] = 1;
00226             }
00227         }
00228 
00229         CSeekIntIntMap *geneMap = new CSeekIntIntMap(numGenes);
00230         for(i=0; i<numGenes; i++){
00231             if(absent_g[i]==1) continue;
00232             geneMap->Add(i);
00233         }
00234 
00235         int numActualGenes = geneMap->GetNumSet();
00236 
00237         
00238         //Do pair per machine, Step 2============================
00239 /*
00240         vector<utype> pairs;
00241         CSeekTools::ReadArray("/tmp/pairs_to_do", pairs);
00242 
00243         int numP = pairs.size()/2;
00244         int pi=0;
00245         const vector<utype> &allGenes = geneMap->GetAllReverse();
00246         vector<float> cor;
00247         cor.resize(numP);
00248 
00249         #pragma omp parallel for \
00250         shared(allGenes, cor, mat, dm, pairs) \
00251         private(pi) \
00252         firstprivate(numP) schedule(dynamic)
00253         for(pi=0; pi<numP; pi++){
00254             utype g1 = allGenes[pairs[pi*2]];
00255             utype g2 = allGenes[pairs[pi*2+1]];
00256             if(g1==g2){
00257                 cor[pi] = -320;
00258                 continue;
00259             }
00260             calculate_correlation(mat, dm, g1, g2, cor[pi]);
00261             if(pi%1000==0){
00262                 fprintf(stderr, "  %d of %d\n", pi, numP);
00263             }
00264         }
00265 
00266         CSeekTools::WriteArray("/tmp/results_pairs", cor);      
00267 */      
00268         //============================================================= 
00269         
00270         
00271     
00272         vector< vector<float> > correlations;
00273         correlations.resize(numActualGenes);
00274         for(i=0; i<numActualGenes; i++){
00275             correlations[i].resize(numActualGenes);
00276         }
00277 
00278         //combining pairs, Step 3=================================
00279     /*
00280         int max_i = 49;
00281         char file[256];
00282         const vector<utype> &allGenes = geneMap->GetAllReverse();
00283 
00284         for(i=0; i<=max_i; i++){
00285             vector<utype> pairs;
00286             vector<float> cor;
00287 
00288             sprintf(file, "/memex/qzhu/p1/pairs_to_do.%d", i);
00289             CSeekTools::ReadArray(file, pairs);
00290             sprintf(file, "/memex/qzhu/p1/oct6/pairs_to_do.%d_results", i);
00291             CSeekTools::ReadArray(file, cor);
00292 
00293             int numP = pairs.size()/2;
00294             int pi=0;
00295 
00296             for(pi=0; pi<numP; pi++){
00297                 correlations[pairs[pi*2]][pairs[pi*2+1]] = cor[pi];
00298                 correlations[pairs[pi*2+1]][pairs[pi*2]] = cor[pi];
00299             }
00300         }
00301 
00302         vector<float> correlation1D;
00303         correlation1D.resize(numActualGenes*numActualGenes);
00304         int kk=0;
00305         for(i=0; i<numActualGenes; i++){
00306             for(j=0; j<numActualGenes; j++){
00307                 correlation1D[kk] = correlations[i][j];
00308                 kk++;
00309             }
00310         }
00311         CSeekTools::WriteArray("/memex/qzhu/p1/aggregated_dataset_correlation", correlation1D);
00312 
00313         fprintf(stderr, "Finished creating file\n");
00314         getchar();
00315 */
00316         //============================================================
00317 
00318 
00319         /*
00320         //generate pairs per machine, Step 1================================
00321         vector<utype> pairs;
00322         int numP = numActualGenes*(numActualGenes-1);
00323         pairs.resize(numP);
00324         int ki = 0;
00325         for(i=0; i<numActualGenes; i++){
00326             for(j=i+1; j<numActualGenes; j++){
00327                 pairs[ki] = i; 
00328                 ki++;
00329                 pairs[ki] = j;
00330                 ki++;
00331             }
00332         }
00333 
00334         if(ki!=numP){
00335             fprintf(stderr, "Cannot continue!\n");
00336             return -1;
00337         }
00338 
00339         int num_pairs_per_file = (numP / 2) / 49;
00340         
00341         ki = 0;
00342         vector<utype> pp;
00343         pp.resize(num_pairs_per_file*2);
00344         char destfile[256];
00345         int kj = 0;
00346         int ii = 0;
00347 
00348         fprintf(stderr, "Numpairs per file: %d\n", num_pairs_per_file);
00349         for(ii=0; ii<numP/2; ii++){
00350             if(ii%num_pairs_per_file==0 && ii>0){
00351                 sprintf(destfile, "/memex/qzhu/p1/pairs_to_do.%d", ki);
00352                 CSeekTools::WriteArray(destfile, pp);
00353                 pp.clear();
00354                 pp.resize(num_pairs_per_file*2);
00355                 ki++;
00356                 kj = 0;
00357             }
00358             pp[kj] = pairs[ii*2];
00359             pp[kj+1] = pairs[ii*2+1];
00360             kj+=2;
00361             if(ii==numP/2-1){
00362                 sprintf(destfile, "/memex/qzhu/p1/pairs_to_do.%d", ki);
00363                 pp.resize(kj);
00364                 CSeekTools::WriteArray(destfile, pp);
00365             }
00366         }
00367         
00368 
00369         fprintf(stderr, "Finished\n");  
00370         getchar();
00371         //=====================================================================
00372         */
00373 
00374         //Load aggregated dataset, Step 4=====================================
00375 
00376         vector<float> correlation1D;
00377         CSeekTools::ReadArray("/home/qzhu/Seek/aggregated_dataset_correlation", correlation1D);
00378         int pi = 0;
00379         for(i=0; i<numActualGenes; i++){
00380             for(j=0; j<numActualGenes; j++){
00381                 correlations[i][j] = correlation1D[pi];
00382                 pi++;
00383             }
00384         }
00385         correlation1D.clear();
00386 
00387         //======================================================================
00388 
00389 
00390         /*
00391         const vector<utype> &allGenes = geneMap->GetAllReverse();
00392         //start correlation calculations
00393         for(i=0; i<numActualGenes; i++){
00394             utype g1 = allGenes[i];
00395             fprintf(stderr, "Gene %d of %d: %d\n", i, numActualGenes, numActualGenes - (i+1));
00396 
00397             #pragma omp parallel for \
00398             shared(allGenes, correlations, mat, dm) \
00399             private(j) \
00400             firstprivate(numActualGenes, g1, i) schedule(dynamic)
00401             for(j=i+1; j<numActualGenes; j++){
00402                 utype tid = omp_get_thread_num();
00403                 utype g2 = allGenes[j];
00404                 if(g1==g2){
00405                     correlations[i][j] = -320; //Null value
00406                     correlations[j][i] = -320;
00407                     continue;
00408                 }
00409                 float r = -320;
00410                 calculate_correlation(mat, dm, g1, g2, r);
00411                 correlations[i][j] = r;
00412                 correlations[j][i] = r;
00413                 //if((j-(i+1))%500==0){
00414                 //  fprintf(stderr, "   %d of %d from %d\n", j - (i+1), numActualGenes - (i+1), tid);
00415                 //}
00416             }
00417             fprintf(stderr, "Done\n");
00418         }
00419 
00420         vector<float> correlation1D;
00421         correlation1D.resize(numActualGenes*numActualGenes);
00422         int kk=0;
00423         for(i=0; i<numActualGenes; i++){
00424             for(j=0; j<numActualGenes; j++){
00425                 correlation1D[kk] = correlations[i][j];
00426                 kk++;
00427             }
00428         }
00429         CSeekTools::WriteArray("/memex/qzhu/p1/aggregated_dataset_correlation", correlation1D);
00430         */
00431 
00432         //Evaluation Last step==================================================
00433         
00434         const vector<utype> &allGenes = geneMap->GetAllReverse();       
00435 
00436         for(i=0; i<vecstrAllQuery.size(); i++){
00437             vector<utype> q;
00438             vector<char> is_query;
00439 
00440             fprintf(stderr, "Query %d begin\n", i); //getchar();    
00441 
00442             CSeekTools::InitVector(is_query, numGenes, (char) 0);
00443             for(j=0; j<vecstrAllQuery[i].size(); j++){
00444                 q.push_back(mapstriGenes[vecstrAllQuery[i][j]]);
00445                 is_query[mapstriGenes[vecstrAllQuery[i][j]]] = 1;
00446             }
00447 
00448             vector<float> gene_score;
00449             vector<char> gene_count;
00450             CSeekTools::InitVector(gene_score, numGenes, (float) CMeta::GetNaN());
00451             CSeekTools::InitVector(gene_count, numGenes, (char) 0);
00452             int qi=0;
00453             int tot_present_q = 0;
00454             for(qi=0; qi<q.size(); qi++){
00455                 if(CSeekTools::IsNaN(geneMap->GetForward(q[qi]))) continue;
00456                 tot_present_q++;
00457                 utype qg = geneMap->GetForward(q[qi]);
00458                 for(j=0; j<numActualGenes; j++){
00459                     utype gi = allGenes[j];
00460                     if(correlations[qg][j]==-320){
00461                         continue;
00462                     }
00463                     if(CMeta::IsNaN(gene_score[gi])){
00464                         gene_score[gi] = 0;
00465                     }
00466                     gene_score[gi] += correlations[qg][j];
00467                     gene_count[gi]++;
00468                     //fprintf(stderr, "%.2f\n", correlations[qg][j]);
00469                 }
00470             }
00471 
00472             int total_non_zero = 0;
00473             for(j=0; j<numActualGenes; j++){
00474                 utype gi = allGenes[j];
00475                 if(gene_count[gi]!=tot_present_q){
00476                     gene_score[gi] = -320;
00477                     continue;
00478                 }
00479                 if(CMeta::IsNaN(gene_score[gi])){
00480                     gene_score[gi] = -320;
00481                     continue;
00482                 }
00483                 if(is_query[gi]==1){
00484                     gene_score[gi] = -320;
00485                     continue;
00486                 }
00487                 if(tot_present_q==0){
00488                     gene_score[gi] = -320;
00489                     continue;
00490                 }
00491                 gene_score[gi] /= (float) tot_present_q;
00492                 total_non_zero++;
00493             }
00494 
00495             //fprintf(stderr, "Total non zero genes: %d\n", total_non_zero);
00496 
00497             for(j=0; j<numGenes; j++){
00498                 if(CMeta::IsNaN(gene_score[j])){
00499                     gene_score[j] = -320;
00500                 }
00501             }
00502 
00503             sprintf(acBuffer, "%s/%d.query", output_dir.c_str(), i);
00504             CSeekTools::WriteArrayText(acBuffer, vecstrAllQuery[i]);
00505             sprintf(acBuffer, "%s/%d.gscore", output_dir.c_str(), i);
00506             CSeekTools::WriteArray(acBuffer, gene_score);
00507             
00508             //gs.clear();
00509             //b.clear();        
00510 
00511         }
00512     
00513         
00514     }
00515     
00516     
00517 }