Sleipnir
tools/SeekPrep/SeekPrep.cpp
00001 /*****************************************************************************
00002 * This file is provided under the Creative Commons Attribution 3.0 license.
00003 *
00004 * You are free to share, copy, distribute, transmit, or adapt this work
00005 * PROVIDED THAT you attribute the work to the authors listed below.
00006 * For more information, please see the following web page:
00007 * http://creativecommons.org/licenses/by/3.0/
00008 *
00009 * This file is a component of the Sleipnir library for functional genomics,
00010 * authored by:
00011 * Curtis Huttenhower (chuttenh@princeton.edu)
00012 * Mark Schroeder
00013 * Maria D. Chikina
00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact)
00015 *
00016 * If you use this library, the included executable tools, or any related
00017 * code in your work, please cite the following publication:
00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and
00019 * Olga G. Troyanskaya.
00020 * "The Sleipnir library for computational functional genomics"
00021 *****************************************************************************/
00022 #include "stdafx.h"
00023 #include "cmdline.h"
00024 
00025 enum ExportMode{ 
00026     DISTANCE_MATRIX, COUNT_MATRIX, WEIGHTSUM_MATRIX, GENE_PRESENCE_VECTOR 
00027 };
00028 
00029 enum NormMode{
00030     RANK_NORM, Z_NORM
00031 };
00032 
00033 //tType can only be unsigned short or utype
00034 //norm_mode = 0 (rank norm), 1 (subtract_z norm)
00035 template<class tType>
00036 bool CalculateMatrix(const NormMode norm_mode, 
00037     const ExportMode e, const vector<string> &dab_list, 
00038     const string &dab_dir, const string &outdir, const string &outdab, 
00039     const vector<string> &vecstrGenes, const vector<float> &w, const float rbp_p,
00040     const int MAX_RANK, const float exp){ //w=weight of dsets 
00041     //rbp_p, MAX_RANK for rank-based normalize
00042     //exp for value-based or subtract_z normalize
00043     const int RANK_NORM = 0;
00044     const int Z_NORM = 1;
00045 
00046     size_t i, j, k;
00047 
00048     if(e==GENE_PRESENCE_VECTOR){
00049         vector<utype> gpres;
00050         CSeekTools::InitVector(gpres, vecstrGenes.size(), (utype) 0);
00051         for(i=0; i<dab_list.size(); i++){
00052             fprintf(stderr, "Reading %d of %d: %s\n", i, dab_list.size(), 
00053                 dab_list[i].c_str());
00054             CSeekIntIntMap d1(vecstrGenes.size());
00055             string dabfile = dab_dir + "/" + dab_list[i];
00056             CSeekWriter::ReadSeekSparseMatrixHeader<tType>(dabfile.c_str(), d1);
00057             const vector<utype> &allR = d1.GetAllReverse();
00058             for(j=0; j<d1.GetNumSet(); j++){
00059                 utype gi = allR[j];
00060                 gpres[gi]++;
00061             }
00062         }
00063         string ofile = outdir+"/"+outdab+".gene_count";
00064         CSeekTools::WriteArray(ofile.c_str(), gpres);
00065 
00066     }else if(e==DISTANCE_MATRIX){
00067         CHalfMatrix<float> res;
00068         res.Initialize(vecstrGenes.size());
00069         for(i=0; i<vecstrGenes.size(); i++)
00070             for(j=i+1; j<vecstrGenes.size(); j++)
00071                 res.Set(i, j, 0);
00072     
00073         for(i=0; i<dab_list.size(); i++){
00074             fprintf(stderr, "Reading %d of %d: %s\n", i, dab_list.size(), 
00075                 dab_list[i].c_str());
00076             CSeekIntIntMap d1(vecstrGenes.size());
00077             string dabfile = dab_dir + "/" + dab_list[i];
00078             CSparseFlatMatrix<float> sm (0);
00079             if(norm_mode==RANK_NORM){
00080                 CSeekWriter::ReadSeekSparseMatrix<tType>(dabfile.c_str(), sm, d1, 
00081                     MAX_RANK, rbp_p, vecstrGenes);
00082                 //NEW
00083                 //CSeekWriter::RemoveDominant<tType>(sm, d1, vecstrGenes);
00084                 //CSeekWriter::RemoveDominant<tType>(sm, d1, vecstrGenes);
00085             }else if(norm_mode==Z_NORM){
00086                 CSeekWriter::ReadSeekSparseMatrix<tType>(dabfile.c_str(), sm, d1,
00087                     vecstrGenes, (int) (0.10*vecstrGenes.size()), exp);
00088             }
00089             fprintf(stderr, "Summing...\n");
00090             CSeekWriter::SumSparseMatrix(sm, res, d1, w[i]);
00091             fprintf(stderr, "Finished Summing...\n");
00092         }
00093 
00094         /*for(i=0; i<vecstrGenes.size(); i++){
00095             for(j=i+1; j<vecstrGenes.size(); j++){
00096                 float v = res.Get(i,j);
00097                 if(CMeta::IsNaN(v)){
00098                     fprintf(stderr, "Error, %d %d is nan\n", i,j);
00099                 }
00100             }
00101         }*/
00102 
00103         ofstream ofsm;
00104         string ofile = outdir + "/" + outdab + ".half";
00105         ofsm.open(ofile.c_str(), ios_base::binary);
00106         res.Save(ofsm, true);
00107         ofsm.close();
00108 
00109     }else if(e==COUNT_MATRIX){
00110         CHalfMatrix<unsigned short> res;
00111         res.Initialize(vecstrGenes.size());
00112         for(i=0; i<vecstrGenes.size(); i++)
00113             for(j=i+1; j<vecstrGenes.size(); j++)
00114                 res.Set(i, j, 0);
00115 
00116         for(i=0; i<dab_list.size(); i++){
00117             fprintf(stderr, "Reading %d of %d: %s\n", i, dab_list.size(), 
00118                 dab_list[i].c_str());
00119             CSeekIntIntMap d1(vecstrGenes.size());
00120             string dabfile = dab_dir + "/" + dab_list[i];
00121             CSeekWriter::ReadSeekSparseMatrixHeader<tType>(dabfile.c_str(), d1);
00122             const vector<utype> &allR = d1.GetAllReverse();
00123             for(j=0; j<d1.GetNumSet(); j++){
00124                 utype gi = allR[j];
00125                 for(k=j+1; k<d1.GetNumSet(); k++){
00126                     utype gj = allR[k];
00127                     res.Set(gi, gj, res.Get(gi, gj)+1);
00128                 }
00129             }
00130         }
00131         ofstream ofsm;
00132         string ofile = outdir + "/" + outdab + ".pair_count";
00133         ofsm.open(ofile.c_str(), ios_base::binary);
00134         res.Save(ofsm, true);
00135         ofsm.close();
00136 
00137     }else if(e==WEIGHTSUM_MATRIX){
00138         CHalfMatrix<float> res;
00139         res.Initialize(vecstrGenes.size());
00140         for(i=0; i<vecstrGenes.size(); i++)
00141             for(j=i+1; j<vecstrGenes.size(); j++)
00142                 res.Set(i, j, 0);
00143 
00144         for(i=0; i<dab_list.size(); i++){
00145             fprintf(stderr, "Reading %d of %d: %s\n", i, dab_list.size(), 
00146                 dab_list[i].c_str());
00147             CSeekIntIntMap d1(vecstrGenes.size());
00148             string dabfile = dab_dir + "/" + dab_list[i];
00149             CSeekWriter::ReadSeekSparseMatrixHeader<tType>(dabfile.c_str(), d1);
00150             const vector<utype> &allR = d1.GetAllReverse();
00151             for(j=0; j<d1.GetNumSet(); j++){
00152                 utype gi = allR[j];
00153                 for(k=j+1; k<d1.GetNumSet(); k++){
00154                     utype gj = allR[k];
00155                     res.Set(gi, gj, res.Get(gi, gj) + w[i]);
00156                 }
00157             }
00158         }
00159         ofstream ofsm;
00160         string ofile = outdir + "/" + outdab + ".pair_weightsum";
00161         ofsm.open(ofile.c_str(), ios_base::binary);
00162         res.Save(ofsm, true);
00163         ofsm.close();
00164     }
00165     else{
00166         fprintf(stderr, "Unrecognized mode!\n");
00167         return false;
00168     }
00169     return true;
00170 }   
00171 
00172 
00173 bool InitializeDataset(size_t &iDatasets, vector<string> &vecstrDatasets,
00174     string &strPrepInputDirectory, vector<CSeekDataset*> &vc){
00175     vc.clear();
00176     vc.resize(iDatasets);
00177     size_t i, j, k;
00178 
00179     for(i=0; i<iDatasets; i++){
00180         vc[i] = new CSeekDataset();
00181         string strFileStem = vecstrDatasets[i];
00182         string strAvgPath = strPrepInputDirectory + "/" + strFileStem
00183             + ".gavg";
00184         string strPresencePath = strPrepInputDirectory + "/" + strFileStem
00185             + ".gpres";
00186         vc[i]->ReadGeneAverage(strAvgPath);
00187         vc[i]->ReadGenePresence(strPresencePath);
00188         vc[i]->InitializeGeneMap();
00189 
00190         //DEBUGGING==============
00191         /*
00192         fprintf(stderr, "==Dataset %d==\n", i);
00193         CSeekIntIntMap *mapG = vc[i]->GetGeneMap();
00194         const vector<utype> &allR = mapG->GetAllReverse();
00195         for(j=0; j<mapG->GetNumSet(); j++){
00196             utype ja = mapG->GetReverse(j);
00197             fprintf(stderr, "%d\t%.3f\n", ja, vc[i]->GetGeneAverage(ja));
00198         }*/         
00199     }
00200     return true;
00201 }
00202 
00203 
00204 bool InitializeDB(size_t &iDatasets, size_t &iGenes,
00205     vector<string> &vecstrGenes, vector<CSeekDataset*> &vc, CDatabaselet &DBL){
00206 
00207     utype i,j,k;
00208     vector<char> cQuery;
00209     CSeekTools::InitVector(cQuery, iGenes, (char) 0);
00210     vector<string> allQuery;
00211     map<string, size_t> mapstrGenes;
00212     for(i=0; i<vecstrGenes.size(); i++){
00213         mapstrGenes[vecstrGenes[i]] = i;
00214     }
00215 
00216     /* Databaselet mapping */
00217     map<string, size_t> dbmap;
00218     vector<utype> veciQuery;
00219     for(i=0; i<DBL.GetGenes(); i++){
00220         string strQuery = DBL.GetGene(i);
00221         dbmap[strQuery] = i;
00222         allQuery.push_back(strQuery);
00223         /* global mapping */
00224         cQuery[mapstrGenes[strQuery]] = 1;
00225         veciQuery.push_back((utype) mapstrGenes[strQuery]);
00226     }
00227 
00228     //fprintf(stderr, "Start initializing query map...\n");
00229     for(i=0; i<iDatasets; i++){
00230         vc[i]->InitializeQueryBlock(veciQuery);
00231     }
00232 
00233     //fprintf(stderr, "Finished initializing map\n");
00234 
00235     //fprintf(stderr, "Start making gene-centric...\n");
00236     for(i=0; i<DBL.GetGenes(); i++){
00237         vector<unsigned char> Q;
00238         /* expanded */
00239         DBL.Get(i, Q);
00240         utype m = mapstrGenes[DBL.GetGene(i)];
00241         CSeekIntIntMap *qu = NULL;
00242         utype db;
00243         for(j=0; j<iDatasets; j++){
00244             if((qu=vc[j]->GetDBMap())==NULL) continue;
00245             if(CSeekTools::IsNaN(db=qu->GetForward(m))) continue;
00246             unsigned char **r = vc[j]->GetMatrix();
00247             for(k=0; k<iGenes; k++){
00248                 unsigned char c = Q[k*iDatasets + j];
00249                 r[db][k] = c;
00250             }
00251         }
00252     }
00253 
00254     //fprintf(stderr, "Finished making gene-centric\n");
00255 
00256     return true;
00257 }
00258 
00259 bool OpenDBFiles(string &DBFile, vector<unsigned char *> &cc, bool &useNibble){
00260     CDatabaselet *CD;
00261     (CD = new CDatabaselet(useNibble))->Open(DBFile);
00262     cc.push_back(CD->GetCharImage());
00263     return true;
00264 }
00265 
00266 
00267 bool OpenDB(string &DBFile, bool &useNibble, size_t &iDatasets,
00268     size_t &m_iGenes, vector<string> &vecstrGenes,
00269     map<utype, utype> &mapiPlatform, const vector<float> &quant,
00270     vector<CSeekDataset*> &vc, CFullMatrix<float> &platform_avg,
00271     CFullMatrix<float> &platform_stdev, vector<string> &vecstrQuery,
00272     const bool &logit){
00273 
00274     size_t i, j, jj, k;
00275 
00276     CDatabaselet CD(useNibble);
00277     CD.Open(DBFile);
00278     InitializeDB(iDatasets, m_iGenes, vecstrGenes, vc, CD);
00279 
00280     vector<string> presGenes;
00281     for(i=0; i<CD.GetGenes(); i++)
00282         presGenes.push_back(CD.GetGene(i));
00283 
00284     size_t numPlatforms = platform_avg.GetRows();
00285     map<string, size_t> mapstriGenes;
00286     for(i=0; i<vecstrGenes.size(); i++)
00287         mapstriGenes[vecstrGenes[i]] = i;
00288 
00289     //fprintf(stderr, "Start calculating platform average\n");
00290     for(i=0; i<CD.GetGenes(); i++){
00291         vector<float> sum, sq_sum, mean, stdev;
00292         vector<int> num;
00293         sum.resize(numPlatforms);
00294         sq_sum.resize(numPlatforms);
00295         mean.resize(numPlatforms);
00296         stdev.resize(numPlatforms);
00297         num.resize(numPlatforms);
00298 
00299         for(k=0; k<numPlatforms; k++){
00300             sum[k] = 0;
00301             sq_sum[k] = 0;
00302             mean[k] = 0;
00303             stdev[k] = 0;
00304             num[k] = 0;
00305         }
00306 
00307         string thisGene = CD.GetGene(i);
00308         size_t geneID = mapstriGenes[thisGene];
00309         vecstrQuery.push_back(thisGene);
00310         for(k=0; k<iDatasets; k++){
00311             CSeekIntIntMap *mapQ = vc[k]->GetDBMap();
00312             CSeekIntIntMap *mapG = vc[k]->GetGeneMap();
00313             const vector<utype> &allR = mapG->GetAllReverse();
00314             if(mapQ==NULL) continue;
00315             unsigned char **f = vc[k]->GetMatrix();
00316             utype iQ = mapQ->GetForward(geneID);
00317             if(CSeekTools::IsNaN(iQ)){
00318                 continue;
00319             }
00320             utype platform_id = mapiPlatform[k];
00321             if(platform_id>=(utype) numPlatforms){
00322                 printf("Error, platforms are equal %d %d",
00323                     (int) platform_id, (int) numPlatforms); getchar();
00324             }
00325 
00326             for(jj=0; jj<mapG->GetNumSet(); jj++){
00327                 j = mapG->GetReverse(jj);
00328                 unsigned char uc = f[iQ][j];
00329                 float v = 0;
00330                 if(uc==255) v = CMeta::GetNaN();
00331                 else{
00332                     float vv = -1;
00333                     if(logit) vv = log(quant[uc]) - log((float)(1.0-quant[uc]));
00334                     else vv = quant[uc];
00335                     v = vv - vc[k]->GetGeneAverage(j);
00336                     //fprintf(stderr, "%.5f\t%.5f\n", vv, v);
00337                     if(isnan(vv) || isinf(vv) || isnan(vc[k]->GetGeneAverage(j)) ||
00338                         isinf(vc[k]->GetGeneAverage(j))){
00339                         fprintf(stderr, "%d D%d %.5f %.5f %d\n", (int) j, (int) k, 
00340                         vv, vc[k]->GetGeneAverage(j), mapG->GetForward(j));
00341                     }
00342                     //v = quant[uc];
00343                     sum[platform_id] += v;
00344                     num[platform_id]++;
00345                     sq_sum[platform_id] += v*v;
00346                 }
00347             }
00348         }
00349 
00350         for(k=0; k<numPlatforms; k++){
00351             if(num[k]==0) continue;
00352             mean[k] = sum[k] / (float) num[k];
00353             stdev[k] = sq_sum[k] / (float) num[k] - mean[k] * mean[k];
00354             stdev[k] = sqrt(stdev[k]);
00355             fprintf(stderr, "%d %.5f %.5f\n", geneID, mean[k], stdev[k]);
00356             platform_avg.Set(k, geneID, mean[k]);
00357             platform_stdev.Set(k, geneID, stdev[k]);
00358         }
00359     }
00360 
00361     //fprintf(stderr, "Finished calculating platform average\n");
00362     //fprintf(stderr, "Start deleting\n");
00363 
00364     for(i=0; i<iDatasets; i++)
00365         vc[i]->DeleteQueryBlock();
00366     //fprintf(stderr, "Finished deleting\n");
00367     return true;
00368 }
00369 
00370 
00371 int main( int iArgs, char** aszArgs ) {
00372     static const size_t c_iBuffer   = 1024;
00373 #ifdef WIN32
00374     pthread_win32_process_attach_np( );
00375 #endif // WIN32
00376     gengetopt_args_info sArgs;
00377     ifstream            ifsm;
00378     istream*            pistm;
00379     vector<string>      vecstrLine, vecstrGenes, vecstrDBs, vecstrQuery;
00380     char                acBuffer[ c_iBuffer ];
00381     utype               i, j, k, l;
00382 
00383     if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) {
00384         cmdline_parser_print_help( );
00385         return 1; }
00386 
00387     /* reading gene-mapping file */
00388     if( sArgs.input_arg ) {
00389         ifsm.open( sArgs.input_arg );
00390         pistm = &ifsm; }
00391     else
00392         pistm = &cin;
00393 
00394     map<string, size_t> mapstriGenes;
00395     while( !pistm->eof( ) ) {
00396         pistm->getline( acBuffer, c_iBuffer - 1 );
00397         acBuffer[ c_iBuffer - 1 ] = 0;
00398         vecstrLine.clear( );
00399         CMeta::Tokenize( acBuffer, vecstrLine );
00400         if( vecstrLine.size( ) < 2 ) {
00401             //cerr << "Ignoring line: " << acBuffer << endl;
00402             continue;
00403         }
00404         if( !( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) ) {
00405             cerr << "Illegal gene ID: " << vecstrLine[ 0 ] << " for "
00406                 << vecstrLine[ 1 ] << endl;
00407             return 1;
00408         }
00409         i--;
00410         if( vecstrGenes.size( ) <= i )
00411             vecstrGenes.resize( i + 1 );
00412         vecstrGenes[ i ] = vecstrLine[ 1 ];
00413         mapstriGenes[vecstrGenes[i]] = i;
00414     }
00415 
00416     if( sArgs.input_arg ) ifsm.close( );
00417 
00418     omp_set_num_threads(1);
00419     int numThreads = omp_get_max_threads();
00420 
00421     if((sArgs.dab_flag==1 && sArgs.norm_flag==1) ||
00422         sArgs.dabset_flag==1){
00423         if(sArgs.default_type_arg!=0 && sArgs.default_type_arg!=1){
00424             fprintf(stderr, "Error, invalid default_type argument!\n");
00425             return -1;
00426         }
00427     }
00428 
00429     /* PCL mode */
00430     if(sArgs.pclbin_flag==1){
00431 
00432         if(sArgs.sinfo_flag==1){
00433             string fileName = CMeta::Basename(sArgs.pclinput_arg);
00434             string fileStem = CMeta::Deextension(fileName);
00435             char outFile[125];
00436             sprintf(outFile, "%s/%s.sinfo", sArgs.dir_out_arg, fileStem.c_str());
00437 
00438             string pclfile = sArgs.pclinput_arg;
00439             if(!CMeta::IsExtension(pclfile, ".bin")){
00440                 fprintf(stderr, "Input file is not bin type!\n");
00441                 return 1;
00442             }
00443             
00444             CPCL pcl;
00445             if(!pcl.Open(pclfile.c_str())){
00446                 fprintf(stderr, "Error opening file\n");
00447             }
00448             CMeasurePearNorm pn;
00449             CDat Dat;
00450             int numG = pcl.GetGeneNames().size();
00451 
00452             vector<int> veciGenes;
00453             veciGenes.resize(numG);
00454             for (j = 0; j < veciGenes.size(); ++j)
00455                 veciGenes[j] = j;
00456             int iOne, iTwo;
00457             Dat.Open(pcl.GetGeneNames());
00458 
00459             for (k = 0; k < Dat.GetGenes(); ++k)
00460                 for (j = (k + 1); j < Dat.GetGenes(); ++j)
00461                     Dat.Set(k, j, CMeta::GetNaN());
00462 
00463             omp_set_num_threads(8);
00464 
00465             #pragma omp parallel for \
00466             shared(pcl, Dat, veciGenes, pn) \
00467             private(k,iOne,j,iTwo) \
00468             firstprivate(numG) \
00469             schedule(dynamic)
00470             for (k = 0; k < numG; ++k) {
00471                 if ((iOne = veciGenes[k]) == -1)
00472                     continue;
00473                 float *adOne = &pcl.Get(iOne)[0];
00474                 for (j = (k + 1); j < numG; ++j){
00475                     if ((iTwo = veciGenes[j]) != -1){
00476                         float x = (float) pn.Measure(adOne,
00477                             pcl.GetExperiments(), &pcl.Get(iTwo)[0],
00478                             pcl.GetExperiments());
00479                         Dat.Set(k, j, x);
00480                         //fprintf(stderr, "%d %d %.5f\n", k, j, x);
00481                     }
00482                 }
00483             }
00484             
00485             double gmean, gstdev;
00486             size_t iN;
00487 
00488             Dat.AveStd(gmean, gstdev, iN);
00489             fprintf(stderr, "%.5f %.5f\n", (float) gmean, (float) gstdev);
00490             vector<float> vv;
00491             vv.resize(2);
00492             vv[0] = (float) gmean;
00493             vv[1] = (float) gstdev;
00494             CSeekTools::WriteArray(outFile, vv);
00495 
00496         }
00497 
00498         //if calculating gene variance per dataset
00499         else if(sArgs.gexpvarmean_flag==1){
00500             string fileName = CMeta::Basename(sArgs.pclinput_arg);
00501             string fileStem = CMeta::Deextension(fileName);
00502             char outFile[125];
00503 
00504             string pclfile = sArgs.pclinput_arg;
00505             if(!CMeta::IsExtension(pclfile, ".bin")){
00506                 fprintf(stderr, "Input file is not bin type!\n");
00507                 return 1;
00508             }
00509             
00510             CPCL pcl;
00511             if(!pcl.Open(pclfile.c_str())){
00512                 fprintf(stderr, "Error opening file\n");
00513                 return 1;
00514             }
00515 
00516             vector<float> var;
00517             vector<float> avg;
00518 
00519             CSeekTools::InitVector(var, vecstrGenes.size(), (float) CMeta::GetNaN());
00520             CSeekTools::InitVector(avg, vecstrGenes.size(), (float) CMeta::GetNaN());
00521 
00522             int totNumExperiments = pcl.GetExperiments();
00523             if(totNumExperiments<=2){
00524                 fprintf(stderr, "This dataset is skipped because it has <=2 columns\n");
00525                 fprintf(stderr, "An empty vector will be returned\n");
00526             }else{
00527                 for(j=0; j<vecstrGenes.size(); j++){
00528                     utype g = pcl.GetGene(vecstrGenes[j]);
00529                     if(CSeekTools::IsNaN(g)) continue; //gene does not exist in the dataset
00530                     float *val = pcl.Get(g);
00531                     vector<float> rowVal;
00532                     for(k=0; k<totNumExperiments; k++)
00533                         rowVal.push_back(val[k]);
00534                     float mean = 0;
00535                     float variance = 0;
00536                     for(k=0; k<rowVal.size(); k++)
00537                         mean+=rowVal[k];
00538                     mean/=rowVal.size();
00539                     for(k=0; k<rowVal.size(); k++)
00540                         variance += (rowVal[k] - mean) * (rowVal[k] - mean);
00541                     variance /= rowVal.size();
00542                     var[j] = variance;
00543                     avg[j] = mean;
00544                     //fprintf(stderr, "%.5f %.5f\n", mean, variance);
00545                 }
00546                 //fprintf(stderr, "done\n"); 
00547             }
00548             //DEBUG
00549             fprintf(stderr, "UCSC genes\tVariance\tAverage\n");
00550             for(j=0; j<vecstrGenes.size(); j++){
00551                 utype g = pcl.GetGene(vecstrGenes[j]);
00552                 if(CSeekTools::IsNaN(g)) continue;
00553                 fprintf(stderr, "%s\t%0.4f\t%.4f\n", vecstrGenes[j].c_str(), 
00554                     var[j], avg[j]);
00555             }
00556             //fprintf(stderr, "G\n"); 
00557             sprintf(outFile, "%s/%s.gexpvar", sArgs.dir_out_arg, 
00558                 fileStem.c_str());
00559             CSeekTools::WriteArray(outFile, var);
00560             sprintf(outFile, "%s/%s.gexpmean", sArgs.dir_out_arg, 
00561                 fileStem.c_str());
00562             CSeekTools::WriteArray(outFile, avg);
00563         }
00564 
00565     }
00566 
00567 
00568     /* DB mode */
00569     if(sArgs.db_flag==1){
00570 
00571         if(!sArgs.quant_arg){
00572             fprintf(stderr, "Must give quant file\n");
00573             return -1;
00574         }
00575 
00576         vector<float> quant;
00577         string strQuantFile = sArgs.quant_arg;
00578         //fprintf(stderr, "%s\n", strQuantFile.c_str());
00579         CSeekTools::ReadQuantFile(strQuantFile, quant);
00580 
00581         //fprintf(stderr, "quant %d %.5f\n", 170, quant[170]);
00582         //getchar();
00583 
00584         if(sArgs.gplat_flag==1){
00585             bool logit = false;
00586             if(sArgs.logit_flag==1) logit = true;
00587 
00588             vector<CSeekPlatform> vp;
00589             map<string, utype> mapstriPlatform;
00590             map<utype, string> mapistrPlatform;
00591             map<utype, utype> mapiPlatform;
00592             vector<string> vecstrPlatforms;
00593 
00594             vector<string> vecstrDatasets;
00595 
00596             ifsm.open(sArgs.dset_arg);
00597             i = 0;
00598             while(!pistm->eof()){
00599                 pistm->getline(acBuffer, c_iBuffer -1);
00600                 if(acBuffer[0]==0) break;
00601                 acBuffer[c_iBuffer-1] = 0;
00602                 vecstrLine.clear();
00603                 CMeta::Tokenize( acBuffer, vecstrLine );
00604                 /* read dataset name */
00605                 vecstrDatasets.push_back(vecstrLine[0]);
00606                 /* just read the platform information */
00607                 string pl = vecstrLine[1];
00608                 map< string, utype >::const_iterator iter;
00609                 iter = mapstriPlatform.find(pl);
00610                 if(iter== mapstriPlatform.end()){
00611                     utype s = mapstriPlatform.size();
00612                     mapstriPlatform[pl] = s;
00613                     mapistrPlatform[s] = pl;
00614                     vecstrPlatforms.push_back(pl);
00615                 }
00616                 utype platform_id = mapstriPlatform[pl];
00617                 mapiPlatform[i] = platform_id;
00618                 i++;
00619             }
00620             ifsm.close();
00621 
00622             vector<string> dblist;
00623             ifsm.open(sArgs.dblist_arg);
00624             i = 0;
00625             while(!pistm->eof()){
00626                 pistm->getline(acBuffer, c_iBuffer -1);
00627                 if(acBuffer[0]==0) break;
00628                 acBuffer[c_iBuffer-1] = 0;
00629                 dblist.push_back(acBuffer);
00630             }
00631             dblist.resize(dblist.size());
00632             ifsm.close();
00633 
00634             bool useNibble = false;
00635             if(sArgs.useNibble_flag==1) useNibble = true;
00636 
00637             size_t numPlatforms = mapstriPlatform.size();
00638             size_t iDatasets = vecstrDatasets.size();
00639             size_t m_iGenes = vecstrGenes.size();
00640             CFullMatrix<float> platform_avg, platform_stdev;
00641             platform_avg.Initialize(numPlatforms, m_iGenes);
00642             platform_stdev.Initialize(numPlatforms, m_iGenes);
00643 
00644             //printf("Size: %d %d\n", numPlatforms, m_iGenes); getchar();
00645 
00646             for(i=0; i<numPlatforms; i++){
00647                 for(j=0; j<m_iGenes; j++){
00648                     platform_avg.Set(i, j, CMeta::GetNaN());
00649                     platform_stdev.Set(i, j, CMeta::GetNaN());
00650                     //platform_avg.Set(i, j, (float) 0);
00651                     //platform_stdev.Set(i, j, (float) 1.0);
00652                 }
00653             }
00654 
00655             /*if(iDatasets<numThreads){
00656                 numThreads = iDatasets;
00657                 omp_set_num_threads(numThreads);
00658             }*/
00659 
00660             string strPrepInputDirectory = sArgs.dir_prep_in_arg;
00661             vector<CSeekDataset*> *vc = new vector<CSeekDataset*>[numThreads];
00662             CFullMatrix<float> *platform_avg_threads =
00663                 new CFullMatrix<float>[numThreads];
00664             CFullMatrix<float> *platform_stdev_threads=
00665                 new CFullMatrix<float>[numThreads];
00666 
00667             for(i=0; i<numThreads; i++){
00668                 InitializeDataset(iDatasets, vecstrDatasets,
00669                     strPrepInputDirectory, vc[i]);
00670                 platform_avg_threads[i].Initialize(numPlatforms, m_iGenes);
00671                 platform_stdev_threads[i].Initialize(numPlatforms, m_iGenes);
00672                 for(j=0; j<numPlatforms; j++){
00673                     for(k=0; k<m_iGenes; k++){
00674                         platform_avg_threads[i].Set(j, k, CMeta::GetNaN());
00675                         platform_stdev_threads[i].Set(j, k, CMeta::GetNaN());
00676                         //platform_avg_threads[i].Set(j, k, (float) 0);
00677                         //platform_stdev_threads[i].Set(j, k, (float) 1.0);
00678                     }
00679                 }
00680             }
00681 
00682             //printf("Dataset initialized"); getchar();
00683             vector<string> vecstrQuery;
00684 
00685             //#pragma omp parallel for \
00686             shared(vc, dblist, iDatasets, m_iGenes, vecstrGenes, mapiPlatform, quant, \
00687             platform_avg_threads, platform_stdev_threads, vecstrQuery, logit) \
00688             private(i) firstprivate(useNibble) schedule(dynamic)
00689             for(i=0; i<dblist.size(); i++){
00690                 int tid = omp_get_thread_num();
00691                 string DBFile = dblist[i];
00692                 fprintf(stderr, "opening db file %s\n", DBFile.c_str());
00693                 OpenDB(DBFile, useNibble, iDatasets, m_iGenes,
00694                 vecstrGenes, mapiPlatform, quant, vc[tid],
00695                 platform_avg_threads[tid], platform_stdev_threads[tid],
00696                 vecstrQuery, logit);
00697                 fprintf(stderr, "finished opening db file %s\n",
00698                     DBFile.c_str());
00699             }
00700 
00701             for(i=0; i<numThreads; i++){
00702                 for(j=0; j<numPlatforms; j++){
00703                     for(k=0; k<m_iGenes; k++){
00704                         float ca = platform_avg_threads[i].Get(j, k);
00705                         float cs = platform_stdev_threads[i].Get(j, k);
00706 
00707                         if(ca==CMeta::GetNaN() || cs==CMeta::GetNaN()){
00708                             continue;
00709                         }
00710                         platform_avg.Set(j, k, ca);
00711                         platform_stdev.Set(j, k, cs);
00712                     }
00713                 }
00714             }
00715 
00716             /*
00717             for(i=0; i<numPlatforms; i++){
00718                 printf("Platform %s\n", mapistrPlatform[i].c_str());
00719                 for(j=0; j<vecstrQuery.size(); j++){
00720                     size_t iGene = mapstriGenes[vecstrQuery[j]];
00721                     printf("Gene %s %.5f %.5f\n", vecstrQuery[j].c_str(), 
00722                         platform_avg.Get(i, iGene), platform_stdev.Get(i,iGene));
00723                 }
00724             }*/
00725 
00726             char outFile[125];
00727             sprintf(outFile, "%s/all_platforms.gplatavg", sArgs.dir_out_arg);
00728             platform_avg.Save(outFile);
00729             sprintf(outFile, "%s/all_platforms.gplatstdev", sArgs.dir_out_arg);
00730             platform_stdev.Save(outFile);
00731             sprintf(outFile, "%s/all_platforms.gplatorder", sArgs.dir_out_arg);
00732             ofstream outfile;
00733             outfile.open(outFile);
00734             for(i=0; i<vecstrPlatforms.size(); i++){
00735                 outfile << vecstrPlatforms[i] << "\n";
00736             }
00737             outfile.close();
00738         }
00739 
00740     } else if(sArgs.dab_flag==1){
00741         
00742         string norm_mode = sArgs.norm_mode_arg;
00743         if(sArgs.norm_flag==1 && norm_mode=="rank"){
00744             if(sArgs.default_type_arg==-1){
00745                 fprintf(stderr, "Please supply parameter --default_type\n");
00746                 return 1;
00747             }
00748             if(sArgs.max_rank_arg==-1){
00749                 fprintf(stderr, "Please supply parameter --max_rank\n");
00750                 return 1;
00751             }
00752 
00753             CDataPair Dat;
00754             char outFile[1024];
00755             fprintf(stderr, "Opening file...\n");
00756             //if(!Dat.Open(sArgs.dabinput_arg, false, 2, false, false, true)){
00757             if(!Dat.Open(sArgs.dabinput_arg, false, false, 2, false, false)){
00758                 cerr << "error opening file" << endl;
00759                 return 1;
00760             }
00761             fprintf(stderr, "Finished opening file\n");
00762             string fileName = CMeta::Basename(sArgs.dabinput_arg);
00763             string fileStem = CMeta::Deextension(fileName);
00764             sprintf(outFile, "%s/%s.2.dab", sArgs.dir_out_arg,
00765                 fileStem.c_str());
00766             int max_rank = sArgs.max_rank_arg;
00767             fprintf(stderr, "Using max_rank: %d\n", max_rank);
00768             //cutoff, expTransform, divideNorm, subtractNorm
00769             //CSeekWriter::NormalizeDAB(Dat, vecstrGenes, true, false, true, false);
00770             //CSeekWriter::RankNormalizeDAB(Dat, vecstrGenes, max_rank, rbp_p);
00771             //Dat.Save(outFile);
00772             if(sArgs.default_type_arg==0){
00773                 vector<map<utype,unsigned short> > umat;
00774                 CSeekWriter::GetSparseRankMatrix<utype>(Dat, umat, max_rank, 
00775                     vecstrGenes);
00776                 CSeekWriter::WriteSparseMatrix<utype>(Dat, umat, 
00777                     vecstrGenes, outFile);
00778             }else if(sArgs.default_type_arg==1){
00779                 vector<map<unsigned short,unsigned short> > umat;
00780                 CSeekWriter::GetSparseRankMatrix<unsigned short>(Dat, umat, max_rank, 
00781                     vecstrGenes);
00782                 CSeekWriter::WriteSparseMatrix<unsigned short>(Dat, umat, 
00783                     vecstrGenes, outFile);
00784             }else{
00785                 fprintf(stderr, "Invalid default type!\n");
00786                 return -1;
00787             }
00788 
00789             //TEST========================================
00790             /*CSeekIntIntMap d1(vecstrGenes.size());
00791             CSparseFlatMatrix<float> sm(0);
00792             float rbp_p = sArgs.rbp_p_arg;
00793             CSeekWriter::ReadSeekSparseMatrix<unsigned short>(outFile, sm, d1, 
00794             max_rank, rbp_p, vecstrGenes);*/
00795             //============================================
00796 
00797             /*fprintf(stderr, "Begin\n");
00798             vector<unsigned short> l;
00799             vector<vector<float> > mat;
00800             CSeekWriter::ReadSparseMatrixAsArray(l, outFile);
00801             fprintf(stderr, "Begin 2\n");
00802             CSeekWriter::ReadSparseMatrix(l, mat, 0.99, vecstrGenes);*/
00803         }
00804 
00805         if(sArgs.view_flag==1){
00806             //fprintf(stderr, "Operation not implemented yet!\n");
00807             CDataPair Dat;
00808             char outFile[125];
00809             if(!Dat.Open(sArgs.dabinput_arg, false, false)){
00810                 cerr << "error opening file" << endl;
00811                 return 1;
00812             }
00813             vector<unsigned int> veciGenes;
00814             veciGenes.resize(vecstrGenes.size());
00815             for(i=0; i<vecstrGenes.size(); i++)
00816                 veciGenes[i] = (unsigned int) Dat.GetGeneIndex(vecstrGenes[i]);
00817 
00818             unsigned int s,t;
00819             vector<float> d;
00820             for(i=0; i<vecstrGenes.size(); i++){
00821                 if((s=veciGenes[i])==(unsigned int)-1) continue;
00822                 for(j=0; j<vecstrGenes.size(); j++){
00823                     if((t=veciGenes[j])==(unsigned int) -1) continue;
00824                     d.push_back(Dat.Get(s, t));
00825                 }
00826             }
00827             sort(d.begin(), d.end());
00828             float ff[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9};
00829             for(i=0; i<9; i++){
00830                 size_t ind = (size_t) ((float) d.size() * ff[i]);
00831                 fprintf(stderr, "%.3f %.3e\n", ff[i], d[ind]);
00832             }
00833             return 1;
00834         }
00835 
00836         if(sArgs.norm_flag==1 && norm_mode=="subtract_z"){
00837             if(sArgs.cutoff_value_arg==-1.0){
00838                 fprintf(stderr, "Please supply parameter --cutoff_value\n");
00839                 return 1;
00840             }
00841             if(sArgs.default_type_arg==-1){
00842                 fprintf(stderr, "Please supply parameter --default_type\n");
00843                 return 1;
00844             }
00845 
00846             CDataPair Dat;
00847             char outFile[125];
00848             if(!Dat.Open(sArgs.dabinput_arg, false, false)){
00849                 cerr << "error opening file" << endl;
00850                 return 1;
00851             }
00852             fprintf(stderr, "Finished opening file\n");
00853             string fileName = CMeta::Basename(sArgs.dabinput_arg);
00854             string fileStem = CMeta::Deextension(fileName);
00855             sprintf(outFile, "%s/%s.2.dab", sArgs.dir_out_arg, fileStem.c_str());
00856             //expTransform, divideNorm, subtractNorm
00857             CSeekWriter::NormalizeDAB(Dat, vecstrGenes, false, false, true);
00858             float cutoff = sArgs.cutoff_value_arg;
00859             if(sArgs.default_type_arg==0){
00860                 //unsigned int
00861                 vector<map<utype,unsigned short> > umat;
00862                 CSeekWriter::ConvertToSparseMatrix<utype>(Dat, umat, vecstrGenes, 
00863                 cutoff);
00864                 CSeekWriter::WriteSparseMatrix<utype>(Dat, umat, vecstrGenes, 
00865                 outFile);
00866             }else if(sArgs.default_type_arg==1){
00867                 //unsigned short
00868                 vector<map<unsigned short,unsigned short> > umat;
00869                 CSeekWriter::ConvertToSparseMatrix<unsigned short>(Dat, umat, vecstrGenes, 
00870                 cutoff);
00871                 CSeekWriter::WriteSparseMatrix<unsigned short>(Dat, umat, vecstrGenes, 
00872                 outFile);
00873             }
00874             else{
00875                 fprintf(stderr, "Error, unsupported type --default_type_arg\n");
00876                 return 1;
00877             }       
00878         }
00879 
00880         if(sArgs.norm_flag==1 && norm_mode=="topological_overlap"){
00881             omp_set_num_threads(8);
00882             CDataPair Dat;
00883             char outFile[125];
00884             if(!Dat.Open(sArgs.dabinput_arg, false, false)){
00885                 cerr << "error opening file" << endl;
00886                 return 1;
00887             }
00888             fprintf(stderr, "Finished opening file\n");
00889             string fileName = CMeta::Basename(sArgs.dabinput_arg);
00890             string fileStem = CMeta::Deextension(fileName);
00891             sprintf(outFile, "%s/%s.dab", sArgs.dir_out_arg, fileStem.c_str());
00892             CSeekWriter::TopologicalOverlap(Dat, vecstrGenes);
00893             Dat.Save(outFile);
00894         }
00895 
00896         if(sArgs.gavg_flag==1){
00897             bool logit = false;
00898             if(sArgs.logit_flag==1) logit = true;
00899 
00900             CDataPair Dat;
00901             char outFile[125];
00902             if(!Dat.Open(sArgs.dabinput_arg, false, false)){
00903                 cerr << "error opening file" << endl;
00904                 return 1;
00905             }
00906             vector<float> vecGeneAvg;
00907             string fileName = CMeta::Basename(sArgs.dabinput_arg);
00908             string fileStem = CMeta::Deextension(fileName);
00909             sprintf(outFile, "%s/%s.gavg", sArgs.dir_out_arg,
00910                 fileStem.c_str());
00911             CSeekWriter::GetGeneAverage(Dat, vecstrGenes, vecGeneAvg, logit, 
00912                 sArgs.top_avg_percent_arg);
00913 
00914             //DEBUGGING
00915             //for(i=0; i<vecGeneAvg.size(); i++){
00916             //  fprintf(stderr, "%s\t%.3f\n", vecstrGenes[i].c_str(), vecGeneAvg[i]);
00917             //}
00918 
00919             CSeekTools::WriteArray(outFile, vecGeneAvg);
00920         }
00921 
00922         else if(sArgs.gpres_flag==1){
00923             CDataPair Dat;
00924             char outFile[125];
00925             if(!Dat.Open(sArgs.dabinput_arg, false, false)){
00926                 cerr << "error opening file" << endl;
00927                 return 1;
00928             }
00929             vector<char> vecGenePresence;
00930             string fileName = CMeta::Basename(sArgs.dabinput_arg);
00931             string fileStem = CMeta::Deextension(fileName);
00932             sprintf(outFile, "%s/%s.gpres", sArgs.dir_out_arg,
00933                 fileStem.c_str());
00934             CSeekWriter::GetGenePresence(Dat, vecstrGenes, vecGenePresence);
00935 
00936             //DEBUGGING
00937             //for(i=0; i<vecGenePresence.size(); i++){
00938             //  fprintf(stderr, "%s\t%d\n", vecstrGenes[i].c_str(), vecGenePresence[i]);
00939             //}
00940 
00941             CSeekTools::WriteArray(outFile, vecGenePresence);
00942         }
00943 
00944     }
00945 
00946     if(sArgs.dabset_flag==1){
00947         NormMode n;
00948         string norm_mode = sArgs.norm_mode_arg;
00949         if(norm_mode=="NA"){
00950             fprintf(stderr, "Error, please supply --norm_mode\n");
00951             return 1;
00952         }
00953         if(norm_mode=="subtract_z"){
00954             n = Z_NORM;
00955             if(sArgs.exp_arg==-1){
00956                 fprintf(stderr, "Error, please supply --exp\n");
00957                 return 1;
00958             }
00959         }else if(norm_mode=="rank"){
00960             n = RANK_NORM;
00961             if(sArgs.rbp_p_arg==-1 || sArgs.max_rank_arg==-1){
00962                 fprintf(stderr, "Error, Need both --rbp_p and --max_rank\n");
00963                 return 1;
00964             }
00965         }
00966 
00967         vector<string> dab_list;
00968         int numGenes = vecstrGenes.size();
00969         string dab_dir = sArgs.dab_dir_arg;
00970         string outdir = sArgs.dir_out_arg;
00971         string outdab = sArgs.out_dab_arg;
00972         if(dab_dir=="NA" || outdab=="NA"){
00973             fprintf(stderr, "Required args: dab_dir, out_dab");
00974             return -1;
00975         }
00976         string dw = sArgs.dataset_w_arg;
00977             
00978         CSeekTools::ReadListOneColumn(sArgs.dablist_arg, dab_list);
00979 
00980         vector<float> dweight;
00981         if(dw=="NA"){
00982             CSeekTools::InitVector(dweight, dab_list.size(), (float) 1.0);
00983         }else{
00984             CSeekTools::ReadArray(dw.c_str(), dweight);
00985             if(dweight.size()!=dab_list.size()){
00986                 fprintf(stderr, "Incorrect Size!\n");
00987                 return -1;
00988             }
00989         }
00990 
00991         float rbp_p = sArgs.rbp_p_arg;
00992         int max_rank = sArgs.max_rank_arg;
00993         float exp = sArgs.exp_arg;
00994         fprintf(stderr, "Using rbp_p: %.3f, max_rank: %d, exp: %.3f\n", rbp_p, max_rank, exp);
00995 
00996         if(sArgs.default_type_arg==0){
00997             CalculateMatrix<utype>(n, GENE_PRESENCE_VECTOR, dab_list, dab_dir, outdir, 
00998                 outdab, vecstrGenes, dweight, rbp_p, max_rank, exp);    
00999             CalculateMatrix<utype>(n, COUNT_MATRIX, dab_list, dab_dir, outdir, outdab,
01000                 vecstrGenes, dweight, rbp_p, max_rank, exp);    
01001             //CalculateMatrix<utype>(n, WEIGHTSUM_MATRIX, dab_list, dab_dir, outdir, outdab,
01002                 //vecstrGenes, dweight, rbp_p, max_rank, exp);  
01003             CalculateMatrix<utype>(n, DISTANCE_MATRIX, dab_list, dab_dir, outdir, outdab,
01004                 vecstrGenes, dweight, rbp_p, max_rank, exp);    
01005         }else if(sArgs.default_type_arg==1){
01006             CalculateMatrix<unsigned short>(n, GENE_PRESENCE_VECTOR, dab_list, dab_dir, outdir, 
01007                 outdab, vecstrGenes, dweight, rbp_p, max_rank, exp);    
01008             CalculateMatrix<unsigned short>(n, COUNT_MATRIX, dab_list, dab_dir, outdir, outdab,
01009                 vecstrGenes, dweight, rbp_p, max_rank, exp);    
01010             //CalculateMatrix<unsigned short>(n, WEIGHTSUM_MATRIX, dab_list, dab_dir, outdir, outdab,
01011                 //vecstrGenes, dweight, rbp_p, max_rank);   
01012             CalculateMatrix<unsigned short>(n, DISTANCE_MATRIX, dab_list, dab_dir, outdir, outdab,
01013                 vecstrGenes, dweight, rbp_p, max_rank, exp);    
01014         }else{
01015             fprintf(stderr, "Invalid default type!\n");
01016             return -1;
01017         }
01018     }
01019 
01020     if(sArgs.combined_dab_flag==1){
01021         utype numGenes = vecstrGenes.size();
01022         string dab_base = sArgs.dab_basename_arg;
01023         string dab_dir = sArgs.dab_dir2_arg;
01024         string outdir = sArgs.dir_out_arg;
01025 
01026         if(dab_base=="NA" || dab_dir=="NA"){
01027             fprintf(stderr, "Required args: dab_dir2, dab_basename");
01028             return -1;
01029         }
01030 
01031         string file1 = dab_dir + "/" + dab_base + ".half";
01032         string file2 = dab_dir + "/" + dab_base + ".pair_count";
01033         string file3 = dab_dir + "/" + dab_base + ".gene_count";
01034         string file4 = dab_dir + "/" + dab_base + ".pair_weightsum"; //optional
01035 
01036         vector<utype> gene_count;
01037         CSeekTools::ReadArray(file3.c_str(), gene_count);
01038 
01039         //normalize by counts
01040         float max_count = (float) *max_element(gene_count.begin(), gene_count.end());
01041         float min_count_required = max_count * 0.50;
01042         CSeekIntIntMap d1(vecstrGenes.size());
01043         vector<string> validGenes;
01044         int numValidGenes=0;
01045         for(i=0; i<vecstrGenes.size(); i++){
01046             if(gene_count[i]<min_count_required) continue;
01047             numValidGenes++;
01048             validGenes.push_back(vecstrGenes[i]);
01049             d1.Add(i);
01050         }
01051 
01052         fprintf(stderr, "%.2f %.2f %d\n", max_count, min_count_required, 
01053         numValidGenes);
01054 
01055         uint32_t dim;
01056         ifstream istm1, istm2, istm3;
01057         istm1.open(file1.c_str(), ios_base::binary);
01058         istm1.read((char*)(&dim), sizeof(dim));
01059         istm2.open(file2.c_str(), ios_base::binary);
01060         istm2.read((char*)(&dim), sizeof(dim));
01061         istm3.open(file4.c_str(), ios_base::binary); //test if file3 exists
01062         bool enableWeight = false;
01063         if(!istm3.fail()){
01064             istm3.read((char*)(&dim), sizeof(dim)); //read dimension if exists
01065             enableWeight = true;
01066         }
01067 
01068         /*CHalfMatrix<float> res;
01069         res.Initialize(dim);
01070         if(dim!=numGenes){
01071             fprintf(stderr, "Error in number of dimensions\n");
01072             return -1;
01073         }*/
01074 
01075         fprintf(stderr, "Begin Reading\n");
01076         if(enableWeight){
01077             fprintf(stderr, "Enable weight: True\n");
01078         }else{
01079             fprintf(stderr, "Enable weight: False\n");
01080         }
01081         CDat CD;
01082         CD.Open(validGenes);
01083         for(i=0; i<validGenes.size(); i++)
01084             for(j=i+1; j<validGenes.size(); j++)
01085                 CD.Set(i,j,CMeta::GetNaN());
01086 
01087         float *adScores = new float[dim-1];
01088         unsigned short *uCounts = new unsigned short[dim-1];
01089         float *fSum = new float[dim-1];
01090 
01091         for(i=0; (i+1)<dim; i++){
01092             if(i%2000==0){
01093                 fprintf(stderr, "Finished %d\n", i);
01094             }
01095             istm1.read((char*)adScores, sizeof(*adScores) * (dim-i-1));
01096             istm2.read((char*)uCounts, sizeof(*uCounts) * (dim-i-1));
01097             if(enableWeight)
01098                 istm3.read((char*)fSum, sizeof(*fSum) * (dim-i-1));
01099 
01100             utype gi = d1.GetForward(i);
01101             if(CSeekTools::IsNaN(gi))
01102                 continue;
01103             for(j=0; j<dim-i-1; j++){
01104                 //reverse of HalfIndex function from halfmatrixi.h
01105                 //to get the original coordinate for j:
01106                 //original j = new j + 1 + i
01107                 utype gj = d1.GetForward(j+i+1);
01108                 if(CSeekTools::IsNaN(gj))
01109                     continue;
01110                 if((float) uCounts[j]<min_count_required)
01111                     continue;
01112                 if(enableWeight)
01113                     adScores[j] /= fSum[j];
01114                 else
01115                     adScores[j] /= (float) uCounts[j];
01116                 CD.Set(gi, gj, adScores[j]);
01117             }
01118             //res.Set(i, adScores);
01119         }
01120         delete[] adScores;
01121         delete[] uCounts;
01122         delete[] fSum;
01123         istm1.close();
01124         istm2.close();
01125         if(enableWeight)
01126             istm3.close();
01127         string file_out = outdir + "/" + dab_base + ".dab";
01128         CD.Save(file_out.c_str());
01129 
01130     }
01131 
01132 #ifdef WIN32
01133     pthread_win32_process_detach_np( );
01134 #endif // WIN32
01135     return 0; }