Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "stdafx.h" 00023 #include "cmdline.h" 00024 00025 enum ExportMode{ 00026 DISTANCE_MATRIX, COUNT_MATRIX, WEIGHTSUM_MATRIX, GENE_PRESENCE_VECTOR 00027 }; 00028 00029 enum NormMode{ 00030 RANK_NORM, Z_NORM 00031 }; 00032 00033 //tType can only be unsigned short or utype 00034 //norm_mode = 0 (rank norm), 1 (subtract_z norm) 00035 template<class tType> 00036 bool CalculateMatrix(const NormMode norm_mode, 00037 const ExportMode e, const vector<string> &dab_list, 00038 const string &dab_dir, const string &outdir, const string &outdab, 00039 const vector<string> &vecstrGenes, const vector<float> &w, const float rbp_p, 00040 const int MAX_RANK, const float exp){ //w=weight of dsets 00041 //rbp_p, MAX_RANK for rank-based normalize 00042 //exp for value-based or subtract_z normalize 00043 const int RANK_NORM = 0; 00044 const int Z_NORM = 1; 00045 00046 size_t i, j, k; 00047 00048 if(e==GENE_PRESENCE_VECTOR){ 00049 vector<utype> gpres; 00050 CSeekTools::InitVector(gpres, vecstrGenes.size(), (utype) 0); 00051 for(i=0; i<dab_list.size(); i++){ 00052 fprintf(stderr, "Reading %d of %d: %s\n", i, dab_list.size(), 00053 dab_list[i].c_str()); 00054 CSeekIntIntMap d1(vecstrGenes.size()); 00055 string dabfile = dab_dir + "/" + dab_list[i]; 00056 CSeekWriter::ReadSeekSparseMatrixHeader<tType>(dabfile.c_str(), d1); 00057 const vector<utype> &allR = d1.GetAllReverse(); 00058 for(j=0; j<d1.GetNumSet(); j++){ 00059 utype gi = allR[j]; 00060 gpres[gi]++; 00061 } 00062 } 00063 string ofile = outdir+"/"+outdab+".gene_count"; 00064 CSeekTools::WriteArray(ofile.c_str(), gpres); 00065 00066 }else if(e==DISTANCE_MATRIX){ 00067 CHalfMatrix<float> res; 00068 res.Initialize(vecstrGenes.size()); 00069 for(i=0; i<vecstrGenes.size(); i++) 00070 for(j=i+1; j<vecstrGenes.size(); j++) 00071 res.Set(i, j, 0); 00072 00073 for(i=0; i<dab_list.size(); i++){ 00074 fprintf(stderr, "Reading %d of %d: %s\n", i, dab_list.size(), 00075 dab_list[i].c_str()); 00076 CSeekIntIntMap d1(vecstrGenes.size()); 00077 string dabfile = dab_dir + "/" + dab_list[i]; 00078 CSparseFlatMatrix<float> sm (0); 00079 if(norm_mode==RANK_NORM){ 00080 CSeekWriter::ReadSeekSparseMatrix<tType>(dabfile.c_str(), sm, d1, 00081 MAX_RANK, rbp_p, vecstrGenes); 00082 //NEW 00083 //CSeekWriter::RemoveDominant<tType>(sm, d1, vecstrGenes); 00084 //CSeekWriter::RemoveDominant<tType>(sm, d1, vecstrGenes); 00085 }else if(norm_mode==Z_NORM){ 00086 CSeekWriter::ReadSeekSparseMatrix<tType>(dabfile.c_str(), sm, d1, 00087 vecstrGenes, (int) (0.10*vecstrGenes.size()), exp); 00088 } 00089 fprintf(stderr, "Summing...\n"); 00090 CSeekWriter::SumSparseMatrix(sm, res, d1, w[i]); 00091 fprintf(stderr, "Finished Summing...\n"); 00092 } 00093 00094 /*for(i=0; i<vecstrGenes.size(); i++){ 00095 for(j=i+1; j<vecstrGenes.size(); j++){ 00096 float v = res.Get(i,j); 00097 if(CMeta::IsNaN(v)){ 00098 fprintf(stderr, "Error, %d %d is nan\n", i,j); 00099 } 00100 } 00101 }*/ 00102 00103 ofstream ofsm; 00104 string ofile = outdir + "/" + outdab + ".half"; 00105 ofsm.open(ofile.c_str(), ios_base::binary); 00106 res.Save(ofsm, true); 00107 ofsm.close(); 00108 00109 }else if(e==COUNT_MATRIX){ 00110 CHalfMatrix<unsigned short> res; 00111 res.Initialize(vecstrGenes.size()); 00112 for(i=0; i<vecstrGenes.size(); i++) 00113 for(j=i+1; j<vecstrGenes.size(); j++) 00114 res.Set(i, j, 0); 00115 00116 for(i=0; i<dab_list.size(); i++){ 00117 fprintf(stderr, "Reading %d of %d: %s\n", i, dab_list.size(), 00118 dab_list[i].c_str()); 00119 CSeekIntIntMap d1(vecstrGenes.size()); 00120 string dabfile = dab_dir + "/" + dab_list[i]; 00121 CSeekWriter::ReadSeekSparseMatrixHeader<tType>(dabfile.c_str(), d1); 00122 const vector<utype> &allR = d1.GetAllReverse(); 00123 for(j=0; j<d1.GetNumSet(); j++){ 00124 utype gi = allR[j]; 00125 for(k=j+1; k<d1.GetNumSet(); k++){ 00126 utype gj = allR[k]; 00127 res.Set(gi, gj, res.Get(gi, gj)+1); 00128 } 00129 } 00130 } 00131 ofstream ofsm; 00132 string ofile = outdir + "/" + outdab + ".pair_count"; 00133 ofsm.open(ofile.c_str(), ios_base::binary); 00134 res.Save(ofsm, true); 00135 ofsm.close(); 00136 00137 }else if(e==WEIGHTSUM_MATRIX){ 00138 CHalfMatrix<float> res; 00139 res.Initialize(vecstrGenes.size()); 00140 for(i=0; i<vecstrGenes.size(); i++) 00141 for(j=i+1; j<vecstrGenes.size(); j++) 00142 res.Set(i, j, 0); 00143 00144 for(i=0; i<dab_list.size(); i++){ 00145 fprintf(stderr, "Reading %d of %d: %s\n", i, dab_list.size(), 00146 dab_list[i].c_str()); 00147 CSeekIntIntMap d1(vecstrGenes.size()); 00148 string dabfile = dab_dir + "/" + dab_list[i]; 00149 CSeekWriter::ReadSeekSparseMatrixHeader<tType>(dabfile.c_str(), d1); 00150 const vector<utype> &allR = d1.GetAllReverse(); 00151 for(j=0; j<d1.GetNumSet(); j++){ 00152 utype gi = allR[j]; 00153 for(k=j+1; k<d1.GetNumSet(); k++){ 00154 utype gj = allR[k]; 00155 res.Set(gi, gj, res.Get(gi, gj) + w[i]); 00156 } 00157 } 00158 } 00159 ofstream ofsm; 00160 string ofile = outdir + "/" + outdab + ".pair_weightsum"; 00161 ofsm.open(ofile.c_str(), ios_base::binary); 00162 res.Save(ofsm, true); 00163 ofsm.close(); 00164 } 00165 else{ 00166 fprintf(stderr, "Unrecognized mode!\n"); 00167 return false; 00168 } 00169 return true; 00170 } 00171 00172 00173 bool InitializeDataset(size_t &iDatasets, vector<string> &vecstrDatasets, 00174 string &strPrepInputDirectory, vector<CSeekDataset*> &vc){ 00175 vc.clear(); 00176 vc.resize(iDatasets); 00177 size_t i, j, k; 00178 00179 for(i=0; i<iDatasets; i++){ 00180 vc[i] = new CSeekDataset(); 00181 string strFileStem = vecstrDatasets[i]; 00182 string strAvgPath = strPrepInputDirectory + "/" + strFileStem 00183 + ".gavg"; 00184 string strPresencePath = strPrepInputDirectory + "/" + strFileStem 00185 + ".gpres"; 00186 vc[i]->ReadGeneAverage(strAvgPath); 00187 vc[i]->ReadGenePresence(strPresencePath); 00188 vc[i]->InitializeGeneMap(); 00189 00190 //DEBUGGING============== 00191 /* 00192 fprintf(stderr, "==Dataset %d==\n", i); 00193 CSeekIntIntMap *mapG = vc[i]->GetGeneMap(); 00194 const vector<utype> &allR = mapG->GetAllReverse(); 00195 for(j=0; j<mapG->GetNumSet(); j++){ 00196 utype ja = mapG->GetReverse(j); 00197 fprintf(stderr, "%d\t%.3f\n", ja, vc[i]->GetGeneAverage(ja)); 00198 }*/ 00199 } 00200 return true; 00201 } 00202 00203 00204 bool InitializeDB(size_t &iDatasets, size_t &iGenes, 00205 vector<string> &vecstrGenes, vector<CSeekDataset*> &vc, CDatabaselet &DBL){ 00206 00207 utype i,j,k; 00208 vector<char> cQuery; 00209 CSeekTools::InitVector(cQuery, iGenes, (char) 0); 00210 vector<string> allQuery; 00211 map<string, size_t> mapstrGenes; 00212 for(i=0; i<vecstrGenes.size(); i++){ 00213 mapstrGenes[vecstrGenes[i]] = i; 00214 } 00215 00216 /* Databaselet mapping */ 00217 map<string, size_t> dbmap; 00218 vector<utype> veciQuery; 00219 for(i=0; i<DBL.GetGenes(); i++){ 00220 string strQuery = DBL.GetGene(i); 00221 dbmap[strQuery] = i; 00222 allQuery.push_back(strQuery); 00223 /* global mapping */ 00224 cQuery[mapstrGenes[strQuery]] = 1; 00225 veciQuery.push_back((utype) mapstrGenes[strQuery]); 00226 } 00227 00228 //fprintf(stderr, "Start initializing query map...\n"); 00229 for(i=0; i<iDatasets; i++){ 00230 vc[i]->InitializeQueryBlock(veciQuery); 00231 } 00232 00233 //fprintf(stderr, "Finished initializing map\n"); 00234 00235 //fprintf(stderr, "Start making gene-centric...\n"); 00236 for(i=0; i<DBL.GetGenes(); i++){ 00237 vector<unsigned char> Q; 00238 /* expanded */ 00239 DBL.Get(i, Q); 00240 utype m = mapstrGenes[DBL.GetGene(i)]; 00241 CSeekIntIntMap *qu = NULL; 00242 utype db; 00243 for(j=0; j<iDatasets; j++){ 00244 if((qu=vc[j]->GetDBMap())==NULL) continue; 00245 if(CSeekTools::IsNaN(db=qu->GetForward(m))) continue; 00246 unsigned char **r = vc[j]->GetMatrix(); 00247 for(k=0; k<iGenes; k++){ 00248 unsigned char c = Q[k*iDatasets + j]; 00249 r[db][k] = c; 00250 } 00251 } 00252 } 00253 00254 //fprintf(stderr, "Finished making gene-centric\n"); 00255 00256 return true; 00257 } 00258 00259 bool OpenDBFiles(string &DBFile, vector<unsigned char *> &cc, bool &useNibble){ 00260 CDatabaselet *CD; 00261 (CD = new CDatabaselet(useNibble))->Open(DBFile); 00262 cc.push_back(CD->GetCharImage()); 00263 return true; 00264 } 00265 00266 00267 bool OpenDB(string &DBFile, bool &useNibble, size_t &iDatasets, 00268 size_t &m_iGenes, vector<string> &vecstrGenes, 00269 map<utype, utype> &mapiPlatform, const vector<float> &quant, 00270 vector<CSeekDataset*> &vc, CFullMatrix<float> &platform_avg, 00271 CFullMatrix<float> &platform_stdev, vector<string> &vecstrQuery, 00272 const bool &logit){ 00273 00274 size_t i, j, jj, k; 00275 00276 CDatabaselet CD(useNibble); 00277 CD.Open(DBFile); 00278 InitializeDB(iDatasets, m_iGenes, vecstrGenes, vc, CD); 00279 00280 vector<string> presGenes; 00281 for(i=0; i<CD.GetGenes(); i++) 00282 presGenes.push_back(CD.GetGene(i)); 00283 00284 size_t numPlatforms = platform_avg.GetRows(); 00285 map<string, size_t> mapstriGenes; 00286 for(i=0; i<vecstrGenes.size(); i++) 00287 mapstriGenes[vecstrGenes[i]] = i; 00288 00289 //fprintf(stderr, "Start calculating platform average\n"); 00290 for(i=0; i<CD.GetGenes(); i++){ 00291 vector<float> sum, sq_sum, mean, stdev; 00292 vector<int> num; 00293 sum.resize(numPlatforms); 00294 sq_sum.resize(numPlatforms); 00295 mean.resize(numPlatforms); 00296 stdev.resize(numPlatforms); 00297 num.resize(numPlatforms); 00298 00299 for(k=0; k<numPlatforms; k++){ 00300 sum[k] = 0; 00301 sq_sum[k] = 0; 00302 mean[k] = 0; 00303 stdev[k] = 0; 00304 num[k] = 0; 00305 } 00306 00307 string thisGene = CD.GetGene(i); 00308 size_t geneID = mapstriGenes[thisGene]; 00309 vecstrQuery.push_back(thisGene); 00310 for(k=0; k<iDatasets; k++){ 00311 CSeekIntIntMap *mapQ = vc[k]->GetDBMap(); 00312 CSeekIntIntMap *mapG = vc[k]->GetGeneMap(); 00313 const vector<utype> &allR = mapG->GetAllReverse(); 00314 if(mapQ==NULL) continue; 00315 unsigned char **f = vc[k]->GetMatrix(); 00316 utype iQ = mapQ->GetForward(geneID); 00317 if(CSeekTools::IsNaN(iQ)){ 00318 continue; 00319 } 00320 utype platform_id = mapiPlatform[k]; 00321 if(platform_id>=(utype) numPlatforms){ 00322 printf("Error, platforms are equal %d %d", 00323 (int) platform_id, (int) numPlatforms); getchar(); 00324 } 00325 00326 for(jj=0; jj<mapG->GetNumSet(); jj++){ 00327 j = mapG->GetReverse(jj); 00328 unsigned char uc = f[iQ][j]; 00329 float v = 0; 00330 if(uc==255) v = CMeta::GetNaN(); 00331 else{ 00332 float vv = -1; 00333 if(logit) vv = log(quant[uc]) - log((float)(1.0-quant[uc])); 00334 else vv = quant[uc]; 00335 v = vv - vc[k]->GetGeneAverage(j); 00336 //fprintf(stderr, "%.5f\t%.5f\n", vv, v); 00337 if(isnan(vv) || isinf(vv) || isnan(vc[k]->GetGeneAverage(j)) || 00338 isinf(vc[k]->GetGeneAverage(j))){ 00339 fprintf(stderr, "%d D%d %.5f %.5f %d\n", (int) j, (int) k, 00340 vv, vc[k]->GetGeneAverage(j), mapG->GetForward(j)); 00341 } 00342 //v = quant[uc]; 00343 sum[platform_id] += v; 00344 num[platform_id]++; 00345 sq_sum[platform_id] += v*v; 00346 } 00347 } 00348 } 00349 00350 for(k=0; k<numPlatforms; k++){ 00351 if(num[k]==0) continue; 00352 mean[k] = sum[k] / (float) num[k]; 00353 stdev[k] = sq_sum[k] / (float) num[k] - mean[k] * mean[k]; 00354 stdev[k] = sqrt(stdev[k]); 00355 fprintf(stderr, "%d %.5f %.5f\n", geneID, mean[k], stdev[k]); 00356 platform_avg.Set(k, geneID, mean[k]); 00357 platform_stdev.Set(k, geneID, stdev[k]); 00358 } 00359 } 00360 00361 //fprintf(stderr, "Finished calculating platform average\n"); 00362 //fprintf(stderr, "Start deleting\n"); 00363 00364 for(i=0; i<iDatasets; i++) 00365 vc[i]->DeleteQueryBlock(); 00366 //fprintf(stderr, "Finished deleting\n"); 00367 return true; 00368 } 00369 00370 00371 int main( int iArgs, char** aszArgs ) { 00372 static const size_t c_iBuffer = 1024; 00373 #ifdef WIN32 00374 pthread_win32_process_attach_np( ); 00375 #endif // WIN32 00376 gengetopt_args_info sArgs; 00377 ifstream ifsm; 00378 istream* pistm; 00379 vector<string> vecstrLine, vecstrGenes, vecstrDBs, vecstrQuery; 00380 char acBuffer[ c_iBuffer ]; 00381 utype i, j, k, l; 00382 00383 if( cmdline_parser( iArgs, aszArgs, &sArgs ) ) { 00384 cmdline_parser_print_help( ); 00385 return 1; } 00386 00387 /* reading gene-mapping file */ 00388 if( sArgs.input_arg ) { 00389 ifsm.open( sArgs.input_arg ); 00390 pistm = &ifsm; } 00391 else 00392 pistm = &cin; 00393 00394 map<string, size_t> mapstriGenes; 00395 while( !pistm->eof( ) ) { 00396 pistm->getline( acBuffer, c_iBuffer - 1 ); 00397 acBuffer[ c_iBuffer - 1 ] = 0; 00398 vecstrLine.clear( ); 00399 CMeta::Tokenize( acBuffer, vecstrLine ); 00400 if( vecstrLine.size( ) < 2 ) { 00401 //cerr << "Ignoring line: " << acBuffer << endl; 00402 continue; 00403 } 00404 if( !( i = atoi( vecstrLine[ 0 ].c_str( ) ) ) ) { 00405 cerr << "Illegal gene ID: " << vecstrLine[ 0 ] << " for " 00406 << vecstrLine[ 1 ] << endl; 00407 return 1; 00408 } 00409 i--; 00410 if( vecstrGenes.size( ) <= i ) 00411 vecstrGenes.resize( i + 1 ); 00412 vecstrGenes[ i ] = vecstrLine[ 1 ]; 00413 mapstriGenes[vecstrGenes[i]] = i; 00414 } 00415 00416 if( sArgs.input_arg ) ifsm.close( ); 00417 00418 omp_set_num_threads(1); 00419 int numThreads = omp_get_max_threads(); 00420 00421 if((sArgs.dab_flag==1 && sArgs.norm_flag==1) || 00422 sArgs.dabset_flag==1){ 00423 if(sArgs.default_type_arg!=0 && sArgs.default_type_arg!=1){ 00424 fprintf(stderr, "Error, invalid default_type argument!\n"); 00425 return -1; 00426 } 00427 } 00428 00429 /* PCL mode */ 00430 if(sArgs.pclbin_flag==1){ 00431 00432 if(sArgs.sinfo_flag==1){ 00433 string fileName = CMeta::Basename(sArgs.pclinput_arg); 00434 string fileStem = CMeta::Deextension(fileName); 00435 char outFile[125]; 00436 sprintf(outFile, "%s/%s.sinfo", sArgs.dir_out_arg, fileStem.c_str()); 00437 00438 string pclfile = sArgs.pclinput_arg; 00439 if(!CMeta::IsExtension(pclfile, ".bin")){ 00440 fprintf(stderr, "Input file is not bin type!\n"); 00441 return 1; 00442 } 00443 00444 CPCL pcl; 00445 if(!pcl.Open(pclfile.c_str())){ 00446 fprintf(stderr, "Error opening file\n"); 00447 } 00448 CMeasurePearNorm pn; 00449 CDat Dat; 00450 int numG = pcl.GetGeneNames().size(); 00451 00452 vector<int> veciGenes; 00453 veciGenes.resize(numG); 00454 for (j = 0; j < veciGenes.size(); ++j) 00455 veciGenes[j] = j; 00456 int iOne, iTwo; 00457 Dat.Open(pcl.GetGeneNames()); 00458 00459 for (k = 0; k < Dat.GetGenes(); ++k) 00460 for (j = (k + 1); j < Dat.GetGenes(); ++j) 00461 Dat.Set(k, j, CMeta::GetNaN()); 00462 00463 omp_set_num_threads(8); 00464 00465 #pragma omp parallel for \ 00466 shared(pcl, Dat, veciGenes, pn) \ 00467 private(k,iOne,j,iTwo) \ 00468 firstprivate(numG) \ 00469 schedule(dynamic) 00470 for (k = 0; k < numG; ++k) { 00471 if ((iOne = veciGenes[k]) == -1) 00472 continue; 00473 float *adOne = &pcl.Get(iOne)[0]; 00474 for (j = (k + 1); j < numG; ++j){ 00475 if ((iTwo = veciGenes[j]) != -1){ 00476 float x = (float) pn.Measure(adOne, 00477 pcl.GetExperiments(), &pcl.Get(iTwo)[0], 00478 pcl.GetExperiments()); 00479 Dat.Set(k, j, x); 00480 //fprintf(stderr, "%d %d %.5f\n", k, j, x); 00481 } 00482 } 00483 } 00484 00485 double gmean, gstdev; 00486 size_t iN; 00487 00488 Dat.AveStd(gmean, gstdev, iN); 00489 fprintf(stderr, "%.5f %.5f\n", (float) gmean, (float) gstdev); 00490 vector<float> vv; 00491 vv.resize(2); 00492 vv[0] = (float) gmean; 00493 vv[1] = (float) gstdev; 00494 CSeekTools::WriteArray(outFile, vv); 00495 00496 } 00497 00498 //if calculating gene variance per dataset 00499 else if(sArgs.gexpvarmean_flag==1){ 00500 string fileName = CMeta::Basename(sArgs.pclinput_arg); 00501 string fileStem = CMeta::Deextension(fileName); 00502 char outFile[125]; 00503 00504 string pclfile = sArgs.pclinput_arg; 00505 if(!CMeta::IsExtension(pclfile, ".bin")){ 00506 fprintf(stderr, "Input file is not bin type!\n"); 00507 return 1; 00508 } 00509 00510 CPCL pcl; 00511 if(!pcl.Open(pclfile.c_str())){ 00512 fprintf(stderr, "Error opening file\n"); 00513 return 1; 00514 } 00515 00516 vector<float> var; 00517 vector<float> avg; 00518 00519 CSeekTools::InitVector(var, vecstrGenes.size(), (float) CMeta::GetNaN()); 00520 CSeekTools::InitVector(avg, vecstrGenes.size(), (float) CMeta::GetNaN()); 00521 00522 int totNumExperiments = pcl.GetExperiments(); 00523 if(totNumExperiments<=2){ 00524 fprintf(stderr, "This dataset is skipped because it has <=2 columns\n"); 00525 fprintf(stderr, "An empty vector will be returned\n"); 00526 }else{ 00527 for(j=0; j<vecstrGenes.size(); j++){ 00528 utype g = pcl.GetGene(vecstrGenes[j]); 00529 if(CSeekTools::IsNaN(g)) continue; //gene does not exist in the dataset 00530 float *val = pcl.Get(g); 00531 vector<float> rowVal; 00532 for(k=0; k<totNumExperiments; k++) 00533 rowVal.push_back(val[k]); 00534 float mean = 0; 00535 float variance = 0; 00536 for(k=0; k<rowVal.size(); k++) 00537 mean+=rowVal[k]; 00538 mean/=rowVal.size(); 00539 for(k=0; k<rowVal.size(); k++) 00540 variance += (rowVal[k] - mean) * (rowVal[k] - mean); 00541 variance /= rowVal.size(); 00542 var[j] = variance; 00543 avg[j] = mean; 00544 //fprintf(stderr, "%.5f %.5f\n", mean, variance); 00545 } 00546 //fprintf(stderr, "done\n"); 00547 } 00548 //DEBUG 00549 fprintf(stderr, "UCSC genes\tVariance\tAverage\n"); 00550 for(j=0; j<vecstrGenes.size(); j++){ 00551 utype g = pcl.GetGene(vecstrGenes[j]); 00552 if(CSeekTools::IsNaN(g)) continue; 00553 fprintf(stderr, "%s\t%0.4f\t%.4f\n", vecstrGenes[j].c_str(), 00554 var[j], avg[j]); 00555 } 00556 //fprintf(stderr, "G\n"); 00557 sprintf(outFile, "%s/%s.gexpvar", sArgs.dir_out_arg, 00558 fileStem.c_str()); 00559 CSeekTools::WriteArray(outFile, var); 00560 sprintf(outFile, "%s/%s.gexpmean", sArgs.dir_out_arg, 00561 fileStem.c_str()); 00562 CSeekTools::WriteArray(outFile, avg); 00563 } 00564 00565 } 00566 00567 00568 /* DB mode */ 00569 if(sArgs.db_flag==1){ 00570 00571 if(!sArgs.quant_arg){ 00572 fprintf(stderr, "Must give quant file\n"); 00573 return -1; 00574 } 00575 00576 vector<float> quant; 00577 string strQuantFile = sArgs.quant_arg; 00578 //fprintf(stderr, "%s\n", strQuantFile.c_str()); 00579 CSeekTools::ReadQuantFile(strQuantFile, quant); 00580 00581 //fprintf(stderr, "quant %d %.5f\n", 170, quant[170]); 00582 //getchar(); 00583 00584 if(sArgs.gplat_flag==1){ 00585 bool logit = false; 00586 if(sArgs.logit_flag==1) logit = true; 00587 00588 vector<CSeekPlatform> vp; 00589 map<string, utype> mapstriPlatform; 00590 map<utype, string> mapistrPlatform; 00591 map<utype, utype> mapiPlatform; 00592 vector<string> vecstrPlatforms; 00593 00594 vector<string> vecstrDatasets; 00595 00596 ifsm.open(sArgs.dset_arg); 00597 i = 0; 00598 while(!pistm->eof()){ 00599 pistm->getline(acBuffer, c_iBuffer -1); 00600 if(acBuffer[0]==0) break; 00601 acBuffer[c_iBuffer-1] = 0; 00602 vecstrLine.clear(); 00603 CMeta::Tokenize( acBuffer, vecstrLine ); 00604 /* read dataset name */ 00605 vecstrDatasets.push_back(vecstrLine[0]); 00606 /* just read the platform information */ 00607 string pl = vecstrLine[1]; 00608 map< string, utype >::const_iterator iter; 00609 iter = mapstriPlatform.find(pl); 00610 if(iter== mapstriPlatform.end()){ 00611 utype s = mapstriPlatform.size(); 00612 mapstriPlatform[pl] = s; 00613 mapistrPlatform[s] = pl; 00614 vecstrPlatforms.push_back(pl); 00615 } 00616 utype platform_id = mapstriPlatform[pl]; 00617 mapiPlatform[i] = platform_id; 00618 i++; 00619 } 00620 ifsm.close(); 00621 00622 vector<string> dblist; 00623 ifsm.open(sArgs.dblist_arg); 00624 i = 0; 00625 while(!pistm->eof()){ 00626 pistm->getline(acBuffer, c_iBuffer -1); 00627 if(acBuffer[0]==0) break; 00628 acBuffer[c_iBuffer-1] = 0; 00629 dblist.push_back(acBuffer); 00630 } 00631 dblist.resize(dblist.size()); 00632 ifsm.close(); 00633 00634 bool useNibble = false; 00635 if(sArgs.useNibble_flag==1) useNibble = true; 00636 00637 size_t numPlatforms = mapstriPlatform.size(); 00638 size_t iDatasets = vecstrDatasets.size(); 00639 size_t m_iGenes = vecstrGenes.size(); 00640 CFullMatrix<float> platform_avg, platform_stdev; 00641 platform_avg.Initialize(numPlatforms, m_iGenes); 00642 platform_stdev.Initialize(numPlatforms, m_iGenes); 00643 00644 //printf("Size: %d %d\n", numPlatforms, m_iGenes); getchar(); 00645 00646 for(i=0; i<numPlatforms; i++){ 00647 for(j=0; j<m_iGenes; j++){ 00648 platform_avg.Set(i, j, CMeta::GetNaN()); 00649 platform_stdev.Set(i, j, CMeta::GetNaN()); 00650 //platform_avg.Set(i, j, (float) 0); 00651 //platform_stdev.Set(i, j, (float) 1.0); 00652 } 00653 } 00654 00655 /*if(iDatasets<numThreads){ 00656 numThreads = iDatasets; 00657 omp_set_num_threads(numThreads); 00658 }*/ 00659 00660 string strPrepInputDirectory = sArgs.dir_prep_in_arg; 00661 vector<CSeekDataset*> *vc = new vector<CSeekDataset*>[numThreads]; 00662 CFullMatrix<float> *platform_avg_threads = 00663 new CFullMatrix<float>[numThreads]; 00664 CFullMatrix<float> *platform_stdev_threads= 00665 new CFullMatrix<float>[numThreads]; 00666 00667 for(i=0; i<numThreads; i++){ 00668 InitializeDataset(iDatasets, vecstrDatasets, 00669 strPrepInputDirectory, vc[i]); 00670 platform_avg_threads[i].Initialize(numPlatforms, m_iGenes); 00671 platform_stdev_threads[i].Initialize(numPlatforms, m_iGenes); 00672 for(j=0; j<numPlatforms; j++){ 00673 for(k=0; k<m_iGenes; k++){ 00674 platform_avg_threads[i].Set(j, k, CMeta::GetNaN()); 00675 platform_stdev_threads[i].Set(j, k, CMeta::GetNaN()); 00676 //platform_avg_threads[i].Set(j, k, (float) 0); 00677 //platform_stdev_threads[i].Set(j, k, (float) 1.0); 00678 } 00679 } 00680 } 00681 00682 //printf("Dataset initialized"); getchar(); 00683 vector<string> vecstrQuery; 00684 00685 //#pragma omp parallel for \ 00686 shared(vc, dblist, iDatasets, m_iGenes, vecstrGenes, mapiPlatform, quant, \ 00687 platform_avg_threads, platform_stdev_threads, vecstrQuery, logit) \ 00688 private(i) firstprivate(useNibble) schedule(dynamic) 00689 for(i=0; i<dblist.size(); i++){ 00690 int tid = omp_get_thread_num(); 00691 string DBFile = dblist[i]; 00692 fprintf(stderr, "opening db file %s\n", DBFile.c_str()); 00693 OpenDB(DBFile, useNibble, iDatasets, m_iGenes, 00694 vecstrGenes, mapiPlatform, quant, vc[tid], 00695 platform_avg_threads[tid], platform_stdev_threads[tid], 00696 vecstrQuery, logit); 00697 fprintf(stderr, "finished opening db file %s\n", 00698 DBFile.c_str()); 00699 } 00700 00701 for(i=0; i<numThreads; i++){ 00702 for(j=0; j<numPlatforms; j++){ 00703 for(k=0; k<m_iGenes; k++){ 00704 float ca = platform_avg_threads[i].Get(j, k); 00705 float cs = platform_stdev_threads[i].Get(j, k); 00706 00707 if(ca==CMeta::GetNaN() || cs==CMeta::GetNaN()){ 00708 continue; 00709 } 00710 platform_avg.Set(j, k, ca); 00711 platform_stdev.Set(j, k, cs); 00712 } 00713 } 00714 } 00715 00716 /* 00717 for(i=0; i<numPlatforms; i++){ 00718 printf("Platform %s\n", mapistrPlatform[i].c_str()); 00719 for(j=0; j<vecstrQuery.size(); j++){ 00720 size_t iGene = mapstriGenes[vecstrQuery[j]]; 00721 printf("Gene %s %.5f %.5f\n", vecstrQuery[j].c_str(), 00722 platform_avg.Get(i, iGene), platform_stdev.Get(i,iGene)); 00723 } 00724 }*/ 00725 00726 char outFile[125]; 00727 sprintf(outFile, "%s/all_platforms.gplatavg", sArgs.dir_out_arg); 00728 platform_avg.Save(outFile); 00729 sprintf(outFile, "%s/all_platforms.gplatstdev", sArgs.dir_out_arg); 00730 platform_stdev.Save(outFile); 00731 sprintf(outFile, "%s/all_platforms.gplatorder", sArgs.dir_out_arg); 00732 ofstream outfile; 00733 outfile.open(outFile); 00734 for(i=0; i<vecstrPlatforms.size(); i++){ 00735 outfile << vecstrPlatforms[i] << "\n"; 00736 } 00737 outfile.close(); 00738 } 00739 00740 } else if(sArgs.dab_flag==1){ 00741 00742 string norm_mode = sArgs.norm_mode_arg; 00743 if(sArgs.norm_flag==1 && norm_mode=="rank"){ 00744 if(sArgs.default_type_arg==-1){ 00745 fprintf(stderr, "Please supply parameter --default_type\n"); 00746 return 1; 00747 } 00748 if(sArgs.max_rank_arg==-1){ 00749 fprintf(stderr, "Please supply parameter --max_rank\n"); 00750 return 1; 00751 } 00752 00753 CDataPair Dat; 00754 char outFile[1024]; 00755 fprintf(stderr, "Opening file...\n"); 00756 //if(!Dat.Open(sArgs.dabinput_arg, false, 2, false, false, true)){ 00757 if(!Dat.Open(sArgs.dabinput_arg, false, false, 2, false, false)){ 00758 cerr << "error opening file" << endl; 00759 return 1; 00760 } 00761 fprintf(stderr, "Finished opening file\n"); 00762 string fileName = CMeta::Basename(sArgs.dabinput_arg); 00763 string fileStem = CMeta::Deextension(fileName); 00764 sprintf(outFile, "%s/%s.2.dab", sArgs.dir_out_arg, 00765 fileStem.c_str()); 00766 int max_rank = sArgs.max_rank_arg; 00767 fprintf(stderr, "Using max_rank: %d\n", max_rank); 00768 //cutoff, expTransform, divideNorm, subtractNorm 00769 //CSeekWriter::NormalizeDAB(Dat, vecstrGenes, true, false, true, false); 00770 //CSeekWriter::RankNormalizeDAB(Dat, vecstrGenes, max_rank, rbp_p); 00771 //Dat.Save(outFile); 00772 if(sArgs.default_type_arg==0){ 00773 vector<map<utype,unsigned short> > umat; 00774 CSeekWriter::GetSparseRankMatrix<utype>(Dat, umat, max_rank, 00775 vecstrGenes); 00776 CSeekWriter::WriteSparseMatrix<utype>(Dat, umat, 00777 vecstrGenes, outFile); 00778 }else if(sArgs.default_type_arg==1){ 00779 vector<map<unsigned short,unsigned short> > umat; 00780 CSeekWriter::GetSparseRankMatrix<unsigned short>(Dat, umat, max_rank, 00781 vecstrGenes); 00782 CSeekWriter::WriteSparseMatrix<unsigned short>(Dat, umat, 00783 vecstrGenes, outFile); 00784 }else{ 00785 fprintf(stderr, "Invalid default type!\n"); 00786 return -1; 00787 } 00788 00789 //TEST======================================== 00790 /*CSeekIntIntMap d1(vecstrGenes.size()); 00791 CSparseFlatMatrix<float> sm(0); 00792 float rbp_p = sArgs.rbp_p_arg; 00793 CSeekWriter::ReadSeekSparseMatrix<unsigned short>(outFile, sm, d1, 00794 max_rank, rbp_p, vecstrGenes);*/ 00795 //============================================ 00796 00797 /*fprintf(stderr, "Begin\n"); 00798 vector<unsigned short> l; 00799 vector<vector<float> > mat; 00800 CSeekWriter::ReadSparseMatrixAsArray(l, outFile); 00801 fprintf(stderr, "Begin 2\n"); 00802 CSeekWriter::ReadSparseMatrix(l, mat, 0.99, vecstrGenes);*/ 00803 } 00804 00805 if(sArgs.view_flag==1){ 00806 //fprintf(stderr, "Operation not implemented yet!\n"); 00807 CDataPair Dat; 00808 char outFile[125]; 00809 if(!Dat.Open(sArgs.dabinput_arg, false, false)){ 00810 cerr << "error opening file" << endl; 00811 return 1; 00812 } 00813 vector<unsigned int> veciGenes; 00814 veciGenes.resize(vecstrGenes.size()); 00815 for(i=0; i<vecstrGenes.size(); i++) 00816 veciGenes[i] = (unsigned int) Dat.GetGeneIndex(vecstrGenes[i]); 00817 00818 unsigned int s,t; 00819 vector<float> d; 00820 for(i=0; i<vecstrGenes.size(); i++){ 00821 if((s=veciGenes[i])==(unsigned int)-1) continue; 00822 for(j=0; j<vecstrGenes.size(); j++){ 00823 if((t=veciGenes[j])==(unsigned int) -1) continue; 00824 d.push_back(Dat.Get(s, t)); 00825 } 00826 } 00827 sort(d.begin(), d.end()); 00828 float ff[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9}; 00829 for(i=0; i<9; i++){ 00830 size_t ind = (size_t) ((float) d.size() * ff[i]); 00831 fprintf(stderr, "%.3f %.3e\n", ff[i], d[ind]); 00832 } 00833 return 1; 00834 } 00835 00836 if(sArgs.norm_flag==1 && norm_mode=="subtract_z"){ 00837 if(sArgs.cutoff_value_arg==-1.0){ 00838 fprintf(stderr, "Please supply parameter --cutoff_value\n"); 00839 return 1; 00840 } 00841 if(sArgs.default_type_arg==-1){ 00842 fprintf(stderr, "Please supply parameter --default_type\n"); 00843 return 1; 00844 } 00845 00846 CDataPair Dat; 00847 char outFile[125]; 00848 if(!Dat.Open(sArgs.dabinput_arg, false, false)){ 00849 cerr << "error opening file" << endl; 00850 return 1; 00851 } 00852 fprintf(stderr, "Finished opening file\n"); 00853 string fileName = CMeta::Basename(sArgs.dabinput_arg); 00854 string fileStem = CMeta::Deextension(fileName); 00855 sprintf(outFile, "%s/%s.2.dab", sArgs.dir_out_arg, fileStem.c_str()); 00856 //expTransform, divideNorm, subtractNorm 00857 CSeekWriter::NormalizeDAB(Dat, vecstrGenes, false, false, true); 00858 float cutoff = sArgs.cutoff_value_arg; 00859 if(sArgs.default_type_arg==0){ 00860 //unsigned int 00861 vector<map<utype,unsigned short> > umat; 00862 CSeekWriter::ConvertToSparseMatrix<utype>(Dat, umat, vecstrGenes, 00863 cutoff); 00864 CSeekWriter::WriteSparseMatrix<utype>(Dat, umat, vecstrGenes, 00865 outFile); 00866 }else if(sArgs.default_type_arg==1){ 00867 //unsigned short 00868 vector<map<unsigned short,unsigned short> > umat; 00869 CSeekWriter::ConvertToSparseMatrix<unsigned short>(Dat, umat, vecstrGenes, 00870 cutoff); 00871 CSeekWriter::WriteSparseMatrix<unsigned short>(Dat, umat, vecstrGenes, 00872 outFile); 00873 } 00874 else{ 00875 fprintf(stderr, "Error, unsupported type --default_type_arg\n"); 00876 return 1; 00877 } 00878 } 00879 00880 if(sArgs.norm_flag==1 && norm_mode=="topological_overlap"){ 00881 omp_set_num_threads(8); 00882 CDataPair Dat; 00883 char outFile[125]; 00884 if(!Dat.Open(sArgs.dabinput_arg, false, false)){ 00885 cerr << "error opening file" << endl; 00886 return 1; 00887 } 00888 fprintf(stderr, "Finished opening file\n"); 00889 string fileName = CMeta::Basename(sArgs.dabinput_arg); 00890 string fileStem = CMeta::Deextension(fileName); 00891 sprintf(outFile, "%s/%s.dab", sArgs.dir_out_arg, fileStem.c_str()); 00892 CSeekWriter::TopologicalOverlap(Dat, vecstrGenes); 00893 Dat.Save(outFile); 00894 } 00895 00896 if(sArgs.gavg_flag==1){ 00897 bool logit = false; 00898 if(sArgs.logit_flag==1) logit = true; 00899 00900 CDataPair Dat; 00901 char outFile[125]; 00902 if(!Dat.Open(sArgs.dabinput_arg, false, false)){ 00903 cerr << "error opening file" << endl; 00904 return 1; 00905 } 00906 vector<float> vecGeneAvg; 00907 string fileName = CMeta::Basename(sArgs.dabinput_arg); 00908 string fileStem = CMeta::Deextension(fileName); 00909 sprintf(outFile, "%s/%s.gavg", sArgs.dir_out_arg, 00910 fileStem.c_str()); 00911 CSeekWriter::GetGeneAverage(Dat, vecstrGenes, vecGeneAvg, logit, 00912 sArgs.top_avg_percent_arg); 00913 00914 //DEBUGGING 00915 //for(i=0; i<vecGeneAvg.size(); i++){ 00916 // fprintf(stderr, "%s\t%.3f\n", vecstrGenes[i].c_str(), vecGeneAvg[i]); 00917 //} 00918 00919 CSeekTools::WriteArray(outFile, vecGeneAvg); 00920 } 00921 00922 else if(sArgs.gpres_flag==1){ 00923 CDataPair Dat; 00924 char outFile[125]; 00925 if(!Dat.Open(sArgs.dabinput_arg, false, false)){ 00926 cerr << "error opening file" << endl; 00927 return 1; 00928 } 00929 vector<char> vecGenePresence; 00930 string fileName = CMeta::Basename(sArgs.dabinput_arg); 00931 string fileStem = CMeta::Deextension(fileName); 00932 sprintf(outFile, "%s/%s.gpres", sArgs.dir_out_arg, 00933 fileStem.c_str()); 00934 CSeekWriter::GetGenePresence(Dat, vecstrGenes, vecGenePresence); 00935 00936 //DEBUGGING 00937 //for(i=0; i<vecGenePresence.size(); i++){ 00938 // fprintf(stderr, "%s\t%d\n", vecstrGenes[i].c_str(), vecGenePresence[i]); 00939 //} 00940 00941 CSeekTools::WriteArray(outFile, vecGenePresence); 00942 } 00943 00944 } 00945 00946 if(sArgs.dabset_flag==1){ 00947 NormMode n; 00948 string norm_mode = sArgs.norm_mode_arg; 00949 if(norm_mode=="NA"){ 00950 fprintf(stderr, "Error, please supply --norm_mode\n"); 00951 return 1; 00952 } 00953 if(norm_mode=="subtract_z"){ 00954 n = Z_NORM; 00955 if(sArgs.exp_arg==-1){ 00956 fprintf(stderr, "Error, please supply --exp\n"); 00957 return 1; 00958 } 00959 }else if(norm_mode=="rank"){ 00960 n = RANK_NORM; 00961 if(sArgs.rbp_p_arg==-1 || sArgs.max_rank_arg==-1){ 00962 fprintf(stderr, "Error, Need both --rbp_p and --max_rank\n"); 00963 return 1; 00964 } 00965 } 00966 00967 vector<string> dab_list; 00968 int numGenes = vecstrGenes.size(); 00969 string dab_dir = sArgs.dab_dir_arg; 00970 string outdir = sArgs.dir_out_arg; 00971 string outdab = sArgs.out_dab_arg; 00972 if(dab_dir=="NA" || outdab=="NA"){ 00973 fprintf(stderr, "Required args: dab_dir, out_dab"); 00974 return -1; 00975 } 00976 string dw = sArgs.dataset_w_arg; 00977 00978 CSeekTools::ReadListOneColumn(sArgs.dablist_arg, dab_list); 00979 00980 vector<float> dweight; 00981 if(dw=="NA"){ 00982 CSeekTools::InitVector(dweight, dab_list.size(), (float) 1.0); 00983 }else{ 00984 CSeekTools::ReadArray(dw.c_str(), dweight); 00985 if(dweight.size()!=dab_list.size()){ 00986 fprintf(stderr, "Incorrect Size!\n"); 00987 return -1; 00988 } 00989 } 00990 00991 float rbp_p = sArgs.rbp_p_arg; 00992 int max_rank = sArgs.max_rank_arg; 00993 float exp = sArgs.exp_arg; 00994 fprintf(stderr, "Using rbp_p: %.3f, max_rank: %d, exp: %.3f\n", rbp_p, max_rank, exp); 00995 00996 if(sArgs.default_type_arg==0){ 00997 CalculateMatrix<utype>(n, GENE_PRESENCE_VECTOR, dab_list, dab_dir, outdir, 00998 outdab, vecstrGenes, dweight, rbp_p, max_rank, exp); 00999 CalculateMatrix<utype>(n, COUNT_MATRIX, dab_list, dab_dir, outdir, outdab, 01000 vecstrGenes, dweight, rbp_p, max_rank, exp); 01001 //CalculateMatrix<utype>(n, WEIGHTSUM_MATRIX, dab_list, dab_dir, outdir, outdab, 01002 //vecstrGenes, dweight, rbp_p, max_rank, exp); 01003 CalculateMatrix<utype>(n, DISTANCE_MATRIX, dab_list, dab_dir, outdir, outdab, 01004 vecstrGenes, dweight, rbp_p, max_rank, exp); 01005 }else if(sArgs.default_type_arg==1){ 01006 CalculateMatrix<unsigned short>(n, GENE_PRESENCE_VECTOR, dab_list, dab_dir, outdir, 01007 outdab, vecstrGenes, dweight, rbp_p, max_rank, exp); 01008 CalculateMatrix<unsigned short>(n, COUNT_MATRIX, dab_list, dab_dir, outdir, outdab, 01009 vecstrGenes, dweight, rbp_p, max_rank, exp); 01010 //CalculateMatrix<unsigned short>(n, WEIGHTSUM_MATRIX, dab_list, dab_dir, outdir, outdab, 01011 //vecstrGenes, dweight, rbp_p, max_rank); 01012 CalculateMatrix<unsigned short>(n, DISTANCE_MATRIX, dab_list, dab_dir, outdir, outdab, 01013 vecstrGenes, dweight, rbp_p, max_rank, exp); 01014 }else{ 01015 fprintf(stderr, "Invalid default type!\n"); 01016 return -1; 01017 } 01018 } 01019 01020 if(sArgs.combined_dab_flag==1){ 01021 utype numGenes = vecstrGenes.size(); 01022 string dab_base = sArgs.dab_basename_arg; 01023 string dab_dir = sArgs.dab_dir2_arg; 01024 string outdir = sArgs.dir_out_arg; 01025 01026 if(dab_base=="NA" || dab_dir=="NA"){ 01027 fprintf(stderr, "Required args: dab_dir2, dab_basename"); 01028 return -1; 01029 } 01030 01031 string file1 = dab_dir + "/" + dab_base + ".half"; 01032 string file2 = dab_dir + "/" + dab_base + ".pair_count"; 01033 string file3 = dab_dir + "/" + dab_base + ".gene_count"; 01034 string file4 = dab_dir + "/" + dab_base + ".pair_weightsum"; //optional 01035 01036 vector<utype> gene_count; 01037 CSeekTools::ReadArray(file3.c_str(), gene_count); 01038 01039 //normalize by counts 01040 float max_count = (float) *max_element(gene_count.begin(), gene_count.end()); 01041 float min_count_required = max_count * 0.50; 01042 CSeekIntIntMap d1(vecstrGenes.size()); 01043 vector<string> validGenes; 01044 int numValidGenes=0; 01045 for(i=0; i<vecstrGenes.size(); i++){ 01046 if(gene_count[i]<min_count_required) continue; 01047 numValidGenes++; 01048 validGenes.push_back(vecstrGenes[i]); 01049 d1.Add(i); 01050 } 01051 01052 fprintf(stderr, "%.2f %.2f %d\n", max_count, min_count_required, 01053 numValidGenes); 01054 01055 uint32_t dim; 01056 ifstream istm1, istm2, istm3; 01057 istm1.open(file1.c_str(), ios_base::binary); 01058 istm1.read((char*)(&dim), sizeof(dim)); 01059 istm2.open(file2.c_str(), ios_base::binary); 01060 istm2.read((char*)(&dim), sizeof(dim)); 01061 istm3.open(file4.c_str(), ios_base::binary); //test if file3 exists 01062 bool enableWeight = false; 01063 if(!istm3.fail()){ 01064 istm3.read((char*)(&dim), sizeof(dim)); //read dimension if exists 01065 enableWeight = true; 01066 } 01067 01068 /*CHalfMatrix<float> res; 01069 res.Initialize(dim); 01070 if(dim!=numGenes){ 01071 fprintf(stderr, "Error in number of dimensions\n"); 01072 return -1; 01073 }*/ 01074 01075 fprintf(stderr, "Begin Reading\n"); 01076 if(enableWeight){ 01077 fprintf(stderr, "Enable weight: True\n"); 01078 }else{ 01079 fprintf(stderr, "Enable weight: False\n"); 01080 } 01081 CDat CD; 01082 CD.Open(validGenes); 01083 for(i=0; i<validGenes.size(); i++) 01084 for(j=i+1; j<validGenes.size(); j++) 01085 CD.Set(i,j,CMeta::GetNaN()); 01086 01087 float *adScores = new float[dim-1]; 01088 unsigned short *uCounts = new unsigned short[dim-1]; 01089 float *fSum = new float[dim-1]; 01090 01091 for(i=0; (i+1)<dim; i++){ 01092 if(i%2000==0){ 01093 fprintf(stderr, "Finished %d\n", i); 01094 } 01095 istm1.read((char*)adScores, sizeof(*adScores) * (dim-i-1)); 01096 istm2.read((char*)uCounts, sizeof(*uCounts) * (dim-i-1)); 01097 if(enableWeight) 01098 istm3.read((char*)fSum, sizeof(*fSum) * (dim-i-1)); 01099 01100 utype gi = d1.GetForward(i); 01101 if(CSeekTools::IsNaN(gi)) 01102 continue; 01103 for(j=0; j<dim-i-1; j++){ 01104 //reverse of HalfIndex function from halfmatrixi.h 01105 //to get the original coordinate for j: 01106 //original j = new j + 1 + i 01107 utype gj = d1.GetForward(j+i+1); 01108 if(CSeekTools::IsNaN(gj)) 01109 continue; 01110 if((float) uCounts[j]<min_count_required) 01111 continue; 01112 if(enableWeight) 01113 adScores[j] /= fSum[j]; 01114 else 01115 adScores[j] /= (float) uCounts[j]; 01116 CD.Set(gi, gj, adScores[j]); 01117 } 01118 //res.Set(i, adScores); 01119 } 01120 delete[] adScores; 01121 delete[] uCounts; 01122 delete[] fSum; 01123 istm1.close(); 01124 istm2.close(); 01125 if(enableWeight) 01126 istm3.close(); 01127 string file_out = outdir + "/" + dab_base + ".dab"; 01128 CD.Save(file_out.c_str()); 01129 01130 } 01131 01132 #ifdef WIN32 01133 pthread_win32_process_detach_np( ); 01134 #endif // WIN32 01135 return 0; }