Sleipnir
|
00001 /***************************************************************************** 00002 * This file is provided under the Creative Commons Attribution 3.0 license. 00003 * 00004 * You are free to share, copy, distribute, transmit, or adapt this work 00005 * PROVIDED THAT you attribute the work to the authors listed below. 00006 * For more information, please see the following web page: 00007 * http://creativecommons.org/licenses/by/3.0/ 00008 * 00009 * This file is a component of the Sleipnir library for functional genomics, 00010 * authored by: 00011 * Curtis Huttenhower (chuttenh@princeton.edu) 00012 * Mark Schroeder 00013 * Maria D. Chikina 00014 * Olga G. Troyanskaya (ogt@princeton.edu, primary contact) 00015 * 00016 * If you use this library, the included executable tools, or any related 00017 * code in your work, please cite the following publication: 00018 * Curtis Huttenhower, Mark Schroeder, Maria D. Chikina, and 00019 * Olga G. Troyanskaya. 00020 * "The Sleipnir library for computational functional genomics" 00021 *****************************************************************************/ 00022 #include "seekdataset.h" 00023 #include "seekreader.h" 00024 #include "seekevaluate.h" 00025 00026 namespace Sleipnir { 00027 00028 CSeekDataset::CSeekDataset(){ 00029 r = NULL; 00030 rData = NULL; 00031 dbMap = NULL; 00032 geneMap = NULL; 00033 queryMap = NULL; 00034 platform = NULL; 00035 geneAverage.clear(); 00036 geneVariance.clear(); 00037 genePresence.clear(); 00038 query.clear(); 00039 queryIndex.clear(); 00040 iQuerySize = 0; 00041 iNumGenes = 0; 00042 iDBSize = 0; 00043 m_bIsNibble = false; 00044 m_fDsetAverage = CMeta::GetNaN(); 00045 m_fDsetStdev = CMeta::GetNaN(); 00046 weight.clear(); 00047 sum_weight = -1; 00048 } 00049 00050 CSeekDataset::~CSeekDataset(){ 00051 DeleteQuery(); 00052 DeleteQueryBlock(); 00053 if(geneMap!=NULL){ 00054 delete geneMap; 00055 geneMap = NULL; 00056 iNumGenes = 0; 00057 } 00058 geneAverage.clear(); 00059 geneVariance.clear(); 00060 genePresence.clear(); 00061 m_fDsetAverage = CMeta::GetNaN(); 00062 m_fDsetStdev = CMeta::GetNaN(); 00063 platform = NULL; 00064 } 00065 00066 //Copy CSeekDataset from a given object 00067 bool CSeekDataset::Copy(CSeekDataset *src){ 00068 //copies the following data if available: geneAverage, genePresence 00069 //geneVariance, dsetAverage, dsetStdev, geneMap, platform 00070 00071 if(src->geneAverage.size()>0){ 00072 //fprintf(stderr, "Great a!\n"); 00073 geneAverage.resize(src->geneAverage.size()); 00074 utype i; 00075 for(i=0; i<src->geneAverage.size(); i++){ 00076 geneAverage[i] = src->geneAverage[i]; 00077 } 00078 } 00079 if(src->genePresence.size()>0){ 00080 //fprintf(stderr, "Great b!\n"); 00081 genePresence.resize(src->genePresence.size()); 00082 utype i; 00083 for(i=0; i<src->genePresence.size(); i++){ 00084 genePresence[i] = src->genePresence[i]; 00085 } 00086 } 00087 if(src->geneVariance.size()>0){ 00088 geneVariance.resize(src->geneVariance.size()); 00089 utype i; 00090 for(i=0; i<src->geneVariance.size(); i++){ 00091 geneVariance[i] = src->geneVariance[i]; 00092 } 00093 //copy(src->geneVariance.begin(), src->geneVariance.end(), 00094 // geneVariance.begin()); 00095 } 00096 m_fDsetAverage = src->m_fDsetAverage; 00097 m_fDsetStdev = src->m_fDsetStdev; 00098 00099 if(geneMap!=NULL){ 00100 delete geneMap; 00101 iNumGenes = 0; 00102 } 00103 00104 utype iSize = genePresence.size(); 00105 iNumGenes = iSize; 00106 geneMap = new CSeekIntIntMap(src->geneMap); 00107 00108 return true; 00109 } 00110 00111 bool CSeekDataset::ReadDatasetAverageStdev(const string &strFileName){ 00112 vector<float> t; 00113 CSeekTools::ReadArray(strFileName.c_str(), t); 00114 m_fDsetAverage = t[0]; 00115 m_fDsetStdev = t[1]; 00116 return true; 00117 } 00118 00119 bool CSeekDataset::ReadGeneAverage(const string &strFileName){ 00120 return CSeekTools::ReadArray(strFileName.c_str(), geneAverage); 00121 } 00122 00123 bool CSeekDataset::ReadGenePresence(const string &strFileName){ 00124 return CSeekTools::ReadArray(strFileName.c_str(), genePresence); 00125 } 00126 00127 bool CSeekDataset::ReadGeneVariance(const string &strFileName){ 00128 return CSeekTools::ReadArray(strFileName.c_str(), geneVariance); 00129 } 00130 00131 00132 bool CSeekDataset::InitializeGeneMap(){ 00133 if(geneAverage.empty() || genePresence.empty()) { 00134 cerr << "Gene average or gene presence unread" << endl; 00135 return false; 00136 } 00137 utype i; 00138 utype iSize = genePresence.size(); 00139 iNumGenes = iSize; 00140 geneMap = new CSeekIntIntMap(iSize); 00141 vector<char>::const_iterator iterGenePresence = genePresence.begin(); 00142 vector<float>::const_iterator iterGeneAverage = geneAverage.begin(); 00143 00144 for(i=0; iterGenePresence!=genePresence.end(); i++, 00145 iterGenePresence++, iterGeneAverage++){ 00146 if(*iterGenePresence==1 && !CMeta::IsNaN(*iterGeneAverage)){ 00147 geneMap->Add(i); 00148 } 00149 } 00150 return true; 00151 } 00152 00153 00154 bool CSeekDataset::InitializeQueryBlock(const vector<utype> &queryBlock){ 00155 00156 DeleteQueryBlock(); 00157 00158 dbMap = new CSeekIntIntMap(iNumGenes); 00159 00160 vector<utype>::const_iterator iterQ = queryBlock.begin(); 00161 for(; iterQ!=queryBlock.end(); iterQ++){ 00162 if(CSeekTools::IsNaN(geneMap->GetForward(*iterQ))){ 00163 //this query gene is not present in the dataset 00164 continue; 00165 } 00166 dbMap->Add(*iterQ); 00167 } 00168 iDBSize = dbMap->GetNumSet(); 00169 00170 bool DEBUG = false; 00171 00172 if(iDBSize==0){ 00173 if(DEBUG) cerr << "Dataset will be skipped" << endl; 00174 DeleteQueryBlock(); 00175 return true; 00176 } 00177 00178 //fprintf(stderr, "0x1 %lu %d %d\n", CMeta::GetMemoryUsage(), iDBSize, iNumGenes); 00179 r = CSeekTools::Init2DArray(iDBSize, iNumGenes, (unsigned char) 255); 00180 //fprintf(stderr, "0x2 %lu\n", CMeta::GetMemoryUsage()); 00181 //free(r[0]); 00182 //free(r); 00183 //fprintf(stderr, "0x3 %lu\n", CMeta::GetMemoryUsage()); 00184 00185 00186 return true; 00187 } 00188 00189 00190 bool CSeekDataset::InitializeQuery(const vector<utype> &query){ 00191 DeleteQuery(); 00192 00193 if(iDBSize==0 || dbMap==NULL) return true; 00194 //must require initializequeryBlock be executed first 00195 00196 queryMap = new CSeekIntIntMap(iNumGenes); 00197 00198 utype i; 00199 vector<utype>::const_iterator iterQ = query.begin(); 00200 00201 vector<AResult> a; 00202 a.resize(query.size()); 00203 vector<AResult>::iterator iterA = a.begin(); 00204 00205 for(iQuerySize = 0; iterQ!=query.end(); iterQ++){ 00206 //if the query does not exist in this data-set, continue 00207 if(CSeekTools::IsNaN(i = dbMap->GetForward(*iterQ))) continue; 00208 //.i: query genes 00209 (*iterA).i = *iterQ; 00210 //.f: query gene position in dbMap 00211 (*iterA).f = i; 00212 iterA++; 00213 iQuerySize++; 00214 } 00215 00216 bool DEBUG = false; 00217 if(iQuerySize==0){ 00218 if(DEBUG) cerr << "Dataset will be skipped" << endl; 00219 DeleteQuery(); 00220 return true; 00221 } 00222 00223 a.resize(iQuerySize); 00224 sort(a.begin(), a.end(), Ascending()); 00225 00226 for(iterA = a.begin(); iterA!=a.end(); iterA++){ 00227 //add gene to queryMap 00228 queryMap->Add((*iterA).i); 00229 this->query.push_back((*iterA).i); 00230 //add (gene-position in dbMap) to queryIndex 00231 this->queryIndex.push_back((*iterA).f); 00232 } 00233 00234 this->queryIndex.resize(this->queryIndex.size()); 00235 this->query.resize(this->query.size()); 00236 return true; 00237 } 00238 00239 00240 bool CSeekDataset::DeleteQuery(){ 00241 if(queryMap!=NULL){ 00242 delete queryMap; 00243 queryMap = NULL; 00244 } 00245 iQuerySize = 0; 00246 rData = NULL; 00247 weight.clear(); 00248 query.clear(); 00249 queryIndex.clear(); 00250 sum_weight = -1; 00251 return true; 00252 } 00253 00254 00255 bool CSeekDataset::DeleteQueryBlock(){ 00256 DeleteQuery(); 00257 if(dbMap!=NULL){ 00258 delete dbMap; 00259 dbMap = NULL; 00260 } 00261 if(r!=NULL){ 00262 CSeekTools::Free2DArray(r); 00263 r = NULL; 00264 } 00265 iDBSize = 0; 00266 return true; 00267 } 00268 00269 const vector<utype>& CSeekDataset::GetQuery() const{ 00270 return this->query; 00271 } 00272 00273 const vector<utype>& CSeekDataset::GetQueryIndex() const{ 00274 return this->queryIndex; 00275 } 00276 00277 utype** CSeekDataset::GetDataMatrix(){ 00278 return rData; 00279 } 00280 00281 bool CSeekDataset::InitializeDataMatrix(utype **rD, 00282 const vector<float> &quant, const utype &iRows, 00283 const utype &iColumns, const bool bSubtractAvg, 00284 const bool bNormPlatform, const bool logit, 00285 const enum CSeekDataset::DistanceMeasure dist_measure, 00286 const float cutoff, 00287 const bool bRandom, gsl_rng *rand){ 00288 /* assume platform is already set */ 00289 //Assuming rData is not NULL 00290 00291 bool bCorrelation = false; 00292 if(dist_measure==CSeekDataset::CORRELATION) 00293 bCorrelation = true; 00294 00295 if(bCorrelation && (bSubtractAvg || bNormPlatform)){ 00296 fprintf(stderr, "%s, %s\n", "correlation mode enabled", 00297 "please set subtract_avg, subtract_platform to false"); 00298 return false; 00299 } 00300 00301 utype i, j; 00302 rData = rD; 00303 utype ii; 00304 utype iNumGenes = geneMap->GetNumSet(); 00305 utype iNumQueries = iQuerySize; 00306 00307 //fprintf(stderr, "iNumQueries is %d\n", iNumQueries); 00308 //iRows is the gene id, iColumns is the query id 00309 //default value for all rData entries is 0 00310 memset(&rData[0][0], 0, sizeof(utype)*iRows*iColumns); 00311 00312 //assume queryIndex is already sorted 00313 vector<utype> offset; 00314 offset.push_back(0); 00315 for(i=1; i<queryIndex.size(); i++) 00316 offset.push_back(queryIndex[i] - queryIndex[i-1]); 00317 00318 if(bSubtractAvg){ //z-score (Z) + average z score subtraction (A) 00319 00320 if(bNormPlatform){ //subtract platform avg / platform stddev 00321 float *platform_avg = new float[iColumns]; 00322 float *platform_stdev = new float[iColumns]; 00323 00324 //GetColumns() is numQuery 00325 for(j=0; j<iNumQueries; j++){ 00326 utype jj = this->query[j]; 00327 platform_avg[j] = platform->GetPlatformAvg(jj); 00328 platform_stdev[j] = platform->GetPlatformStdev(jj); 00329 } 00330 00331 const vector<utype> &allRGenes = geneMap->GetAllReverse(); 00332 float a = 0; 00333 float vv = 0; 00334 unsigned char x = 0; 00335 utype iGeneMapSize = geneMap->GetNumSet(); 00336 if(logit){ //NOT checked 00337 for(ii=0; ii<iGeneMapSize; ii++){ 00338 for(j=0, i = allRGenes[ii], a=GetGeneAverage(i); 00339 j<iNumQueries; j++){ 00340 if((x = r[queryIndex[j]][i])==255) continue; 00341 vv = ((log(quant[x]) - log((float) 1.0 - quant[x])) 00342 - a - platform_avg[j]) / platform_stdev[j]; 00343 vv = max((float) min(vv, (float)3.2), (float)-3.2); 00344 //By default, cutoff = -nan (i.e., always true) 00345 if(vv>cutoff){ 00346 rData[i][j]= (utype) (vv*100.0) + 320; 00347 //fprintf(stderr, "%.5f %.5f %.5f %.5f\n", quant[x], vv, a, platform_avg[j]); 00348 }else{ 00349 rData[i][j] = 0; 00350 } 00351 } 00352 } 00353 }else{ //if just normal score 00354 for(ii=0; ii<iGeneMapSize; ii++){ 00355 for(j=0, i = allRGenes[ii], a=GetGeneAverage(i); 00356 j<iNumQueries; j++){ 00357 if((x = r[queryIndex[j]][i])==255) continue; 00358 vv = (quant[x] - a - platform_avg[j]) 00359 / platform_stdev[j]; 00360 vv = max((float) min(vv, (float)3.2), (float)-3.2); 00361 if(vv>cutoff){ 00362 rData[i][j]= (utype) (vv*100.0) + 320; 00363 //fprintf(stderr, "r %.2f\n", quant[x]); 00364 }else{ 00365 rData[i][j]= 0; 00366 } 00367 } 00368 } 00369 } 00370 00371 delete[] platform_avg; 00372 delete[] platform_stdev; 00373 00374 }else{ //do not normalize by platform, just subtract gene avg 00375 if(logit){ 00376 for(ii=0; ii<iNumGenes; ii++){ 00377 float a, vv; unsigned char x; 00378 for(i = geneMap->GetReverse(ii), 00379 a = GetGeneAverage(i), j=0; j<iNumQueries; j++){ 00380 if((x = r[queryIndex[j]][i])==255) continue; 00381 vv = log(quant[x]) - log((float)(1.0-quant[x])) - a; 00382 vv = max((float)-3.2, (float)min((float) 3.2, (float) vv)); 00383 //fprintf(stderr, "%.5f %.5f %.5f\n", quant[x], vv, a); 00384 if(vv>cutoff){ 00385 rData[i][j]= (utype) (vv*100.0) + 320; 00386 }else{ 00387 rData[i][j] = 0; 00388 } 00389 } 00390 } 00391 } 00392 else{ 00393 for(ii=0; ii<iNumGenes; ii++){ 00394 float a, vv; unsigned char x; 00395 /* numQueries */ 00396 for(i = geneMap->GetReverse(ii), 00397 a = GetGeneAverage(i), j=0; j<iNumQueries; j++){ 00398 if((x = r[queryIndex[j]][i])==255) continue; 00399 vv = quant[x] - a; 00400 vv = max((float) min((float)vv, (float)3.2), (float)-3.2); 00401 if(vv>cutoff){ 00402 rData[i][j]= (utype) (vv*100.0) + 320; 00403 }else{ 00404 rData[i][j] = 0; 00405 } 00406 } 00407 } 00408 } 00409 } 00410 00411 //return true; 00412 } 00413 /* numGenes */ 00414 else if(logit){ //logit on z-scores 00415 for(ii=0; ii<iNumGenes; ii++){ 00416 unsigned char x; 00417 for(i = geneMap->GetReverse(ii), j=0; j<iNumQueries; j++){ 00418 if((x = r[queryIndex[j]][i])==255) continue; 00419 float vv = log(quant[x]) - log((float) 1.0 - quant[x]); 00420 vv = max((float) min(vv, (float)3.2), (float)-3.2); 00421 if(vv>cutoff){ 00422 rData[i][j] = (utype) (vv*100.0) + 320; 00423 }else{ 00424 rData[i][j] = 0; 00425 } 00426 } 00427 } 00428 }else if(bCorrelation){ //correlation mode 00429 //assumed to be from -1 to +1 00430 00431 for(ii=0; ii<iNumGenes; ii++){ 00432 unsigned char x; 00433 for(i = geneMap->GetReverse(ii), j=0; j<iNumQueries; j++){ 00434 if((x = r[queryIndex[j]][i])==255) continue; 00435 float vv = quant[x]; 00436 vv = vv * m_fDsetStdev + m_fDsetAverage; 00437 float e = exp(2.0*vv); 00438 vv = (e - 1.0) / (e + 1.0); 00439 if(vv>cutoff){ 00440 vv = vv * 3.0; //scale up by factor of 3 for retaining 00441 //precision, should put value to range 00442 //(-3.0 to 3.0) 00443 vv = max((float) min(vv, (float)3.2), (float)-3.2); 00444 rData[i][j] = (utype) (vv*100.0) + 320; 00445 }else{ 00446 rData[i][j] = (utype) 0; //default value for 00447 //not meeting cutoff 00448 } 00449 } 00450 } 00451 00452 }else{ //just simple z-scores, no subtraction of avg z scores 00453 for(ii=0; ii<iNumGenes; ii++){ 00454 unsigned char x; 00455 for(i = geneMap->GetReverse(ii), j=0; j<iNumQueries; j++){ 00456 if((x = r[queryIndex[j]][i])==255) continue; 00457 float vv = quant[x]; 00458 00459 //for functional network================================= 00460 //vv = vv * 6.0 - 3.0; //transform values from 0-1 to values from -3 to +3 00461 00462 vv = max((float) min(vv, (float)3.2), (float)-3.2); 00463 if(vv>cutoff){ 00464 rData[i][j] = (utype) (vv*100.0) + 320; 00465 }else{ 00466 rData[i][j] = 0; 00467 } 00468 } 00469 } 00470 } 00471 00472 if(bRandom){ 00473 vector<vector<utype> > allRandom; 00474 allRandom.resize(iNumQueries); 00475 for(i=0; i<iNumQueries; i++){ 00476 allRandom[i] = vector<utype>(); 00477 } 00478 00479 for(ii=0; ii<iNumGenes; ii++){ 00480 unsigned char x; 00481 for(i = geneMap->GetReverse(ii), j=0; j<iNumQueries; j++){ 00482 if((x = r[queryIndex[j]][i])==255) continue; 00483 allRandom[j].push_back(rData[i][j]); 00484 } 00485 } 00486 00487 int max_size = 0; 00488 for(i=0; i<iNumQueries; i++){ 00489 if(allRandom[i].size()>max_size){ 00490 max_size = allRandom[i].size(); 00491 } 00492 } 00493 00494 utype **a = CSeekTools::Init2DArray(iNumQueries, max_size, (utype)0); 00495 00496 for(i=0; i<iNumQueries; i++){ 00497 for(j=0; j<allRandom[i].size(); j++){ 00498 a[i][j] = allRandom[i][j]; 00499 } 00500 gsl_ran_shuffle(rand, a[i], allRandom[i].size(), sizeof(utype)); 00501 } 00502 00503 vector<int> k; 00504 CSeekTools::InitVector(k, iNumQueries, (int) 0); 00505 00506 for(ii=0; ii<iNumGenes; ii++){ 00507 unsigned char x; 00508 for(i = geneMap->GetReverse(ii), j=0; j<iNumQueries; j++){ 00509 if((x = r[queryIndex[j]][i])==255) continue; 00510 //fprintf(stderr, "%d %d\n", rData[i][j], a[j][k[j]]); 00511 rData[i][j] = a[j][k[j]]; 00512 k[j]++; 00513 } 00514 } 00515 00516 CSeekTools::Free2DArray(a); 00517 } 00518 00519 00520 return true; 00521 } 00522 00523 00524 unsigned char** CSeekDataset::GetMatrix(){ 00525 return r; 00526 } 00527 00528 CSeekIntIntMap* CSeekDataset::GetGeneMap(){ 00529 return geneMap; 00530 } 00531 00532 CSeekIntIntMap* CSeekDataset::GetQueryMap(){ 00533 return queryMap; 00534 } 00535 00536 CSeekIntIntMap* CSeekDataset::GetDBMap(){ 00537 return dbMap; 00538 } 00539 00540 float CSeekDataset::GetDatasetAverage() const{ 00541 return m_fDsetAverage; 00542 } 00543 00544 float CSeekDataset::GetDatasetStdev() const{ 00545 return m_fDsetStdev; 00546 } 00547 00548 float CSeekDataset::GetGeneVariance(const utype &i) const{ 00549 return geneVariance[i]; 00550 } 00551 00552 float CSeekDataset::GetGeneAverage(const utype &i) const{ 00553 return geneAverage[i]; 00554 } 00555 00556 utype CSeekDataset::GetNumGenes() const{ 00557 return iNumGenes; 00558 } 00559 00560 bool CSeekDataset::InitializeCVWeight(const utype &i){ 00561 weight.clear(); 00562 weight.resize(i); 00563 sum_weight = -1; 00564 return true; 00565 } 00566 00567 bool CSeekDataset::SetCVWeight(const utype &i, const float &f){ 00568 weight[i] = f; 00569 return true; 00570 } 00571 00572 float CSeekDataset::GetCVWeight(const utype &i){ 00573 return weight[i]; 00574 } 00575 00576 const vector<float>& CSeekDataset::GetCVWeight() const{ 00577 return weight; 00578 } 00579 00580 float CSeekDataset::GetDatasetSumWeight(){ 00581 utype i; 00582 utype num = 0; 00583 if(sum_weight==-1){ 00584 for(sum_weight=0, i=0; i<weight.size(); i++){ 00585 if(weight[i]==-1) continue; 00586 sum_weight+=weight[i]; 00587 num++; 00588 } 00589 if(num>0) sum_weight/=(float)num; 00590 else sum_weight = -1; 00591 } 00592 return sum_weight; 00593 } 00594 00595 void CSeekDataset::SetPlatform(CSeekPlatform &cp){ 00596 platform = &cp; 00597 } 00598 00599 CSeekPlatform& CSeekDataset::GetPlatform() const{ 00600 return *platform; 00601 } 00602 00603 }